# Bag of Words

Represent a sentence as a bag-of-words vector (a sequence of numbers).

Bag of Words simply creates a set of vectors, each containing the counts of word occurrences in a document.

In [14]:
import nltk


In [32]:
# Sample text for the notebook; later cells split it into sentences with nltk.sent_tokenize.
paragraph =  """ I do not think that I should apologize to you for having to speak in a foreign tongue. I wonder if this loud speaker carries my voice to the farthest end of this vast audience. If some of those who are far away are unable to listen to what I may say, it will be the fault of the loud speaker.
I was going to tell you that I do not wish to apologize. I dare not. You cannot understand the provincial language, which is my mother tongue. I do not want to insult you by speaking in my own language (Gujarati). Our national speech is Hindustani. I know that it will be a long time before it can be made into an international speech. For international commerce, undoubtedly, English occupies the first place. I used to hear that French was the language of diplomacy. I was told, when I was young, that if I wanted to go from one end of Europe to the other, I must try to pick up French. I tried to learn French, in order that I may be able to make myself understood. There is a rivalry between the French and the English. Having been taught English, I have naturally to resort to it.
I was wondering, as to what I was to speak to you. I wanted to collect my thoughts, but, let me confess to you that I had no time. Yet I had promised yesterday that I would try to say a few words. While I was coming with Badshah Khan, I asked for a little piece of paper and pencil. I got a pen, instead of a pencil. I tried to scribble a few words. You will be sorry to hear that piece of paper is not by my side, though I remember what I wanted to say."""
               

In [33]:
# Cleaning the texts
import re  #regular expression
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [34]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()  # used by the lemmatization cell further down
sentences = nltk.sent_tokenize(paragraph)

# Build the stopword set once, outside the loop, rather than once per token.
stop_words = set(stopwords.words('english'))

# For each sentence: replace every non-letter with a space, lowercase,
# split into words, drop stopwords, stem what is left, and re-join the
# stems into a single space-separated string.
corpus = []
for sentence in sentences:
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stems))

In [35]:
# Show the sentences produced by nltk.sent_tokenize(paragraph).
sentences

[' I do not think that I should apologize to you for having to speak in a foreign tongue.',
 'I wonder if this loud speaker carries my voice to the farthest end of this vast audience.',
 'If some of those who are far away are unable to listen to what I may say, it will be the fault of the loud speaker.',
 'I was going to tell you that I do not wish to apologize.',
 'I dare not.',
 'You cannot understand the provincial language, which is my mother tongue.',
 'I do not want to insult you by speaking in my own language (Gujarati).',
 'Our national speech is Hindustani.',
 'I know that it will be a long time before it can be made into an international speech.',
 'For international commerce, undoubtedly, English occupies the first place.',
 'I used to hear that French was the language of diplomacy.',
 'I was told, when I was young, that if I wanted to go from one end of Europe to the other, I must try to pick up French.',
 'I tried to learn French, in order that I may be able to make myself

In [36]:
# Show the cleaned sentences: lowercased, stopwords removed, remaining words stemmed.
corpus

['think apolog speak foreign tongu',
 'wonder loud speaker carri voic farthest end vast audienc',
 'far away unabl listen may say fault loud speaker',
 'go tell wish apolog',
 'dare',
 'cannot understand provinci languag mother tongu',
 'want insult speak languag gujarati',
 'nation speech hindustani',
 'know long time made intern speech',
 'intern commerc undoubtedli english occupi first place',
 'use hear french languag diplomaci',
 'told young want go one end europ must tri pick french',
 'tri learn french order may abl make understood',
 'rivalri french english',
 'taught english natur resort',
 'wonder speak',
 'want collect thought let confess time',
 'yet promis yesterday would tri say word',
 'come badshah khan ask littl piec paper pencil',
 'got pen instead pencil',
 'tri scribbl word',
 'sorri hear piec paper side though rememb want say']

In [37]:
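# [^a-zA-Z] matches every character that is not a letter: each one is replaced with a space, then the sentence is lowercased and split into a list of words
# every word that is not a stopword is stemmed; the stopwords themselves are dropped
# the stems are joined back into one string, which is appended to the corpus list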

#

In [38]:
# Lemmatized counterpart of the stemmed corpus: the cleaning steps are the
# same, but each remaining word goes through the WordNet lemmatizer instead
# of the Porter stemmer.

# Build the stopword set once, outside the loop, rather than once per token.
stop_words = set(stopwords.words('english'))

corpus1 = []
for sentence in sentences:
    # Replace every non-letter with a space, lowercase, and split into words.
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # lemmatize() is called without a pos argument, so it uses its default
    # part of speech for every word.
    lemmas = [wordnet.lemmatize(word) for word in words if word not in stop_words]
    corpus1.append(' '.join(lemmas))

In [39]:
# Show the cleaned sentences: lowercased, stopwords removed, remaining words lemmatized.
corpus1

['think apologize speak foreign tongue',
 'wonder loud speaker carry voice farthest end vast audience',
 'far away unable listen may say fault loud speaker',
 'going tell wish apologize',
 'dare',
 'cannot understand provincial language mother tongue',
 'want insult speaking language gujarati',
 'national speech hindustani',
 'know long time made international speech',
 'international commerce undoubtedly english occupies first place',
 'used hear french language diplomacy',
 'told young wanted go one end europe must try pick french',
 'tried learn french order may able make understood',
 'rivalry french english',
 'taught english naturally resort',
 'wondering speak',
 'wanted collect thought let confess time',
 'yet promised yesterday would try say word',
 'coming badshah khan asked little piece paper pencil',
 'got pen instead pencil',
 'tried scribble word',
 'sorry hear piece paper side though remember wanted say']

In [40]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer
# Cap on the vocabulary size handed to CountVectorizer.
MAX_FEATURES = 1500
cv = CountVectorizer(max_features=MAX_FEATURES)

# Learn the vocabulary from the stemmed corpus and count term occurrences
# per sentence, then convert the result to a dense NumPy array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [41]:
X.shape  # (number of sentences, vocabulary size); 22 sentences here

(22, 92)

In [42]:
# Display the dense count matrix: one row per sentence, one column per vocabulary term.
X

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)