# Bag of Words Pipeline
1. Get the Data/Corpus
2. Tokenisation, Stopword Removal
3. Stemming
4. Building a Vocab
5. Vectorization
6. Classification

# Tokenization

In [1]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [2]:
sentence="Hello there! My name is Isha. Today is a very pleasant day and the weather is cool. :)"

In [3]:
#getting separate sentences
t1=sent_tokenize(sentence)

In [4]:
print(t1)

['Hello there!', 'My name is Isha.', 'Today is a very pleasant day and the weather is cool.', ':)']


In [5]:
# Getting separate words/special characters
t2=word_tokenize(sentence)

In [6]:
print(t2)

['Hello', 'there', '!', 'My', 'name', 'is', 'Isha', '.', 'Today', 'is', 'a', 'very', 'pleasant', 'day', 'and', 'the', 'weather', 'is', 'cool', '.', ':', ')']


# Stopwords Removal

In [7]:
from nltk.corpus import stopwords

In [8]:
# The NLTK stopwords corpus covers several languages; we only use the English list
eng_sw=set(stopwords.words('english'))

In [9]:
print(eng_sw)

{'all', 'when', "mustn't", 's', "don't", 'its', 'they', 'my', 'these', 'his', 'from', 'did', "couldn't", 'so', 'as', 'such', 'those', 'no', 'while', 'why', 'it', 've', 'than', "hadn't", 'yourselves', 'herself', 'some', 'and', 'against', 'does', 'further', 'in', "aren't", "you'd", 'by', 'nor', 'not', "won't", "you're", 'himself', 'whom', 'has', 'themselves', 'couldn', 'wasn', 'had', 'up', 'were', 'doesn', 'having', "mightn't", 'most', "shouldn't", "wouldn't", 'she', 'because', 'now', "that'll", "hasn't", 're', 'there', 'out', 'both', 'which', 'below', 'other', 'here', 'm', 'the', 'theirs', 'over', 'at', 'each', 'on', 'again', 'once', 'hasn', 'him', 'off', 'be', 'mightn', 'is', 'myself', 'a', 'll', 'any', 'd', 'wouldn', 'was', 'between', "it's", 'if', 'do', 'about', 'that', 'an', 'are', 'your', "you've", 'above', 'more', 'can', "you'll", 'few', 'i', "doesn't", 'our', 'itself', 'their', 'being', 'don', 'isn', "haven't", 'them', 'ours', "she's", 'after', 't', 'or', 'you', 'shouldn', 'didn'

In [10]:
def remove_sw(text,sw,ignore_case=False):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (e.g. the result of ``str.split`` or a tokenizer).
    sw : collection of str
        Stopwords to drop; passing a ``set`` keeps each lookup cheap.
    ignore_case : bool, default False
        When True each token is lower-cased before the lookup, so capitalised
        tokens such as 'I' or 'My' are matched against a lower-case stopword
        list. The default keeps the original exact-match behaviour.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing.
    """
    if ignore_case:
        # Compare on the lower-cased form but return the token untouched.
        return [token for token in text if token.lower() not in sw]
    return [token for token in text if token not in sw]

In [11]:
res=remove_sw('I am happy with my life'.split(),eng_sw)

In [12]:
res

['I', 'happy', 'life']

In [13]:
res=remove_sw('Hello there! My name is Isha. Today is a very pleasant day and the weather is cool.'.split(),eng_sw)

In [14]:
print(res)

['Hello', 'there!', 'My', 'name', 'Isha.', 'Today', 'pleasant', 'day', 'weather', 'cool.']


# Tokenisation using Regular Expression
https://www.regexpal.com/

In [15]:
sentence="Hello there! Mic testing.. 1, 2, 3 Great its working"

In [16]:
from nltk.tokenize import RegexpTokenizer

In [17]:
# The character class keeps runs of letters, '@' and '.'; everything else
# (digits, commas, '!', spaces) separates tokens and is dropped.
# Because '.' is inside the class, trailing dots stay attached ("testing..").
tokenizer=RegexpTokenizer('[a-zA-Z@.]+')
useful_text=tokenizer.tokenize(sentence)

In [18]:
useful_text

['Hello', 'there', 'Mic', 'testing..', 'Great', 'its', 'working']

# Stemming

Three types of Stemmer:
    
    1. Snowball
    2. Porter
    3. Lancaster

In [19]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [20]:
# create object of a stemmer
Ps=PorterStemmer()

In [21]:
Ps.stem('jumping')

'jump'

In [22]:
Ps.stem('lovely')

'love'

In [23]:
Ps.stem('sitting')

'sit'

In [24]:
# Multi-language stemmer - specify the language
Ss=SnowballStemmer('english')

In [25]:
Ss.stem('sitting')

'sit'

# Lemmatization


In [26]:
from nltk.stem import WordNetLemmatizer

In [27]:
Wn=WordNetLemmatizer()

In [29]:
# No part-of-speech is passed, so NLTK's default (pos='n', noun) applies and
# 'watching' comes back unchanged; pass pos='v' to lemmatize it as a verb.
Wn.lemmatize('watching')

'watching'

# Building a Vocab and Vectorization

In [31]:
# adventure, mystery, news, romance
sample_corpus=[
    "It could be some kind of trick Budd had thought up.",
    "I'm calling you , Mr. Nelson , at the request of Mr. Phillip Wycoff.",
    "The senate quickly whipped through its meager fare of House bills approved by committees,passing the three on the calendar.",
    "He expected Concetta's thin hand to reach down to grasp the boy,and her shrill,impetuous voice to sound against the rotundity of his disfigured flesh that was never sure of hearing anything."
]

In [32]:
#CountVectorizer: 
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
Cv=CountVectorizer()

In [35]:
vectorized_corpus=Cv.fit_transform(sample_corpus)

In [38]:
print(vectorized_corpus)

  (0, 29)	1
  (0, 14)	1
  (0, 5)	1
  (0, 46)	1
  (0, 31)	1
  (0, 36)	1
  (0, 56)	1
  (0, 8)	1
  (0, 21)	1
  (0, 52)	1
  (0, 57)	1
  (1, 36)	1
  (1, 11)	1
  (1, 62)	1
  (1, 33)	2
  (1, 34)	1
  (1, 4)	1
  (1, 50)	1
  (1, 42)	1
  (1, 39)	1
  (1, 61)	1
  (2, 36)	1
  (2, 50)	3
  (2, 44)	1
  (2, 40)	1
  :	:
  (3, 13)	1
  (3, 51)	1
  (3, 22)	1
  (3, 55)	3
  (3, 41)	1
  (3, 16)	1
  (3, 20)	1
  (3, 7)	1
  (3, 1)	1
  (3, 25)	1
  (3, 45)	1
  (3, 28)	1
  (3, 58)	1
  (3, 47)	1
  (3, 0)	1
  (3, 43)	1
  (3, 26)	1
  (3, 15)	1
  (3, 19)	1
  (3, 49)	1
  (3, 59)	1
  (3, 35)	1
  (3, 48)	1
  (3, 24)	1
  (3, 2)	1


In [39]:
vectorized_corpus.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
        1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 1,
        0, 1, 0, 1, 1, 1, 2, 1, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0]],
      dtype=int64)

In [43]:
#first line
print("Original: ",sample_corpus[0])
print("Vectorized form: ",vectorized_corpus.toarray()[0])

Original:  It could be some kind of trick Budd had thought up.
Vectorized form:  [0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0]


In [47]:
# Mapping between each unique word in the vocabulary and the column index assigned to it
print(Cv.vocabulary_)

{'it': 29, 'could': 14, 'be': 5, 'some': 46, 'kind': 31, 'of': 36, 'trick': 56, 'budd': 8, 'had': 21, 'thought': 52, 'up': 57, 'calling': 11, 'you': 62, 'mr': 33, 'nelson': 34, 'at': 4, 'the': 50, 'request': 42, 'phillip': 39, 'wycoff': 61, 'senate': 44, 'quickly': 40, 'whipped': 60, 'through': 54, 'its': 30, 'meager': 32, 'fare': 18, 'house': 27, 'bills': 6, 'approved': 3, 'by': 9, 'committees': 12, 'passing': 38, 'three': 53, 'on': 37, 'calendar': 10, 'he': 23, 'expected': 17, 'concetta': 13, 'thin': 51, 'hand': 22, 'to': 55, 'reach': 41, 'down': 16, 'grasp': 20, 'boy': 7, 'and': 1, 'her': 25, 'shrill': 45, 'impetuous': 28, 'voice': 58, 'sound': 47, 'against': 0, 'rotundity': 43, 'his': 26, 'disfigured': 15, 'flesh': 19, 'that': 49, 'was': 59, 'never': 35, 'sure': 48, 'hearing': 24, 'anything': 2}


In [52]:
# They are the same!
print("Size of Vocab: ",len(Cv.vocabulary_.keys()))
print("Size of Vectorized sentence: ",len(vectorized_corpus.toarray()[0]))


Size of Vocab:  63
Size of Vectorized sentence:  63


# Reverse Mapping

In [55]:
# NOTE: this rebinds vectorized_corpus from the sparse matrix to a dense array,
# so cells that call vectorized_corpus.toarray() will fail if re-run after this one.
vectorized_corpus=vectorized_corpus.toarray()

In [56]:
numerica_form=vectorized_corpus[2]

In [57]:
numerica_form

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [59]:
# Recover the words present in a document from its numeric (count) vector.
# Word order is lost - that is the "bag" in bag of words.
# inverse_transform expects 2-D input of shape (n_samples, n_features), so the
# single row is wrapped in a list; recent scikit-learn versions reject a bare
# 1-D array.
Cv.inverse_transform([numerica_form])

[array(['approved', 'bills', 'by', 'calendar', 'committees', 'fare',
        'house', 'its', 'meager', 'of', 'on', 'passing', 'quickly',
        'senate', 'the', 'three', 'through', 'whipped'], dtype='<U10')]

In [60]:
print(sample_corpus[2])

The senate quickly whipped through its meager fare of House bills approved by committees,passing the three on the calendar.


# Vectorization with Stopwords Removal

In [66]:
print(sentence)

Hello there! Mic testing.. 1, 2, 3 Great its working


In [64]:
def myToken(text):
    """Tokenise ``text`` for the vectorizer: lower-case, split, drop stopwords.

    Relies on the notebook-level ``tokenizer``, ``remove_sw`` and ``eng_sw``.
    Returns the list of remaining tokens.
    """
    # Lower-case first so that e.g. "Indian" and "indian" become one token.
    lowered=text.lower()
    tokens=tokenizer.tokenize(lowered)

    # Filter out the English stopwords before handing the tokens back.
    return remove_sw(tokens,eng_sw)

In [67]:
myToken(sentence)

['hello', 'mic', 'testing..', 'great', 'working']

In [68]:
Cv1=CountVectorizer(tokenizer=myToken)

In [69]:
vectorized_corpus1=Cv1.fit_transform(sample_corpus).toarray()

In [70]:
vectorized_corpus1

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
        1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0]],
      dtype=int64)

In [73]:
print(len(vectorized_corpus1[0]))

42


In [74]:
print(len(vectorized_corpus[0]))

63


In [None]:
# Vocabulary length reduced from 63 to 42 after stopword removal