# Bag of Words Pipeline
1. Get the Data/Corpus
2. Tokenisation, Stopword Removal
3. Stemming
4. Building a Vocab
5. Vectorization
6. Classification

# Tokenization

In [1]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [2]:
sentence="Hello there! My name is Isha. Today is a very pleasant day and the weather is cool. :)"

In [3]:
#getting separate sentences
t1=sent_tokenize(sentence)

In [4]:
print(t1)

['Hello there!', 'My name is Isha.', 'Today is a very pleasant day and the weather is cool.', ':)']


In [5]:
# Getting separate words/special characters
t2=word_tokenize(sentence)

In [6]:
print(t2)

['Hello', 'there', '!', 'My', 'name', 'is', 'Isha', '.', 'Today', 'is', 'a', 'very', 'pleasant', 'day', 'and', 'the', 'weather', 'is', 'cool', '.', ':', ')']


# Stopwords Removal

In [7]:
from nltk.corpus import stopwords

In [8]:
# The NLTK stopwords corpus contains lists for many languages; only the English list is used here.
# It is stored as a set so that the membership tests in remove_sw are fast.
eng_sw=set(stopwords.words('english'))

In [9]:
print(eng_sw)

{'itself', 'a', "won't", 'which', "aren't", "you're", "wasn't", 'on', 'weren', 'to', 'do', 'with', "she's", 'isn', 'down', 'an', 'under', 'then', 'ma', 'here', 'most', 'we', 'had', 'ain', 'yourselves', 'and', 'as', 'being', "shan't", 'some', 'myself', 'for', 'm', 'if', 'didn', 'own', "you'd", 'very', 'yourself', "hasn't", 'can', 'in', 'herself', 'been', 'ours', 'aren', 'that', 'again', 'off', "mightn't", 'needn', 'it', "you've", 'of', 'once', "should've", 'you', 'have', 'my', 'themselves', 'should', "didn't", 'both', 'did', 'will', 'hasn', 'them', 'wasn', 'ourselves', 'into', 'she', 'through', 'there', 'is', 'by', "mustn't", 'mustn', 'until', 'having', 'too', 'above', 'only', "weren't", 'any', 'when', 'than', 'our', 'i', 'at', 'has', "it's", 'where', "that'll", 'before', 'all', 'each', 'were', 'theirs', "doesn't", 'why', 'y', 'against', 'those', 'few', 'whom', 'hadn', "wouldn't", 'about', "hadn't", 'your', 'nor', 'her', 'up', 'how', 'but', 'its', "shouldn't", 'or', "couldn't", "isn't",

In [10]:
def remove_sw(text,sw,ignore_case=False):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (already split into words).
    sw : collection of str
        Stopwords to drop; a ``set`` gives fast membership tests.
    ignore_case : bool, default False
        If False (the original behaviour), matching is exact, so a
        capitalised token such as 'I' or 'My' survives a lower-case
        stopword list. If True, each token is lower-cased before the
        lookup (``sw`` is then expected to hold lower-case entries);
        the returned tokens keep their original casing.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
    """
    if ignore_case:
        return [token for token in text if token.lower() not in sw]
    return [token for token in text if token not in sw]

In [11]:
res=remove_sw('I am happy with my life'.split(),eng_sw)

In [12]:
res

['I', 'happy', 'life']

In [13]:
res=remove_sw('Hello there! My name is Isha. Today is a very pleasant day and the weather is cool.'.split(),eng_sw)

In [14]:
print(res)

['Hello', 'there!', 'My', 'name', 'Isha.', 'Today', 'pleasant', 'day', 'weather', 'cool.']


# Tokenisation using Regular Expression
https://www.regexpal.com/

In [15]:
sentence="Hello there! Mic testing.. 1, 2, 3 Great its working"

In [16]:
from nltk.tokenize import RegexpTokenizer

In [17]:
# Keep runs of letters, '@' and '.'; everything else (digits, spaces, ',', '!') acts as a separator.
# Because '.' is part of the character class, trailing dots stay attached to a word (e.g. 'testing..').
# This tokenizer object is reused later by myToken.
tokenizer=RegexpTokenizer('[a-zA-Z@.]+')
useful_text=tokenizer.tokenize(sentence)

In [18]:
useful_text

['Hello', 'there', 'Mic', 'testing..', 'Great', 'its', 'working']

# Stemming

Three types of stemmer:

1. Snowball
2. Porter
3. Lancaster

In [19]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [20]:
# create object of a stemmer
Ps=PorterStemmer()

In [21]:
Ps.stem('jumping')

'jump'

In [22]:
Ps.stem('lovely')

'love'

In [23]:
Ps.stem('sitting')

'sit'

In [24]:
# Multi-language stemmer - the language must be specified
Ss=SnowballStemmer('english')

In [25]:
Ss.stem('sitting')

'sit'

# Lemmatization


In [26]:
from nltk.stem import WordNetLemmatizer

In [27]:
Wn=WordNetLemmatizer()

In [28]:
# NOTE(review): the result below is the unchanged word 'watching'; presumably lemmatize()
# treats the word as a noun unless a part of speech is given (e.g. pos='v') - confirm in the NLTK docs.
Wn.lemmatize('watching')

'watching'

# Building a Vocab and Vectorization

In [29]:
# Four sample sentences; genres (presumably in this order): adventure, mystery, news, romance
sample_corpus=[
    "It could be some kind of trick Budd had thought up.",
    "I'm calling you , Mr. Nelson , at the request of Mr. Phillip Wycoff.",
    "The senate quickly whipped through its meager fare of House bills approved by committees,passing the three on the calendar.",
    "He expected Concetta's thin hand to reach down to grasp the boy,and her shrill,impetuous voice to sound against the rotundity of his disfigured flesh that was never sure of hearing anything."
]

In [30]:
#CountVectorizer: 
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
Cv=CountVectorizer()

In [32]:
vectorized_corpus=Cv.fit_transform(sample_corpus)

In [33]:
print(vectorized_corpus)

  (0, 29)	1
  (0, 14)	1
  (0, 5)	1
  (0, 46)	1
  (0, 31)	1
  (0, 36)	1
  (0, 56)	1
  (0, 8)	1
  (0, 21)	1
  (0, 52)	1
  (0, 57)	1
  (1, 36)	1
  (1, 11)	1
  (1, 62)	1
  (1, 33)	2
  (1, 34)	1
  (1, 4)	1
  (1, 50)	1
  (1, 42)	1
  (1, 39)	1
  (1, 61)	1
  (2, 36)	1
  (2, 50)	3
  (2, 44)	1
  (2, 40)	1
  :	:
  (3, 13)	1
  (3, 51)	1
  (3, 22)	1
  (3, 55)	3
  (3, 41)	1
  (3, 16)	1
  (3, 20)	1
  (3, 7)	1
  (3, 1)	1
  (3, 25)	1
  (3, 45)	1
  (3, 28)	1
  (3, 58)	1
  (3, 47)	1
  (3, 0)	1
  (3, 43)	1
  (3, 26)	1
  (3, 15)	1
  (3, 19)	1
  (3, 49)	1
  (3, 59)	1
  (3, 35)	1
  (3, 48)	1
  (3, 24)	1
  (3, 2)	1


In [34]:
vectorized_corpus.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
        1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 1,
        0, 1, 0, 1, 1, 1, 2, 1, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0]],
      dtype=int64)

In [35]:
#first line
print("Original: ",sample_corpus[0])
print("Vectorized form: ",vectorized_corpus.toarray()[0])

Original:  It could be some kind of trick Budd had thought up.
Vectorized form:  [0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0]


In [36]:
# Mapping from each unique word in the vocabulary to its column index in the count vectors
print(Cv.vocabulary_)

{'it': 29, 'could': 14, 'be': 5, 'some': 46, 'kind': 31, 'of': 36, 'trick': 56, 'budd': 8, 'had': 21, 'thought': 52, 'up': 57, 'calling': 11, 'you': 62, 'mr': 33, 'nelson': 34, 'at': 4, 'the': 50, 'request': 42, 'phillip': 39, 'wycoff': 61, 'senate': 44, 'quickly': 40, 'whipped': 60, 'through': 54, 'its': 30, 'meager': 32, 'fare': 18, 'house': 27, 'bills': 6, 'approved': 3, 'by': 9, 'committees': 12, 'passing': 38, 'three': 53, 'on': 37, 'calendar': 10, 'he': 23, 'expected': 17, 'concetta': 13, 'thin': 51, 'hand': 22, 'to': 55, 'reach': 41, 'down': 16, 'grasp': 20, 'boy': 7, 'and': 1, 'her': 25, 'shrill': 45, 'impetuous': 28, 'voice': 58, 'sound': 47, 'against': 0, 'rotundity': 43, 'his': 26, 'disfigured': 15, 'flesh': 19, 'that': 49, 'was': 59, 'never': 35, 'sure': 48, 'hearing': 24, 'anything': 2}


In [37]:
# The vocabulary has one entry per column of the count matrix, so the two sizes match.
print("Size of Vocab: ",len(Cv.vocabulary_))
print("Size of Vectorized sentence: ",vectorized_corpus.shape[1])


Size of Vocab:  63
Size of Vectorized sentence:  63


# Reverse Mapping

In [38]:
# Dense (ndarray) form of the count matrix for the reverse-mapping demo.
# It is recomputed from the already-fitted Cv rather than by calling .toarray() on the
# variable itself, so this cell can be re-run: the self-referential rebinding failed on
# a second run because an ndarray has no .toarray() method.
vectorized_corpus=Cv.transform(sample_corpus).toarray()

In [39]:
numerica_form=vectorized_corpus[2]

In [40]:
numerica_form

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [41]:
# Recover the words behind a count vector. Word order is lost - that is the "bag" in bag of words.
# inverse_transform expects a 2-D (n_samples, n_features) input, so the single row
# is reshaped to (1, n_features) before being passed in.
Cv.inverse_transform(numerica_form.reshape(1,-1))

[array(['approved', 'bills', 'by', 'calendar', 'committees', 'fare',
        'house', 'its', 'meager', 'of', 'on', 'passing', 'quickly',
        'senate', 'the', 'three', 'through', 'whipped'], dtype='<U10')]

In [42]:
print(sample_corpus[2])

The senate quickly whipped through its meager fare of House bills approved by committees,passing the three on the calendar.


# Vectorization with Stopwords Removal

In [43]:
print(sentence)

Hello there! Mic testing.. 1, 2, 3 Great its working


In [44]:
def myToken(text):
    """Tokenize ``text``: lower-case it, split it with the regex tokenizer, drop stopwords.

    Relies on the notebook-level names ``tokenizer``, ``remove_sw`` and
    ``eng_sw`` defined in earlier cells. Returns the list of remaining tokens.
    """
    # Lower-case first so that e.g. 'Indian' and 'indian' become the same token
    lowered=text.lower()
    tokens=tokenizer.tokenize(lowered)
    # Filter out the English stopwords
    return remove_sw(tokens,eng_sw)

In [45]:
myToken(sentence)

['hello', 'mic', 'testing..', 'great', 'working']

In [46]:
Cv1=CountVectorizer(tokenizer=myToken)

In [47]:
vectorized_corpus1=Cv1.fit_transform(sample_corpus).toarray()

In [48]:
vectorized_corpus1

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
        1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0]],
      dtype=int64)

In [49]:
print(len(vectorized_corpus1[0]))

42


In [50]:
print(len(vectorized_corpus[0]))

63


In [51]:
# The vector length dropped from 63 to 42, mainly because the stopwords were removed

In [52]:
Cv1.inverse_transform(vectorized_corpus1) 

[array(['budd', 'could', 'kind', 'thought', 'trick', 'up.'], dtype='<U10'),
 array(['calling', 'mr.', 'nelson', 'phillip', 'request', 'wycoff.'],
       dtype='<U10'),
 array(['approved', 'bills', 'calendar.', 'committees', 'fare', 'house',
        'meager', 'passing', 'quickly', 'senate', 'three', 'whipped'],
       dtype='<U10'),
 array(['anything.', 'boy', 'concetta', 'disfigured', 'expected', 'flesh',
        'grasp', 'hand', 'hearing', 'impetuous', 'never', 'reach',
        'rotundity', 'shrill', 'sound', 'sure', 'thin', 'voice'],
       dtype='<U10')]

In [53]:
Cv.inverse_transform(vectorized_corpus)

[array(['be', 'budd', 'could', 'had', 'it', 'kind', 'of', 'some',
        'thought', 'trick', 'up'], dtype='<U10'),
 array(['at', 'calling', 'mr', 'nelson', 'of', 'phillip', 'request', 'the',
        'wycoff', 'you'], dtype='<U10'),
 array(['approved', 'bills', 'by', 'calendar', 'committees', 'fare',
        'house', 'its', 'meager', 'of', 'on', 'passing', 'quickly',
        'senate', 'the', 'three', 'through', 'whipped'], dtype='<U10'),
 array(['against', 'and', 'anything', 'boy', 'concetta', 'disfigured',
        'down', 'expected', 'flesh', 'grasp', 'hand', 'he', 'hearing',
        'her', 'his', 'impetuous', 'never', 'of', 'reach', 'rotundity',
        'shrill', 'sound', 'sure', 'that', 'the', 'thin', 'to', 'voice',
        'was'], dtype='<U10')]

In [54]:
# Cv1 yields fewer words per sentence than the original Cv

# Classification Concepts
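A classifier must be able to tell 'good movie' apart from 'not good movie'. Single-word features cannot capture the negation, which motivates the n-gram features below.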

# Creating Features in multiple ways
- Unigram: every word as a feature
- Bigram: combine two words as a single feature
- Trigram: combine three words as a single feature
- n-grams: the general case - e.g. a range of 1 to 3 uses unigrams, bigrams and trigrams together

In [55]:
# Bigram features: every pair of adjacent words becomes one feature.
# The vectors get longer, but a negation such as 'not good' can be captured as a single feature.
Cv_bi=CountVectorizer(ngram_range=(2,2))

In [56]:
sample_corpus

['It could be some kind of trick Budd had thought up.',
 "I'm calling you , Mr. Nelson , at the request of Mr. Phillip Wycoff.",
 'The senate quickly whipped through its meager fare of House bills approved by committees,passing the three on the calendar.',
 "He expected Concetta's thin hand to reach down to grasp the boy,and her shrill,impetuous voice to sound against the rotundity of his disfigured flesh that was never sure of hearing anything."]

In [57]:
Cv_bi.fit_transform(sample_corpus).toarray()

array([[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
        0, 1, 1, 0, 0]], dtype=int64)

In [58]:
#Two features are combined
print(Cv_bi.vocabulary_)

{'it could': 27, 'could be': 12, 'be some': 4, 'some kind': 49, 'kind of': 29, 'of trick': 39, 'trick budd': 66, 'budd had': 7, 'had thought': 19, 'thought up': 60, 'calling you': 9, 'you mr': 70, 'mr nelson': 31, 'nelson at': 33, 'at the': 3, 'the request': 55, 'request of': 45, 'of mr': 38, 'mr phillip': 32, 'phillip wycoff': 42, 'the senate': 57, 'senate quickly': 47, 'quickly whipped': 43, 'whipped through': 69, 'through its': 62, 'its meager': 28, 'meager fare': 30, 'fare of': 16, 'of house': 37, 'house bills': 25, 'bills approved': 5, 'approved by': 2, 'by committees': 8, 'committees passing': 10, 'passing the': 41, 'the three': 58, 'three on': 61, 'on the': 40, 'the calendar': 54, 'he expected': 21, 'expected concetta': 15, 'concetta thin': 11, 'thin hand': 59, 'hand to': 20, 'to reach': 64, 'reach down': 44, 'down to': 14, 'to grasp': 63, 'grasp the': 18, 'the boy': 53, 'boy and': 6, 'and her': 1, 'her shrill': 23, 'shrill impetuous': 48, 'impetuous voice': 26, 'voice to': 67, 

In [59]:
# Trigram features: every run of three adjacent words becomes one feature.
# The vectors are longer than with unigrams, but a negation such as 'not good movie' can be captured as a single feature.
Cv_tri=CountVectorizer(ngram_range=(3,3))

In [60]:
Cv_tri.fit_transform(sample_corpus).toarray()

array([[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1],
       [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        0],
       [1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
        0]], dtype=int64)

In [61]:
#Three features are combined
print(Cv_tri.vocabulary_)

{'it could be': 26, 'could be some': 12, 'be some kind': 4, 'some kind of': 47, 'kind of trick': 28, 'of trick budd': 38, 'trick budd had': 62, 'budd had thought': 7, 'had thought up': 19, 'calling you mr': 9, 'you mr nelson': 66, 'mr nelson at': 30, 'nelson at the': 32, 'at the request': 3, 'the request of': 52, 'request of mr': 43, 'of mr phillip': 37, 'mr phillip wycoff': 31, 'the senate quickly': 54, 'senate quickly whipped': 45, 'quickly whipped through': 41, 'whipped through its': 65, 'through its meager': 58, 'its meager fare': 27, 'meager fare of': 29, 'fare of house': 16, 'of house bills': 36, 'house bills approved': 24, 'bills approved by': 5, 'approved by committees': 2, 'by committees passing': 8, 'committees passing the': 10, 'passing the three': 40, 'the three on': 55, 'three on the': 57, 'on the calendar': 39, 'he expected concetta': 21, 'expected concetta thin': 15, 'concetta thin hand': 11, 'thin hand to': 56, 'hand to reach': 20, 'to reach down': 60, 'reach down to': 

In [62]:
# n-gram features: ngram_range=(1,3) uses unigrams, bigrams and trigrams together
Cv_ng=CountVectorizer(ngram_range=(1,3))

In [63]:
Cv_ng.fit_transform(sample_corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 

In [64]:
# All three types present
print(Cv_ng.vocabulary_)

{'it': 82, 'could': 38, 'be': 13, 'some': 142, 'kind': 88, 'of': 105, 'trick': 184, 'budd': 22, 'had': 59, 'thought': 169, 'up': 187, 'it could': 83, 'could be': 39, 'be some': 14, 'some kind': 143, 'kind of': 89, 'of trick': 114, 'trick budd': 185, 'budd had': 23, 'had thought': 60, 'thought up': 170, 'it could be': 84, 'could be some': 40, 'be some kind': 15, 'some kind of': 144, 'kind of trick': 90, 'of trick budd': 115, 'trick budd had': 186, 'budd had thought': 24, 'had thought up': 61, 'calling': 29, 'you': 198, 'mr': 94, 'nelson': 99, 'at': 10, 'the': 154, 'request': 130, 'phillip': 122, 'wycoff': 197, 'calling you': 30, 'you mr': 199, 'mr nelson': 95, 'nelson at': 100, 'at the': 11, 'the request': 158, 'request of': 131, 'of mr': 112, 'mr phillip': 97, 'phillip wycoff': 123, 'calling you mr': 31, 'you mr nelson': 200, 'mr nelson at': 96, 'nelson at the': 101, 'at the request': 12, 'the request of': 159, 'request of mr': 132, 'of mr phillip': 113, 'mr phillip wycoff': 98, 'senat

In [65]:
# These approaches need more memory: the vocabulary is bigger, so every vector is longer as well

# TF-iDF Normalization
TF: Term Frequency and,

iDF: inverse Document Frequency

Down-weight features that occur very often: they carry little information (the more documents a term appears in, the less it tells us about any one of them).

Example:

'the' occurs frequently regardless of the corpus category, so it says little about the category.

On the other hand, 'cricket' occurs frequently in the cricket category but only rarely in other categories, so it is informative.




In [66]:
 from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
corpus=[
    'this is a good movie',
    'this was a good movie',
    'this is not a good movie'
]

In [68]:
tf_idf=TfidfVectorizer()

In [69]:
Vc=tf_idf.fit_transform(corpus).toarray()

In [70]:
tf_idf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}

In [71]:
Vc[2]

array([0.3645444 , 0.46941728, 0.3645444 , 0.61722732, 0.3645444 ,
       0.        ])

In [72]:
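# 'not' maps to index 3, and the value at index 3 (0.617) is the highest in this vector:
# 'not' appears in only this one document, so it receives the largest IDF weight.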