# How Do Vectors Work in NLP?

In [1]:
# for Python 2: use print only as a function
from __future__ import print_function
# For analysis
import pandas as pd
import numpy as np
# For ML
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def CountVectorizer(text_documents, most_freq=None):
    """Compute the relative frequency of each token in every document.

    NOTE: this hand-rolled helper has the same name as scikit-learn's
    ``CountVectorizer`` class, so it shadows that class once this cell has
    run; the class has to be imported again before it can be used.

    Parameters
    ----------
    text_documents : list of str
        The documents to vectorize.
    most_freq : iterable of str, optional
        Tokens to compute frequencies for. Defaults to every distinct token
        found in ``text_documents``, in sorted order.

    Returns
    -------
    dict
        Maps each token to a list with one entry per document: the number of
        times the token occurs in that document divided by the document's
        token count (0.0 for a document that contains no tokens).
    """
    import re  # local import: the notebook's import cell does not provide it

    # Tokenize every document once (runs of word characters, case preserved)
    # instead of re-tokenizing inside the per-token loop.
    tokenized = [re.findall(r"\w+", document) for document in text_documents]
    if most_freq is None:
        most_freq = sorted({word for words in tokenized for word in words})

    word_values = {}
    for token in most_freq:
        sent_vector = []
        for words in tokenized:
            doc_freq = words.count(token)
            # float() keeps true division on Python 2; guard empty documents.
            sent_vector.append(doc_freq / float(len(words)) if words else 0.0)
        word_values[token] = sent_vector
    return word_values

In [3]:
# three short SMS-style messages used as the training corpus
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE!',
]
simple_train

['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

In [4]:
# import and instantiate CountVectorizer (with the default parameters).
# The import is repeated here on purpose: the function defined in cell In [2]
# reuses the name CountVectorizer and shadows the scikit-learn class that was
# imported in cell In [1].
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [5]:
# learn the 'vocabulary' of the training data (occurs in-place: vect itself is
# updated, and the value echoed below is the vectorizer's repr)
vect.fit(simple_train)

CountVectorizer()

In [6]:
# examine the fitted vocabulary
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an array, so convert it to a plain list for display)
vect.get_feature_names_out().tolist()

['cab', 'call', 'me', 'please', 'tonight', 'you']

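This is our whole vocabulary: the set of unique tokens learned from the training documents. (The collection of documents itself is what NLP literature calls the corpus.) Note that with the default settings the text was lower-cased, punctuation was dropped, and the single-character token "a" was left out.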

In [7]:
# transform training data into a 'document-term matrix':
# one row per document and one column per vocabulary term, stored sparsely
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [8]:
# convert sparse matrix to a dense matrix so the individual counts can be read
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [9]:
# for comparison: the original training messages, one per row of the matrix above

simple_train

['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

In [10]:
# examine the vocabulary and document-term matrix together
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


### Example

In [11]:
# a single unseen message used to test the fitted vectorizer
simple_test = [
    "please do not call me",
]

In [12]:
# transform testing data into a document-term matrix (using existing vocabulary);
# vect is not re-fitted here, so the columns are the same six training terms
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [13]:
# examine the vocabulary and document-term matrix together
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


**Notice that "do" and "not" do not appear in the vector: they were not part of the training vocabulary, so they are ignored.**

Summary:

- vect.fit(train) learns the vocabulary of the training data
- vect.transform(train) uses the fitted vocabulary to build a document-term matrix from the training data
- vect.transform(test) uses the fitted vocabulary to build a document-term matrix from the testing data (and ignores tokens it hasn't seen before)


# TFIDF

- TF-IDF (term frequency–inverse document frequency) is a weight that reflects how important a word is to a document within a collection (corpus) of documents

In [14]:
# two tiny example documents that differ in only two words
docA, docB = "The cat sat on my face", "The dog sat on my bed"

In [15]:
# bag of words for each document: split the text on single spaces
bowA, bowB = (doc.split(" ") for doc in (docA, docB))

In [16]:
# show the tokens of document A
print(bowA)

['The', 'cat', 'sat', 'on', 'my', 'face']


In [17]:
# joint vocabulary: every distinct token that occurs in either document
wordSet = set(bowA) | set(bowB)
wordSet

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

In [18]:
# one counter per document, with every vocabulary word starting at zero
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}

In [19]:
# before counting, every word of the joint vocabulary maps to 0
wordDictA

{'dog': 0, 'sat': 0, 'on': 0, 'bed': 0, 'The': 0, 'face': 0, 'cat': 0, 'my': 0}

In [20]:
# Count how often each vocabulary word occurs in each document.
# The dictionaries are rebuilt rather than incremented in place, so running
# this cell more than once cannot inflate the counts.
wordDictA = {word: bowA.count(word) for word in wordSet}
wordDictB = {word: bowB.count(word) for word in wordSet}

In [21]:
# word counts for document A after counting
wordDictA

{'dog': 0, 'sat': 1, 'on': 1, 'bed': 0, 'The': 1, 'face': 1, 'cat': 1, 'my': 1}

In [22]:
# word counts as a table: one row per document, one column per vocabulary word
# (pandas is already imported as pd in the notebook's first cell)
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,dog,sat,on,bed,The,face,cat,my
0,0,1,1,0,1,1,1,1
1,1,1,1,1,1,0,0,1


In [23]:
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every word for one document.

    Parameters
    ----------
    wordDict : dict
        Maps each word to the number of times it occurs in the document.
    bow : list
        The document's tokens; only its length is used, as the normaliser.

    Returns
    -------
    dict
        Maps each word of ``wordDict`` to ``count / len(bow)``. When ``bow``
        is empty every term frequency is 0.0.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document has no terms; avoid dividing by zero.
        return {word: 0.0 for word in wordDict}
    # float() keeps true division on Python 2 as well.
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [24]:
# term frequencies for each document: its word counts divided by its own length
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

In [25]:
# term frequencies of document A (words absent from A have frequency 0.0)
tfBowA

{'dog': 0.0,
 'sat': 0.16666666666666666,
 'on': 0.16666666666666666,
 'bed': 0.0,
 'The': 0.16666666666666666,
 'face': 0.16666666666666666,
 'cat': 0.16666666666666666,
 'my': 0.16666666666666666}

In [26]:
# term frequencies of document B (words absent from B have frequency 0.0)
tfBowB

{'dog': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'on': 0.16666666666666666,
 'bed': 0.16666666666666666,
 'The': 0.16666666666666666,
 'face': 0.0,
 'cat': 0.0,
 'my': 0.16666666666666666}

In [27]:
def computeIDF(docList):
    """Compute the inverse document frequency (IDF) of every word.

    Parameters
    ----------
    docList : list of dict
        One ``word -> count`` dictionary per document.

    Returns
    -------
    dict
        Maps every word that is a key in any of the dictionaries to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents in which the word's count is positive.
        A word that occurs in no document (``df == 0``) gets 0.0 instead of
        raising a division-by-zero error. An empty ``docList`` yields ``{}``.
    """
    import math

    N = len(docList)

    # Document frequency: in how many documents does each word occur?
    # Keys are collected from every document, not only the first one.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, val in docFreq.items():
        idfDict[word] = math.log10(N / float(val)) if val > 0 else 0.0

    return idfDict

In [28]:
# IDF weights computed over the two-document collection
idfs = computeIDF([wordDictA, wordDictB])

In [29]:
def computeTFIDF(tfBow, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Parameters
    ----------
    tfBow : dict
        Maps each word to its term frequency in one document.
    idfs : dict
        Maps each word to its inverse document frequency; it must contain
        every word of ``tfBow``.

    Returns
    -------
    dict
        Maps each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [30]:
# weight each document's term frequencies by the shared IDF values
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [31]:
# TF-IDF scores as a table: one row per document, one column per vocabulary word
# (pandas is already imported as pd in the notebook's first cell)
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,dog,sat,on,bed,The,face,cat,my
0,0.0,0.0,0.0,0.0,0.0,0.050172,0.050172,0.0
1,0.050172,0.0,0.0,0.050172,0.0,0.0,0.0,0.0
