In [1]:
import math
import itertools
import pandas as pd
import numpy as np

## Create Documents

In [2]:
docA = "I love NLP"
docB = "Today I learnt NLP"
docC = "AI is the future"
docD = "I like Machine Learning"
docE = "I went to my university yesterday"
docs = [docA, docB, docC, docD, docE]

## Convert text into vectors of numbers
### Bag Of Words

In [3]:
bagOfWordsA = docA.split(" ")
bagOfWordsB = docB.split(" ")
bagOfWordsC = docC.split(" ")
bagOfWordsD = docD.split(" ")
bagOfWordsE = docE.split(" ")

### Remove duplicated words

In [4]:
bagOfWords = [bagOfWordsA, bagOfWordsB, bagOfWordsC, bagOfWordsD, bagOfWordsE]
words = set(itertools.chain(*bagOfWords))
words

{'AI',
 'I',
 'Learning',
 'Machine',
 'NLP',
 'Today',
 'future',
 'is',
 'learnt',
 'like',
 'love',
 'my',
 'the',
 'to',
 'university',
 'went',
 'yesterday'}

## Count the number of each word

### Store the number of words using dictionary

In [5]:
wordDictA = dict.fromkeys(words,0)
wordDictB = dict.fromkeys(words,0)
wordDictC = dict.fromkeys(words,0)
wordDictD = dict.fromkeys(words,0)
wordDictE = dict.fromkeys(words,0)
wordDict = [wordDictA,wordDictB, wordDictC, wordDictD, wordDictE]

In [6]:
def countWords(wordDict,bagOfWord):
    for w in bagOfWord:
        wordDict[w]+= 1
    return wordDict

In [7]:
for i in range(len(wordDict)):
    wordDict[i] = countWords(wordDict[i],bagOfWords[i])   

## Calculate TF-IDF
### Calculate TF
TF = (num of the word w) / (num of total words)

In [8]:
def countTF(wordDict,bagOfWord):
    tfDict={}
    for word,count in wordDict.items():
        tfDict[word] = count/len(bagOfWord)
    return tfDict

In [9]:
tfA = countTF(wordDictA,bagOfWordsA)
tfB = countTF(wordDictB,bagOfWordsB)
tfC = countTF(wordDictC,bagOfWordsC)
tfD = countTF(wordDictD,bagOfWordsD)
tfE = countTF(wordDictE,bagOfWordsE)

### Calculate IDF
IDF = log((num of docs/(num of docs contains w + 1))

In [10]:
def calculateIDF(wordDict):
    idfDict=dict.fromkeys(wordDict[0],0)
    N = len(wordDict)
    for wd in wordDict:
        for w,i in wd.items():
            if i>0:
                idfDict[w] += 1 

    for word,num in idfDict.items():
        idfDict[word]= np.log(N/(num+1))
    
    return idfDict

In [11]:
idf = calculateIDF(wordDict)
idf

{'my': 0.9162907318741551,
 'university': 0.9162907318741551,
 'love': 0.9162907318741551,
 'AI': 0.9162907318741551,
 'like': 0.9162907318741551,
 'learnt': 0.9162907318741551,
 'NLP': 0.5108256237659907,
 'future': 0.9162907318741551,
 'Learning': 0.9162907318741551,
 'yesterday': 0.9162907318741551,
 'I': 0.0,
 'Today': 0.9162907318741551,
 'to': 0.9162907318741551,
 'is': 0.9162907318741551,
 'Machine': 0.9162907318741551,
 'the': 0.9162907318741551,
 'went': 0.9162907318741551}

### Calculate TF-IDF
TF-IDF = TF x IDF

In [12]:
def tfIdf(tf,idf):
    tfidf={}
    for word,num in tf.items():
        tfidf[word]= num * idf[word]
    return tfidf     

In [13]:
tfidfA = tfIdf(tfA,idf)
tfidfB = tfIdf(tfB,idf)
tfidfC = tfIdf(tfC,idf)
tfidfD = tfIdf(tfD,idf)
tfidfE = tfIdf(tfE,idf)

tfidf = [tfidfA,tfidfB, tfidfC, tfidfD, tfidfE]

In [14]:
tfidf_pd = pd.DataFrame(tfidf)
tfidf_pd

Unnamed: 0,my,university,love,AI,like,learnt,NLP,future,Learning,yesterday,I,Today,to,is,Machine,the,went
0,0.0,0.0,0.30543,0.0,0.0,0.0,0.170275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.229073,0.127706,0.0,0.0,0.0,0.0,0.229073,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.229073,0.0,0.0,0.0,0.229073,0.0,0.0,0.0,0.0,0.0,0.229073,0.0,0.229073,0.0
3,0.0,0.0,0.0,0.0,0.229073,0.0,0.0,0.0,0.229073,0.0,0.0,0.0,0.0,0.0,0.229073,0.0,0.0
4,0.152715,0.152715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152715,0.0,0.0,0.152715,0.0,0.0,0.0,0.152715


## Calculate Cosine Distance

In [15]:
def cosDistance(sentence, tfIdf):
    return np.dot(sentence,tfIdf)/(np.linalg.norm(sentence) * np.linalg.norm(tfIdf))

## Sample Test

In [18]:
sentence = "AI is super fanstastic"
bagOfWordsS = sentence.split(" ")

In [19]:
bagOfWords_new = bagOfWords.copy()
bagOfWords_new.append(bagOfWordsS)

In [20]:
bagOfWords_new

[['I', 'love', 'NLP'],
 ['Today', 'I', 'learnt', 'NLP'],
 ['AI', 'is', 'the', 'future'],
 ['I', 'like', 'Machine', 'Learning'],
 ['I', 'went', 'to', 'my', 'university', 'yesterday'],
 ['AI', 'is', 'super', 'fanstastic']]

In [22]:
words_new = set(itertools.chain(*bagOfWords_new))
words_new

{'AI',
 'I',
 'Learning',
 'Machine',
 'NLP',
 'Today',
 'fanstastic',
 'future',
 'is',
 'learnt',
 'like',
 'love',
 'my',
 'super',
 'the',
 'to',
 'university',
 'went',
 'yesterday'}

In [23]:
wordDict_new = wordDict.copy()
for i in range(len(wordDict_new)):
    wordDict_new[i] = dict.fromkeys(words_new,0)
wordDictS = dict.fromkeys(words_new,0)
wordDict_new.append(wordDictS)

In [27]:
for i in range(len(wordDict_new)):
    wordDict_new[i] = countWords(wordDict_new[i],bagOfWords_new[i]) 

In [28]:
wordDict_new

[{'love': 1,
  'is': 0,
  'Machine': 0,
  'the': 0,
  'went': 0,
  'university': 0,
  'AI': 0,
  'like': 0,
  'learnt': 0,
  'Today': 0,
  'future': 0,
  'my': 0,
  'yesterday': 0,
  'NLP': 1,
  'Learning': 0,
  'super': 0,
  'fanstastic': 0,
  'I': 1,
  'to': 0},
 {'love': 0,
  'is': 0,
  'Machine': 0,
  'the': 0,
  'went': 0,
  'university': 0,
  'AI': 0,
  'like': 0,
  'learnt': 1,
  'Today': 1,
  'future': 0,
  'my': 0,
  'yesterday': 0,
  'NLP': 1,
  'Learning': 0,
  'super': 0,
  'fanstastic': 0,
  'I': 1,
  'to': 0},
 {'love': 0,
  'is': 1,
  'Machine': 0,
  'the': 1,
  'went': 0,
  'university': 0,
  'AI': 1,
  'like': 0,
  'learnt': 0,
  'Today': 0,
  'future': 1,
  'my': 0,
  'yesterday': 0,
  'NLP': 0,
  'Learning': 0,
  'super': 0,
  'fanstastic': 0,
  'I': 0,
  'to': 0},
 {'love': 0,
  'is': 0,
  'Machine': 1,
  'the': 0,
  'went': 0,
  'university': 0,
  'AI': 0,
  'like': 1,
  'learnt': 0,
  'Today': 0,
  'future': 0,
  'my': 0,
  'yesterday': 0,
  'NLP': 0,
  'Learning'

In [29]:

tfidf_new = tfidf.copy()
for i in range(len(tfidf_new)):
    tfidf_new[i] = countTF(wordDict_new[i],bagOfWords_new[i])
tfS = countTF(wordDictS,bagOfWordsS)
idfS = calculateIDF(wordDict_new)
tfidfS = tfIdf(tfS,idfS)
    

In [32]:
pd.DataFrame(tfidf_new)

Unnamed: 0,love,is,Machine,the,went,university,AI,like,learnt,Today,future,my,yesterday,NLP,Learning,super,fanstastic,I,to
0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0
2,0.0,0.25,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0
4,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.166667,0.166667


In [39]:
tfidf_new.append(tfidfS)
tfidf_new_pd = pd.DataFrame(tfidf_new)
tfidf_new_pd

Unnamed: 0,love,is,Machine,the,went,university,AI,like,learnt,Today,future,my,yesterday,NLP,Learning,super,fanstastic,I,to
0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0
2,0.0,0.25,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0
4,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.166667,0.166667
5,0.0,0.173287,0.0,0.0,0.0,0.0,0.173287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274653,0.274653,0.0,0.0


In [41]:
maxCos = 0
maxIdx = 0
for i in range(len(tfidf_new_pd)-1):
    d = cosDistance(tfidf_new_pd.iloc[len(tfidf_new_pd)-1],tfidf_new_pd.iloc[i])
    if maxCos < d:
        maxCos = d
        maxIdx = i

In [42]:
maxCos

0.37731249435895575

In [43]:
maxIdx

2

In [45]:
docC

'AI is the future'