# Minimalistic Implementation of TF-IDF

In [None]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords 
from scipy import spatial
import pandas as pd

In [None]:
# Four toy documents: A, B and C share vocabulary; D has no words in common with them.
docA, docB, docC, docD = (
    "When Antony saw that Julius Caesar lay dead",
    "The world saw the demise of Julius Caesar",
    "Antony saw Julius Caesar lay dead",
    "It was him my cat",
)

In [None]:
# English stop words as a set, so each membership test below is O(1).
stop_words = set(stopwords.words('english'))

# Reuse a single tokenizer for all three documents.
tokenizer = TreebankWordTokenizer()
bowA = tokenizer.tokenize(docA)
bowB = tokenizer.tokenize(docB)
bowC = tokenizer.tokenize(docC)

# Filter every document on its own. Walking the three token lists in lockstep
# with zip() stops at the shortest list, which silently discards the trailing
# tokens of the longer documents before they are ever checked.
# NOTE(review): the membership test is case-sensitive, so capitalised tokens
# such as "When" or "The" survive if the stop-word list is all lowercase --
# confirm whether tokens should be lowercased before the comparison.
nbowA = [token for token in bowA if token not in stop_words]
nbowB = [token for token in bowB if token not in stop_words]
nbowC = [token for token in bowC if token not in stop_words]

# Document D is split on single spaces only and is not stop-word filtered.
nbowD = docD.split(" ")

In [None]:
# Vocabulary: every distinct token that occurs in at least one document.
wordSet = set(nbowA) | set(nbowB) | set(nbowC) | set(nbowD)
wordSet

In [None]:
# One independent zero-initialised count table per document, keyed by the
# full vocabulary so that all four tables share the same keys.
wordDictA, wordDictB, wordDictC, wordDictD = (
    {word: 0 for word in wordSet} for _ in range(4)
)

In [None]:
# Tally raw term counts: each token increments its slot in the count table
# belonging to the document it came from.
for tokens, counts in (
    (nbowA, wordDictA),
    (nbowB, wordDictB),
    (nbowC, wordDictC),
    (nbowD, wordDictD),
):
    for word in tokens:
        counts[word] += 1

In [None]:
# Show the raw count table of each document, in order A, B, C, D.
for counts in (wordDictA, wordDictB, wordDictC, wordDictD):
    print(counts)

In [None]:
# Document-term count matrix: one row per document, one column per word.
count_rows = [wordDictA, wordDictB, wordDictC, wordDictD]
pd.DataFrame(count_rows)

In [None]:
def computeTF(wordDict, bow):
    """Return the term frequency of every vocabulary word in one document.

    Args:
        wordDict: Mapping of word -> raw occurrence count in the document.
        bow: The document's token list; only its length is used, as the
            normalising denominator.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(bow)``.
        If ``bow`` is empty, every frequency is 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    bowCount = float(len(bow))
    if bowCount == 0:
        # An empty document has no terms, so every frequency is zero.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / bowCount for word, count in wordDict.items()}

In [None]:
# Term frequencies per document: raw counts normalised by document length.
tfBowA, tfBowB, tfBowC, tfBowD = (
    computeTF(counts, tokens)
    for counts, tokens in (
        (wordDictA, nbowA),
        (wordDictB, nbowB),
        (wordDictC, nbowC),
        (wordDictD, nbowD),
    )
)

In [None]:
# Show the term-frequency table of each document, in order A, B, C, D.
for frequencies in (tfBowA, tfBowB, tfBowC, tfBowD):
    print(frequencies)

In [None]:
def computeIDF(docList):
    """Return the inverse document frequency of every word in the corpus.

    Args:
        docList: List of dicts, one per document, each mapping
            word -> raw occurrence count in that document.

    Returns:
        A dict mapping each word seen in any document's table to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents in which the word's count is positive.
        A word that occurs in no document gets 0.0 (rather than raising
        ``ZeroDivisionError``), and an empty ``docList`` yields ``{}``.
    """
    import math

    N = len(docList)

    # Document frequency per word. Keys are gathered from every document, not
    # just the first, so tables with differing vocabularies do not KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    return {
        word: math.log10(N / float(df)) if df else 0.0
        for word, df in docFreq.items()
    }

In [None]:
# Inverse document frequencies computed over the four count tables.
count_tables = [wordDictA, wordDictB, wordDictC, wordDictD]
idfs = computeIDF(count_tables)

In [None]:
def computeTFIDF(tfBow, idfs):
    """Weight one document's term frequencies by the corpus IDF values.

    Args:
        tfBow: Mapping of word -> term frequency for a single document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every key of ``tfBow``.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [None]:
# TF-IDF weights per document: term frequency scaled by the shared IDF table.
tfidfBowA, tfidfBowB, tfidfBowC, tfidfBowD = (
    computeTFIDF(frequencies, idfs)
    for frequencies in (tfBowA, tfBowB, tfBowC, tfBowD)
)

In [None]:
# TF-IDF matrix: one row per document (A, B, C, D), one column per word.
tfidf_rows = [tfidfBowA, tfidfBowB, tfidfBowC, tfidfBowD]
D = pd.DataFrame(tfidf_rows)
D

In [None]:
# Pull each document's TF-IDF row out of the DataFrame as a plain list.
# NOTE: D is rebound here from the DataFrame to document D's vector, so this
# cell cannot be re-run without first re-running the cell that builds the
# DataFrame. All four rows are read before any name is reassigned.
A, B, C, D = [list(D.iloc[row]) for row in range(4)]

### Cosine Similarity between Doc A and B

In [None]:
# Cosine similarity is the complement of SciPy's cosine distance.
distance_ab = spatial.distance.cosine(A, B)
result = 1 - distance_ab
result

### Cosine Similarity between Doc A and C

In [None]:
# Cosine similarity is the complement of SciPy's cosine distance.
distance_ac = spatial.distance.cosine(A, C)
result = 1 - distance_ac
result

### Cosine Similarity between Doc A and D

In [None]:
# Cosine similarity is the complement of SciPy's cosine distance.
distance_ad = spatial.distance.cosine(A, D)
result = 1 - distance_ad
result