# Text Retrieval-  DF-IDF

#### To start, we’ll import the necessary libraries.

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.spatial import distance

#### Simple example.

In [20]:
documentA = 'The child head out to the pumpkin patch'
documentB = 'The man sit in a chair'

#### bag of words model. 

In [23]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
print(bagOfWordsA)
print(bagOfWordsB)

['The', 'child', 'head', 'out', 'to', 'the', 'pumpkin', 'patch']
['The', 'man', 'sit', 'in', 'a', 'chair']


#### build dictionary

In [24]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [25]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1
print(numOfWordsA)
print(numOfWordsB)

{'The': 1, 'head': 1, 'to': 1, 'chair': 0, 'child': 1, 'out': 1, 'pumpkin': 1, 'sit': 0, 'patch': 1, 'man': 0, 'a': 0, 'in': 0, 'the': 1}
{'The': 1, 'head': 0, 'to': 0, 'chair': 1, 'child': 0, 'out': 0, 'pumpkin': 0, 'sit': 1, 'patch': 0, 'man': 1, 'a': 1, 'in': 1, 'the': 0}


In [26]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [27]:
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)
print(tfA)
print(tfB)

{'The': 0.125, 'head': 0.125, 'to': 0.125, 'chair': 0.0, 'child': 0.125, 'out': 0.125, 'pumpkin': 0.125, 'sit': 0.0, 'patch': 0.125, 'man': 0.0, 'a': 0.0, 'in': 0.0, 'the': 0.125}
{'The': 0.16666666666666666, 'head': 0.0, 'to': 0.0, 'chair': 0.16666666666666666, 'child': 0.0, 'out': 0.0, 'pumpkin': 0.0, 'sit': 0.16666666666666666, 'patch': 0.0, 'man': 0.16666666666666666, 'a': 0.16666666666666666, 'in': 0.16666666666666666, 'the': 0.0}


In [28]:
def computeIDF(documents):
    import math
    N = len(documents)

    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        idfDict[word] = math.log(1+N / float(val))
    return idfDict

In [29]:
idfs = computeIDF([numOfWordsA, numOfWordsB])
print(idfs)

{'The': 0.6931471805599453, 'head': 1.0986122886681098, 'to': 1.0986122886681098, 'chair': 1.0986122886681098, 'child': 1.0986122886681098, 'out': 1.0986122886681098, 'pumpkin': 1.0986122886681098, 'sit': 1.0986122886681098, 'patch': 1.0986122886681098, 'man': 1.0986122886681098, 'a': 1.0986122886681098, 'in': 1.0986122886681098, 'the': 1.0986122886681098}


### Now you need to complete the function computeTFIDF:

In [32]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {key: tfBagOfWords.get(key,1)*idfs.get(key,1) for key in set(tfBagOfWords)|set(idfs)}
    return tfidf
    tfidf_sum=sum(tfidf.values())
    print(tfidf_sum)

In [33]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])
df.head()

Unnamed: 0,The,head,child,out,a,the,to,chair,pumpkin,sit,patch,man,in
0,0.086643,0.137327,0.137327,0.137327,0.0,0.137327,0.137327,0.0,0.137327,0.0,0.137327,0.0,0.0
1,0.115525,0.0,0.0,0.0,0.183102,0.0,0.0,0.183102,0.0,0.183102,0.0,0.183102,0.183102


In [34]:
tfidfA_v=np.array(list(tfidfA.values()))
tfidfB_v=np.array(list(tfidfB.values()))
print(tfidfA_v)
print(tfidfB_v)

[0.0866434  0.13732654 0.13732654 0.13732654 0.         0.13732654
 0.13732654 0.         0.13732654 0.         0.13732654 0.
 0.        ]
[0.11552453 0.         0.         0.         0.18310205 0.
 0.         0.18310205 0.         0.18310205 0.         0.18310205
 0.18310205]


In [35]:
score=1-distance.cosine(tfidfA_v, tfidfB_v)
print(score)

0.06299170724076686


#### Rather than manually implementing TF-IDF as above, we could use the library provided by sklearn.

In [None]:
documentA = 'run fast'
documentB = 'The man sit in a pumpkin chair'

In [14]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
print(dense.shape)
print(dense)
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df.head()

(2, 11)
[[0.         0.35300279 0.35300279 0.         0.         0.35300279
  0.35300279 0.35300279 0.         0.50232878 0.35300279]
 [0.47107781 0.         0.         0.47107781 0.47107781 0.
  0.         0.         0.47107781 0.33517574 0.        ]]


Unnamed: 0,chair,child,head,in,man,out,patch,pumpkin,sit,the,to
0,0.0,0.353003,0.353003,0.0,0.0,0.353003,0.353003,0.353003,0.0,0.502329,0.353003
1,0.471078,0.0,0.0,0.471078,0.471078,0.0,0.0,0.0,0.471078,0.335176,0.0


In [None]:
score=1-distance.cosine(dense[0,].A1, dense[1,].A1)
print(score)