# Applications with TF-IDF

The term frequency–inverse document frequency (TF-IDF) is a measure that quantifies the importance of a word to a document within a collection of documents (a *corpus*).

The *term-frequency* of a word is the relative frequency of the term in the context of the document.

$$\text{TF}(t,d):=\frac{\text{# of times the term appears in the document}}{\text{# of terms in the document }}$$


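The *inverse document frequency* of a term $t$ is computed over the whole corpus $D$ (it does not depend on any single document) and is defined as:

$$\text{IDF}(t,D):=\log\left(\frac{\text{# of documents}}{\text{# of documents with term } t}\right)$$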

Our measure of the relative importance of a term in a document is defined as the product of its TF and IDF values.

TF-IDF is high when both the TF and IDF values are high, i.e. when a word is rare across the corpus as a whole but frequent within a single document. Conversely, a word that appears in every document has an IDF of $\log(1)=0$ and therefore a TF-IDF of zero, no matter how often it occurs.

In [None]:
import math

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
documentA = 'jurassic world was the pinnacle of human achievement'
documentB = 'human kind would be better without jurassic world'

In [None]:
documents = []

In [None]:
# Collect the raw document strings into the corpus list.
# NOTE: this cell is not idempotent — running it again appends both documents
# a second time unless the `documents = []` cell is re-run first.
documents.append(documentA)
documents.append(documentB)

In [None]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

In [None]:
len(bagOfWordsA)

8

In [None]:
1/8

0.125

In [None]:
# The vocabulary: every distinct word that occurs in either document.
corpus = set(bagOfWordsA) | set(bagOfWordsB)

In [None]:
len(corpus) # all the different features on which we train the classifier

13

In [None]:
words = dict.fromkeys(corpus, 0)

In [None]:
words

{'achievement': 0,
 'be': 0,
 'better': 0,
 'human': 0,
 'jurassic': 0,
 'kind': 0,
 'of': 0,
 'pinnacle': 0,
 'the': 0,
 'was': 0,
 'without': 0,
 'world': 0,
 'would': 0}

In [None]:
# Per-document word counts over the shared vocabulary: every vocabulary word
# gets an entry, with 0 for words that do not occur in that document.
WordsA = {term: bagOfWordsA.count(term) for term in corpus}
WordsB = {term: bagOfWordsB.count(term) for term in corpus}

In [None]:
WordsA

{'achievement': 1,
 'be': 0,
 'better': 0,
 'human': 1,
 'jurassic': 1,
 'kind': 0,
 'of': 1,
 'pinnacle': 1,
 'the': 1,
 'was': 1,
 'without': 0,
 'world': 1,
 'would': 0}

In [None]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for one document.

    Parameters
    ----------
    wordDict : dict
        Maps each vocabulary word to the number of times it occurs in the
        document.
    bagOfWords : sequence
        The tokens of the document. Only its length is used, as the
        denominator of the frequency.

    Returns
    -------
    dict
        Maps each word of ``wordDict`` to ``count / len(bagOfWords)``.
        Every frequency is 0.0 when the document is empty.
    """
    totalTerms = len(bagOfWords)
    if totalTerms == 0:
        # An empty document would otherwise raise ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / totalTerms for word, count in wordDict.items()}

In [None]:
# Term frequency is relative to the length of each document, so the second
# argument must be that document's own token list (8 tokens each), not the
# 13-entry vocabulary dict `words`.
tfA = computeTF(WordsA, bagOfWordsA)
tfB = computeTF(WordsB, bagOfWordsB)

In [None]:
tfA

{'achievement': 0.07692307692307693,
 'be': 0.0,
 'better': 0.0,
 'human': 0.07692307692307693,
 'jurassic': 0.07692307692307693,
 'kind': 0.0,
 'of': 0.07692307692307693,
 'pinnacle': 0.07692307692307693,
 'the': 0.07692307692307693,
 'was': 0.07692307692307693,
 'without': 0.0,
 'world': 0.07692307692307693,
 'would': 0.0}

In [None]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in the corpus.

    Parameters
    ----------
    documents : list of dict
        One ``word -> count`` mapping per document (for example
        ``[WordsA, WordsB]``), not the raw document strings.

    Returns
    -------
    dict
        Maps each word found in any of the mappings to ``log(N / n_t)``,
        where ``N`` is the number of documents and ``n_t`` is the number of
        documents in which the word has a positive count. A word that never
        has a positive count gets 0.0. An empty ``documents`` list gives an
        empty dict.
    """
    N = len(documents)
    docFreq = {}
    for document in documents:
        for word, count in document.items():
            # Register every word, even with a zero count, so that the result
            # covers the whole vocabulary and not only the first document's keys.
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    # Guard n == 0: the word occurs in no document, so log(N / 0) is undefined.
    return {
        word: math.log(N / n) if n > 0 else 0.0
        for word, n in docFreq.items()
    }

In [None]:
# NOTE: this cell repeats the definition of computeIDF from the previous cell;
# the later definition is the one in effect, and one of the two cells can be deleted.
def computeIDF(documents):
    """Compute the inverse document frequency of every word in the corpus.

    Parameters
    ----------
    documents : list of dict
        One ``word -> count`` mapping per document (for example
        ``[WordsA, WordsB]``), not the raw document strings.

    Returns
    -------
    dict
        Maps each word found in any of the mappings to ``log(N / n_t)``,
        where ``N`` is the number of documents and ``n_t`` is the number of
        documents in which the word has a positive count. A word that never
        has a positive count gets 0.0. An empty ``documents`` list gives an
        empty dict.
    """
    N = len(documents)
    docFreq = {}
    for document in documents:
        for word, count in document.items():
            # Register every word, even with a zero count, so that the result
            # covers the whole vocabulary and not only the first document's keys.
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    # Guard n == 0: the word occurs in no document, so log(N / 0) is undefined.
    return {
        word: math.log(N / n) if n > 0 else 0.0
        for word, n in docFreq.items()
    }

In [None]:
'world' in documents[0].split()

True

In [None]:
# computeIDF works on the per-document word-count dicts. Passing the raw
# strings in `documents` raises AttributeError, because str has no
# .keys() / .items().
idfs = computeIDF([WordsA, WordsB])

AttributeError: ignored

In [None]:
idfs

{'achievement': 0.6931471805599453,
 'be': 0.6931471805599453,
 'better': 0.6931471805599453,
 'human': 0.0,
 'jurassic': 0.0,
 'kind': 0.6931471805599453,
 'of': 0.6931471805599453,
 'pinnacle': 0.6931471805599453,
 'the': 0.6931471805599453,
 'was': 0.6931471805599453,
 'without': 0.6931471805599453,
 'world': 0.0,
 'would': 0.6931471805599453}

In [None]:
def computeTFIDF(words, idfs):
    """Combine term frequencies and inverse document frequencies.

    Parameters
    ----------
    words : dict
        Maps each word to its term frequency in one document.
    idfs : dict
        Maps each word to its inverse document frequency. It must contain
        every key of ``words``.

    Returns
    -------
    dict
        Maps each word of ``words`` to ``tf * idf``.

    Raises
    ------
    KeyError
        If a word of ``words`` is missing from ``idfs``.
    """
    # Iterate over the `words` argument; the original body read an undefined
    # name (`tfBagOfWords`) instead of its own parameter.
    return {word: tf * idfs[word] for word, tf in words.items()}

In [None]:
# Multiply TF by IDF for each document, then stack the two result dicts into a
# DataFrame: one row per document, one column per word.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])

In [None]:
df

Unnamed: 0,the,of,jurassic,was,pinnacle,achievement,would,without,kind,better,be,human,world
0,0.086643,0.086643,0.0,0.086643,0.086643,0.086643,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.086643,0.086643,0.086643,0.086643,0.086643,0.0,0.0


In [None]:
# Same two documents, this time through scikit-learn's TfidfVectorizer.
# The numbers differ from the manual computation above: with its default
# settings the vectorizer uses a smoothed IDF and L2-normalises each row.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# Column labels: the vocabulary learned by the vectorizer.
feature_names = vectorizer.get_feature_names_out()
# Convert the result to a plain list of rows so it can be shown as a DataFrame.
dense = vectors.todense()
denselist = dense.tolist()
# NOTE: this rebinds `df`, replacing the manually computed TF-IDF frame.
df = pd.DataFrame(denselist, columns=feature_names)

In [None]:
df

Unnamed: 0,achievement,be,better,human,jurassic,kind,of,pinnacle,the,was,without,world,would
0,0.391668,0.0,0.0,0.278675,0.278675,0.0,0.391668,0.391668,0.391668,0.391668,0.0,0.278675,0.0
1,0.0,0.391668,0.391668,0.278675,0.278675,0.391668,0.0,0.0,0.0,0.0,0.391668,0.278675,0.391668


---------------------------


## Application of TF-IDF to Amazon Reviews

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
#stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# this data is available via Kaggle
# this data is available via Kaggle
# NOTE(review): the path is relative and points into a 'drive/MyDrive' folder —
# presumably a mounted Google Drive in Colab; adjust it for other environments.
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC.
df = pd.read_csv('drive/MyDrive/Data Sets/amazon_reviews.csv', quoting=2 )
# Extract the ratings and text reviews
# Rows missing either field are dropped and the index is reset to 0..n-1.
data = df[['reviews.text', 'reviews.rating']].dropna().reset_index(drop=True)

# Model inputs (review text) and raw target (star rating).
reviews = data['reviews.text']
y = data['reviews.rating']

In [None]:
# for the number of stars we say 5 star is a hit and less than 5 is a miss
# Binary target: 0 for every rating below 5 stars, 1 otherwise, kept in the
# same dtype as the original rating column.
yb = (~(y < 5)).astype(y.dtype)

In [None]:
import re
# Stopword dictionary
from nltk.corpus import stopwords
nltk.download('stopwords')
# For stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Clean every review: remove punctuation, lowercase, drop English stopwords and
# stem the remaining words.
# The stopword set is built once up front; rebuilding it inside the list
# comprehension repeated that work for every single word of every review.
stop_words = set(stopwords.words('english'))
allreviews = []
for review in reviews:
    txt = re.sub('[^a-zA-Z0-9 ]','',review)
    txt = txt.lower()
    txt = txt.split()
    txt = [word for word in txt if word not in stop_words]
    txt = [stemmer.stem(word) for word in txt]
    txt = ' '.join(txt)
    allreviews.append(txt)

## The following is where we apply the TF-IDF vectorizer

In [None]:
# Fit a fresh TF-IDF vectorizer on the cleaned reviews and transform them in
# one step; this rebinds `vectorizer` and `vectors` from the toy example.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(allreviews)

In [None]:
# Build the feature table: one row per review, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
# NOTE: todense()/tolist() materialise every zero entry of the matrix, which
# can use a lot of memory when the vocabulary is large.
dense = vectors.todense()
denselist = dense.tolist()
Xreviews = pd.DataFrame(denselist, columns=feature_names)

In [None]:
Xreviews.iloc[0:3,400:410]

Unnamed: 0,ah,ahead,ai,ailment,aim,air,airconditioningheat,airplay,airreceiv,ala
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Hold out 25% of the reviews for testing; the fixed random_state makes the
# split reproducible.
Xtrain,Xtest,ytrain,ytest = train_test_split(Xreviews,yb,random_state=310,test_size=0.25)

In [None]:
# Logistic regression on the TF-IDF features.
model = LogisticRegression(random_state=310, solver='lbfgs')
model.fit(Xtrain,ytrain)
ypred = model.predict(Xtest)
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes.
cm = confusion_matrix(ytest, ypred)
pd.DataFrame(cm, columns=['Not 5', '5'], index =['Not 5', '5'])

Unnamed: 0,Not 5,5
Not 5,33,83
5,10,169


In [None]:
# Accuracy of the logistic regression on the held-out test set.
# NOTE(review): the confusion matrix shown for this model implies
# (33 + 169) / 295 ≈ 0.685, but the saved output (0.6475) equals the
# GaussianNB result (81 + 110) / 295 — it looks like a stale output from
# out-of-order execution; re-run to confirm.
model.score(Xtest,ytest)

0.6474576271186441

In [None]:
# Second classifier for comparison: Gaussian naive Bayes on the same split.
from sklearn.naive_bayes import GaussianNB
# NOTE: this rebinds `model`, so the logistic regression fitted earlier is no
# longer reachable under that name.
model = GaussianNB()
model.fit(Xtrain,ytrain)
ypred = model.predict(Xtest)
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes.
cm = confusion_matrix(ytest, ypred)
pd.DataFrame(cm, columns=['Not 5', '5'], index =['Not 5', '5'])

Unnamed: 0,Not 5,5
Not 5,81,35
5,69,110


In [None]:
model.score(Xtrain,ytrain)

0.8356009070294784

In [None]:
model.score(Xtest,ytest)

0.6474576271186441