## TF\*IDF For Text Mining

* Event discovery (Clustering or classification)  
* Association discovery (association rule mining)  
* Trend discovery (sequential pattern mining)  

**Text must be represented as vectors before it can be used in calculations.**

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy corpus: each string is one document (documents 0-3 in the tables below).
text = [
    "The Eiffel Tower was opened in 1889. The opening hours are restricted.",
    "The attractions are opened daily, except for the Eiffel Tower in Paris.",
    "The tallest building in Paris is the Eiffel Tower. The opening was in 1889.",
    "Gustave Eiffel was 57 years old in 1889.",
]

# Search query that is compared against the documents in the cosine-similarity cells.
query = 'opening'

# Words excluded from the vocabulary by the vectorizers (each word listed once).
stop_wd = ['is', 'the', 'for', 'of', 'a', 'an', 'and', 'are', 'at', 'be', 'but', 'have', 'in',
           'on', 'or', 'may', 'must', 'not', 'off', 'to', 'too', 'was']

## Calculate TF 
**Term Frequency (TF) = number of occurrences of a term within a document**

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Term-document count matrix: learn the vocabulary and count the terms in one pass.
vectorizer = CountVectorizer(stop_words=stop_wd)
vector = vectorizer.fit_transform(text)

# vocabulary_ maps each term to its column index in `vector`; ordering the terms
# by that index makes the row labels line up with the transposed count matrix.
keys = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Rows are terms, columns are documents (one column per entry of `text`).
TF = pd.DataFrame(vector.T.toarray(), index=keys)
TF

Unnamed: 0,0,1,2,3
1889,1,0,1,1
57,0,0,0,1
attractions,0,1,0,0
building,0,0,1,0
daily,0,1,0,0
eiffel,1,1,1,1
except,0,1,0,0
gustave,0,0,0,1
hours,1,0,0,0
old,0,0,0,1


## Calculate DF
**Document Frequency (DF) = number of documents containing the term**

In [3]:
# Document frequency: for every term (row), count the documents whose count is non-zero.
DF = TF.assign(df=TF.ne(0).sum(axis=1))
DF

Unnamed: 0,0,1,2,3,df
1889,1,0,1,1,3
57,0,0,0,1,1
attractions,0,1,0,0,1
building,0,0,1,0,1
daily,0,1,0,0,1
eiffel,1,1,1,1,4
except,0,1,0,0,1
gustave,0,0,0,1,1
hours,1,0,0,0,1
old,0,0,0,1,1


## Calculate IDF ( log10(N/df))
* **IDF = log10(N/DF)**  

**Inverse Document Frequency (IDF) = log10(number of documents / document frequency)**

In [4]:
import numpy as np

# IDF = log10(N / df), where N is the number of documents in the corpus.
IDF = DF.assign(idf=np.log10(len(text) / DF['df']))
IDF

Unnamed: 0,0,1,2,3,df,idf
1889,1,0,1,1,3,0.124939
57,0,0,0,1,1,0.60206
attractions,0,1,0,0,1,0.60206
building,0,0,1,0,1,0.60206
daily,0,1,0,0,1,0.60206
eiffel,1,1,1,1,4,0.0
except,0,1,0,0,1,0.60206
gustave,0,0,0,1,1,0.60206
hours,1,0,0,0,1,0.60206
old,0,0,0,1,1,0.60206


## Calculate TF\*IDF
* **Weight = TF \* Log10(N/DF)**

**Large TF\*IDF ⇒ a high term-frequency and a low document-frequency**

In [5]:
# Show floats with six decimals and thousands separators (global pandas display setting).
pd.options.display.float_format = '{:,.6f}'.format

# Weight every document column by the per-term IDF; the 'df' and 'idf' columns
# are carried over unchanged. The first len(text) columns are the documents.
TF_IDF = IDF.copy()
cols = IDF.columns  # also read by the document-length cell
for doc_col in cols[:len(text)]:
    TF_IDF[doc_col] = IDF[doc_col] * IDF['idf']

TF_IDF

Unnamed: 0,0,1,2,3,df,idf
1889,0.124939,0.0,0.124939,0.124939,3,0.124939
57,0.0,0.0,0.0,0.60206,1,0.60206
attractions,0.0,0.60206,0.0,0.0,1,0.60206
building,0.0,0.0,0.60206,0.0,1,0.60206
daily,0.0,0.60206,0.0,0.0,1,0.60206
eiffel,0.0,0.0,0.0,0.0,4,0.0
except,0.0,0.60206,0.0,0.0,1,0.60206
gustave,0.0,0.0,0.0,0.60206,1,0.60206
hours,0.60206,0.0,0.0,0.0,1,0.60206
old,0.0,0.0,0.0,0.60206,1,0.60206


## Calculate Length of Document 
* Document Length = sqrt(sum((TF\*IDF)^2)), summed over all terms of the document

In [6]:
# Euclidean length of each document's TF*IDF column: square root of the sum of
# the squared weights. The first len(text) columns are the documents.
norms = [
    np.sqrt((TF_IDF[doc_col] * TF_IDF[doc_col]).sum())
    for doc_col in cols[:len(text)]
]

norms

[0.9681993383106197,
 1.1332592395200975,
 0.9681993383106197,
 1.2105844128091947]

### Normalization ((TF\*IDF )/Length)
* Due to different sized documents, normalization is carried out.

In [7]:
# Scale each document column to unit length by dividing it by that document's norm.
NORM = TF_IDF.copy()
cols = NORM.columns
# norms has one entry per document, so zip() pairs it with the leading
# document columns only and leaves 'df' and 'idf' untouched.
for doc_col, doc_norm in zip(cols, norms):
    NORM[doc_col] = NORM[doc_col] / doc_norm

# Optional check: np.sqrt((NORM*NORM).sum()) gives the resulting length of every column.
NORM

Unnamed: 0,0,1,2,3,df,idf
1889,0.129042,0.0,0.129042,0.103205,3,0.124939
57,0.0,0.0,0.0,0.49733,1,0.60206
attractions,0.0,0.531264,0.0,0.0,1,0.60206
building,0.0,0.0,0.621835,0.0,1,0.60206
daily,0.0,0.531264,0.0,0.0,1,0.60206
eiffel,0.0,0.0,0.0,0.0,4,0.0
except,0.0,0.531264,0.0,0.0,1,0.60206
gustave,0.0,0.0,0.0,0.49733,1,0.60206
hours,0.621835,0.0,0.0,0.0,1,0.60206
old,0.0,0.0,0.0,0.49733,1,0.60206


## Calculate Cosine Similarity Without Normalization
* Document similarity is calculated without normalization, i.e. on the raw term-count vectors (no IDF weighting).


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Treat the query as one extra document so it is vectorized with the same vocabulary.
cos_text = text + [query]

# Learn the vocabulary and build the count matrix in a single pass.
vectorizer = CountVectorizer(stop_words=stop_wd)
vector = vectorizer.fit_transform(cos_text)

# Order the terms by their column index so they label the rows of the
# transposed count matrix correctly.
keys = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# CTF: terms x documents (the last column is the query); CTFWO: documents x terms.
CTF = pd.DataFrame(vector.T.toarray(), index=keys)
CTFWO = CTF.transpose()

# Pairwise cosine similarity between all rows (documents and query) on raw counts.
cs = cosine_similarity(CTFWO, CTFWO)
pd.DataFrame(cs)


Unnamed: 0,0,1,2,3,4
0,1.0,0.428571,0.571429,0.308607,0.377964
1,0.428571,1.0,0.428571,0.154303,0.0
2,0.571429,0.428571,1.0,0.308607,0.377964
3,0.308607,0.154303,0.308607,1.0,0.0
4,0.377964,0.0,0.377964,0.0,1.0


## Calculate Cosine Similarity With Normalization
* Document similarity is calculated with normalization, i.e. on term counts weighted by IDF.

In [9]:
# Per-term IDF weights aligned to CTF's vocabulary. A term that occurs only in
# the query has no IDF entry; give it weight 0 so it is ignored instead of
# introducing NaN, which cosine_similarity rejects.
idf_weights = NORM['idf'].reindex(CTF.index, fill_value=0.0)

# Build the weighted matrix as a new frame: CTF keeps the raw counts, so
# re-running this cell does not apply the IDF weights a second time.
CTFWI = CTF.mul(idf_weights, axis=0).transpose()

# Pairwise cosine similarity between all rows (documents and query) on IDF-weighted counts.
cs = cosine_similarity(CTFWI, CTFWI)
pd.DataFrame(cs)

Unnamed: 0,0,1,2,3,4
0,1.0,0.096816,0.129973,0.013318,0.310917
1,0.096816,1.0,0.096816,0.0,0.0
2,0.129973,0.096816,1.0,0.013318,0.310917
3,0.013318,0.0,0.013318,1.0,0.0
4,0.310917,0.0,0.310917,0.0,1.0
