A step-by-step mathematical and code-based guide to demystifying TF-IDF values by calculating them by hand on a mystic poem by Rumi.

In [15]:
import pandas as pd
import numpy as np

In [1]:
# Each line of the poem is treated as one document of the corpus.
corpus = [
    "you were born with potential",
    "you were born with goodness and trust",
    "you were born with ideals and dreams",
    "you were born with greatness",
    "you were born with wings",
    "you are not meant for crawling, so don't",
    "you have wings",
    "learn to use them and fly",
]

In [2]:
# Fit a TF-IDF vectorizer on the corpus and transform the corpus into a
# document-term matrix of TF-IDF weights.
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer()
X_train_tf_idf = tf_idf_vect.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns an ndarray, so convert it to keep `terms` a plain list.
terms = tf_idf_vect.get_feature_names_out().tolist()

In [3]:
# Show the feature names extracted from the fitted TfidfVectorizer.
terms

['and',
 'are',
 'born',
 'crawling',
 'don',
 'dreams',
 'fly',
 'for',
 'goodness',
 'greatness',
 'have',
 'ideals',
 'learn',
 'meant',
 'not',
 'potential',
 'so',
 'them',
 'to',
 'trust',
 'use',
 'were',
 'wings',
 'with',
 'you']

In [4]:
# Fit a CountVectorizer on the corpus to obtain the raw term counts per document.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns an ndarray, so convert it to keep `terms` a plain list.
terms = count_vect.get_feature_names_out().tolist()

In [5]:
# Show the feature names extracted from the fitted CountVectorizer.
terms

['and',
 'are',
 'born',
 'crawling',
 'don',
 'dreams',
 'fly',
 'for',
 'goodness',
 'greatness',
 'have',
 'ideals',
 'learn',
 'meant',
 'not',
 'potential',
 'so',
 'them',
 'to',
 'trust',
 'use',
 'were',
 'wings',
 'with',
 'you']

In [7]:
# create a dataframe from a word matrix
def dtm2df(wm, feat_names):
    """Convert a document-term matrix into a labelled DataFrame.

    Parameters
    ----------
    wm : sparse matrix or 2-D array
        Word matrix with one row per document and one column per term.
        Objects exposing ``toarray()`` are densified first; anything else
        is handed to pandas unchanged.
    feat_names : sequence of str
        Column labels, one per term.

    Returns
    -------
    pandas.DataFrame
        One row per document, indexed ``Doc0``, ``Doc1``, ...
    """
    # Build the row labels from the row count instead of iterating the matrix:
    # only the number of rows is needed, not the rows themselves.
    doc_names = ['Doc{:d}'.format(idx) for idx in range(wm.shape[0])]
    # Sparse matrices need densifying; dense arrays have no toarray() method.
    data = wm.toarray() if hasattr(wm, 'toarray') else wm
    return pd.DataFrame(data=data, index=doc_names, columns=feat_names)

def idf2df(wm, feat_names):
    """Wrap a single row of per-term values (e.g. IDF weights) in a DataFrame.

    Parameters
    ----------
    wm : 2-D array of shape (1, n_terms)
        The values to display, one per term.
    feat_names : sequence of str
        Column labels, one per term.

    Returns
    -------
    pandas.DataFrame
        A one-row frame whose only row is labelled ``0``.
    """
    # The frame always has exactly one row labelled 0, so no per-row
    # document labels are needed here.
    return pd.DataFrame(data=wm, index=[0], columns=feat_names)

In [14]:
# Label the TF-IDF matrix with document and term names so it is easy to inspect.
df_tf_idf = dtm2df(wm=X_train_tf_idf, feat_names=terms)
df_tf_idf

Unnamed: 0,and,are,born,crawling,don,dreams,fly,for,goodness,greatness,...,potential,so,them,to,trust,use,were,wings,with,you
Doc0,0.0,0.0,0.383289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.682895,0.0,0.0,0.0,0.0,0.0,0.383289,0.0,0.383289,0.304834
Doc1,0.37764,0.0,0.293087,0.0,0.0,0.0,0.0,0.0,0.522185,0.0,...,0.0,0.0,0.0,0.0,0.522185,0.0,0.293087,0.0,0.293087,0.233096
Doc2,0.37764,0.0,0.293087,0.0,0.0,0.522185,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.293087,0.0,0.293087,0.233096
Doc3,0.0,0.0,0.383289,0.0,0.0,0.0,0.0,0.0,0.0,0.682895,...,0.0,0.0,0.0,0.0,0.0,0.0,0.383289,0.0,0.383289,0.304834
Doc4,0.0,0.0,0.413022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.413022,0.616716,0.413022,0.328481
Doc5,0.0,0.372697,0.0,0.372697,0.372697,0.0,0.0,0.372697,0.0,0.0,...,0.0,0.372697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166366
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.607744,0.0,0.323703
Doc7,0.307727,0.0,0.0,0.0,0.0,0.0,0.425512,0.0,0.0,0.0,...,0.0,0.0,0.425512,0.425512,0.0,0.425512,0.0,0.0,0.0,0.0


In [18]:
# Count how many times each word appears in each document (a sentence in the case of our corpus)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns an ndarray, so convert it to keep `terms` a plain list.
terms = count_vect.get_feature_names_out().tolist()

df_count = dtm2df(X_train_counts, terms)
df_count

Unnamed: 0,and,are,born,crawling,don,dreams,fly,for,goodness,greatness,...,potential,so,them,to,trust,use,were,wings,with,you
Doc0,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,1
Doc1,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,1,1
Doc2,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
Doc3,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,1
Doc4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
Doc5,0,1,0,1,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
Doc6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
Doc7,1,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,1,0,0,0,0


In [20]:
# explore idf
# the fitted vectorizer exposes its IDF values through the idf_ attribute
# add a leading axis so the 1D IDF array becomes a single row that is easy to view as a dataframe
df_idf = idf2df(tf_idf_vect.idf_[np.newaxis, :], terms)
df_idf

Unnamed: 0,and,are,born,crawling,don,dreams,fly,for,goodness,greatness,...,potential,so,them,to,trust,use,were,wings,with,you
0,1.81093,2.504077,1.405465,2.504077,2.504077,2.504077,2.504077,2.504077,2.504077,2.504077,...,2.504077,2.504077,2.504077,2.504077,2.504077,2.504077,1.405465,2.098612,1.405465,1.117783


In [21]:
# element-wise multiplication: every document's term counts are scaled by the single row of IDF values
df_mul = df_count * df_idf.to_numpy()
df_mul

Unnamed: 0,and,are,born,crawling,don,dreams,fly,for,goodness,greatness,...,potential,so,them,to,trust,use,were,wings,with,you
Doc0,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.504077,0.0,0.0,0.0,0.0,0.0,1.405465,0.0,1.405465,1.117783
Doc1,1.81093,0.0,1.405465,0.0,0.0,0.0,0.0,0.0,2.504077,0.0,...,0.0,0.0,0.0,0.0,2.504077,0.0,1.405465,0.0,1.405465,1.117783
Doc2,1.81093,0.0,1.405465,0.0,0.0,2.504077,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.405465,0.0,1.405465,1.117783
Doc3,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,0.0,0.0,2.504077,...,0.0,0.0,0.0,0.0,0.0,0.0,1.405465,0.0,1.405465,1.117783
Doc4,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.405465,2.098612,1.405465,1.117783
Doc5,0.0,2.504077,0.0,2.504077,2.504077,0.0,0.0,2.504077,0.0,0.0,...,0.0,2.504077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.117783
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.098612,0.0,1.117783
Doc7,1.81093,0.0,0.0,0.0,0.0,0.0,2.504077,0.0,0.0,0.0,...,0.0,0.0,2.504077,2.504077,0.0,2.504077,0.0,0.0,0.0,0.0


In [22]:
from sklearn.preprocessing import Normalizer
# L2-normalise each document row; note that the values of df_mul are overwritten in place
l2_normalizer = Normalizer(norm='l2')
df_mul.iloc[:, :] = l2_normalizer.fit_transform(df_mul)
df_mul

Unnamed: 0,and,are,born,crawling,don,dreams,fly,for,goodness,greatness,...,potential,so,them,to,trust,use,were,wings,with,you
Doc0,0.0,0.0,0.383289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.682895,0.0,0.0,0.0,0.0,0.0,0.383289,0.0,0.383289,0.304834
Doc1,0.37764,0.0,0.293087,0.0,0.0,0.0,0.0,0.0,0.522185,0.0,...,0.0,0.0,0.0,0.0,0.522185,0.0,0.293087,0.0,0.293087,0.233096
Doc2,0.37764,0.0,0.293087,0.0,0.0,0.522185,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.293087,0.0,0.293087,0.233096
Doc3,0.0,0.0,0.383289,0.0,0.0,0.0,0.0,0.0,0.0,0.682895,...,0.0,0.0,0.0,0.0,0.0,0.0,0.383289,0.0,0.383289,0.304834
Doc4,0.0,0.0,0.413022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.413022,0.616716,0.413022,0.328481
Doc5,0.0,0.372697,0.0,0.372697,0.372697,0.0,0.0,0.372697,0.0,0.0,...,0.0,0.372697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166366
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.607744,0.0,0.323703
Doc7,0.307727,0.0,0.0,0.0,0.0,0.0,0.425512,0.0,0.0,0.0,...,0.0,0.0,0.425512,0.425512,0.0,0.425512,0.0,0.0,0.0,0.0


In [23]:
# Turn the raw count matrix into TF-IDF weights with TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf_idf = tf_transformer.transform(X_train_counts)
# printed as plain text; the bare `.shape` expression that used to sit here was
# never displayed (it was not the cell's last expression) and has been dropped
print(dtm2df(X_train_tf_idf, terms))

           and       are      born  crawling       don    dreams       fly  \
Doc0  0.000000  0.000000  0.383289  0.000000  0.000000  0.000000  0.000000   
Doc1  0.377640  0.000000  0.293087  0.000000  0.000000  0.000000  0.000000   
Doc2  0.377640  0.000000  0.293087  0.000000  0.000000  0.522185  0.000000   
Doc3  0.000000  0.000000  0.383289  0.000000  0.000000  0.000000  0.000000   
Doc4  0.000000  0.000000  0.413022  0.000000  0.000000  0.000000  0.000000   
Doc5  0.000000  0.372697  0.000000  0.372697  0.372697  0.000000  0.000000   
Doc6  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Doc7  0.307727  0.000000  0.000000  0.000000  0.000000  0.000000  0.425512   

           for  goodness  greatness  ...  potential        so      them  \
Doc0  0.000000  0.000000   0.000000  ...   0.682895  0.000000  0.000000   
Doc1  0.000000  0.522185   0.000000  ...   0.000000  0.000000  0.000000   
Doc2  0.000000  0.000000   0.000000  ...   0.000000  0.000000  0.000000 