Useful resources:

1. Step-by-step calculation with a toy example: https://towardsdatascience.com/introduction-to-nlp-part-3-tf-idf-explained-cedb1fc1f7dc
2. Theory: https://en.wikipedia.org/wiki/Tf%E2%80%93idf

Notes:<br>
Images are taken from: https://towardsdatascience.com/introduction-to-nlp-part-3-tf-idf-explained-cedb1fc1f7dc

# Terminology

![](https://drive.google.com/uc?id=1PSt5byrcHMo03bp69mRG-e846bLtgV3J)
![](https://drive.google.com/uc?id=1mXErO7hqC62RKR9AzUdJOk9ZcnymL1oq)

# Common code

Import packages and modules

In [1]:
import nltk
import math
from google.colab import data_table
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import copy
import numpy as np
from functools import reduce
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
def preprocess_text(text):
    """Turn raw text into a list of lowercase, lemmatised keywords.

    Args:
        text: The raw document as a string.

    Returns:
        A list of lemmas (lemmatised as verbs) with English stopwords
        removed, in their original order and with repetitions kept.
    """
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)

    # Lowercase and lemmatise (every token is lemmatised as a verb)
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Build the stopword set once. Calling stopwords.words() inside the
    # comprehension re-evaluated it for every lemma and then scanned the
    # resulting list linearly.
    stop_words = set(stopwords.words('english'))

    # Remove stopwords
    keywords = [lemma for lemma in lemmas if lemma not in stop_words]
    return keywords

# Sklearn implementation

## Train

In [3]:
# Create a dataframe
d1 = 'I thought, I thought of thinking of thanking you for the gift'
d2 = 'She was thinking of going to go and get you a GIFT!'
X_train = pd.DataFrame({'text': [d1, d2]})
print(X_train)

                                                text
0  I thought, I thought of thinking of thanking y...
1  She was thinking of going to go and get you a ...


This is what the preprocessed text looks like

In [4]:
X_train['preprocessed'] = X_train.apply(lambda row: preprocess_text(row['text']), axis=1)
print(X_train['preprocessed'])

0    [think, think, think, thank, gift]
1            [think, go, go, get, gift]
Name: preprocessed, dtype: object


In [5]:
# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=preprocess_text)
# Fit to the data and transform to feature matrix
vectorized = vectorizer.fit_transform(X_train['text'])
print('(doc_id, token_id)   tf-idf value')
print(vectorized)

(doc_id, token_id)   tf-idf value
  (0, 1)	0.2889723007058112
  (0, 3)	0.40614048585175794
  (0, 4)	0.8669169021174337
  (1, 0)	0.4078241041497786
  (1, 2)	0.8156482082995572
  (1, 1)	0.29017020899133733
  (1, 4)	0.29017020899133733


Vocabulary

In [6]:
print(vectorizer.vocabulary_.items())

dict_items([('think', 4), ('thank', 3), ('gift', 1), ('go', 2), ('get', 0)])


In [7]:
# Convert sparse matrix to dataframe
X_train_df = pd.DataFrame.sparse.from_spmatrix(vectorized)
print(X_train_df)

          0         1         2        3         4
0  0.000000  0.288972  0.000000  0.40614  0.866917
1  0.407824  0.290170  0.815648  0.00000  0.290170


In [8]:
# Map every column index back to the token it stands for
col_map = {index: token for token, index in vectorizer.vocabulary_.items()}

# Rename all columns in a single call. Labels that are not keys of the
# mapping are left untouched, so re-running this cell is harmless (the
# previous per-column in-place loop raised KeyError on a second run because
# the columns were already token strings).
X_train_df = X_train_df.rename(columns=col_map)
X_train_df

Unnamed: 0,get,gift,go,thank,think
0,0.0,0.288972,0.0,0.40614,0.866917
1,0.407824,0.29017,0.815648,0.0,0.29017


## Test

In [9]:
d3 = 'He thinks he will go!'
d4 = 'They don’t know what to buy!'
# Create dataframe
X_test = pd.DataFrame({'text': [d3, d4]})
# Transform to feature matrix
vectorized = vectorizer.transform(X_test['text']) # already trained vectorizer!
# Convert sparse matrix to dataframe
X_test = pd.DataFrame.sparse.from_spmatrix(vectorized)
# Add column names to make it more readable. A single rename with the whole
# mapping replaces the per-column in-place loop and ignores labels that are
# not in the mapping.
X_test = X_test.rename(columns=col_map)
X_test

Unnamed: 0,get,gift,go,thank,think
0,0.0,0.0,0.814802,0.0,0.579739
1,0.0,0.0,0.0,0.0,0.0


# TF-IDF from scratch - step by step

## Train

Define documents

In [10]:
d1 = 'I thought, I thought of thinking of thanking you for the gift'
d2 = 'She was thinking of going to go and get you a GIFT!'

X_train = pd.DataFrame({'text': [d1, d2]})
print(X_train['text'])

0    I thought, I thought of thinking of thanking y...
1    She was thinking of going to go and get you a ...
Name: text, dtype: object


Tokenize documents

In [11]:
X_train['preprocessed'] = X_train.apply(lambda row: preprocess_text(row['text']), axis=1)
print(X_train['preprocessed'])

0    [think, think, think, thank, gift]
1            [think, go, go, get, gift]
Name: preprocessed, dtype: object


Create vocabulary

In [12]:
def get_vocabulary(X_train):
    """Build the sorted vocabulary and both index mappings.

    Args:
        X_train: DataFrame with a 'preprocessed' column holding one list of
            tokens per document.

    Returns:
        A tuple (vocabulary, token_to_idx, idx_to_token) where vocabulary is
        the alphabetically sorted list of distinct tokens, token_to_idx maps
        each token to its position in that list and idx_to_token is the
        inverse mapping. All three are empty when there are no documents.
    """
    # set().union(*...) also works for zero documents, whereas reduce()
    # without an initial value raises TypeError on an empty sequence.
    vocabulary = sorted(set().union(*X_train['preprocessed']))
    token_to_idx = {token: index for index, token in enumerate(vocabulary)}
    idx_to_token = dict(enumerate(vocabulary))
    return vocabulary, token_to_idx, idx_to_token

vocabulary, token_to_idx, idx_to_token = get_vocabulary(X_train)

print(vocabulary)
print(token_to_idx)
print(idx_to_token)

['get', 'gift', 'go', 'thank', 'think']
{'get': 0, 'gift': 1, 'go': 2, 'thank': 3, 'think': 4}
{0: 'get', 1: 'gift', 2: 'go', 3: 'thank', 4: 'think'}


Count token repetition per document

In [13]:
X_train['token_count'] = X_train.apply(lambda row: Counter(row['preprocessed']), axis=1)
print(X_train['token_count'])

0           {'think': 3, 'thank': 1, 'gift': 1}
1    {'think': 1, 'go': 2, 'get': 1, 'gift': 1}
Name: token_count, dtype: object


![](https://drive.google.com/uc?id=1i2_iLeggL9HfvQ-mCqWCV5QErsHKByHm)

For each term in vocabulary, per document

In [14]:
def get_tf_raw(row, idx_to_token):
    """Count how often each vocabulary token occurs in one document.

    Args:
        row: Mapping-like row whose 'token_count' entry maps token -> count
            (a Counter or a plain dict).
        idx_to_token: Mapping from vocabulary index to token.

    Returns:
        A list with one count per vocabulary index; tokens that do not occur
        in the document get 0, and tokens outside the vocabulary are ignored.
    """
    token_count = row['token_count']
    tf_raw = [0] * len(idx_to_token)
    for idx, token in idx_to_token.items():
        # .get() returns 0 for absent tokens on both Counter and plain dict;
        # plain indexing only worked for Counter.
        tf_raw[idx] = token_count.get(token, 0)
    return tf_raw

X_train['tf_raw'] = X_train.apply(lambda row: get_tf_raw(row, idx_to_token), axis=1)

print(X_train['tf_raw'])
print()
print(pd.DataFrame.from_records(data=X_train['tf_raw'].values, columns=vocabulary))

0    [0, 1, 0, 1, 3]
1    [1, 1, 2, 0, 1]
Name: tf_raw, dtype: object

   get  gift  go  thank  think
0    0     1   0      1      3
1    1     1   2      0      1


![](https://drive.google.com/uc?id=19gmz9zxGAcd264N_ebxRp5H_D2C73fWo)

What proportion does a term represent in a document?

$tf = \frac{tf_{raw}}{\text{len(tokenized document)}}$

In [15]:
def get_tf(row):
    """Convert raw term counts into proportions of the document.

    Args:
        row: Mapping-like row whose 'tf_raw' entry is the list of raw counts
            per vocabulary term.

    Returns:
        A list of floats: each raw count divided by the sum of all raw
        counts. If that sum is 0 (no vocabulary term occurs in the document)
        a list of zeros is returned instead of dividing by zero.
    """
    counts = row['tf_raw']
    total = sum(counts)
    if total == 0:
        # Dividing by zero would fill the vector with NaN; an all-zero vector
        # matches how get_tf_idf treats a zero divider.
        return [0.0] * len(counts)
    return (np.array(counts) / total).tolist()

X_train['tf'] = X_train.apply(lambda row: get_tf(row), axis=1)

print(X_train['tf'])
print()
print(pd.DataFrame.from_records(data=X_train['tf'].values, columns=vocabulary))

0    [0.0, 0.2, 0.0, 0.2, 0.6]
1    [0.2, 0.2, 0.4, 0.0, 0.2]
Name: tf, dtype: object

   get  gift   go  thank  think
0  0.0   0.2  0.0    0.2    0.6
1  0.2   0.2  0.4    0.0    0.2


![](https://drive.google.com/uc?id=1Y7x4rqNnCr_LvG_Nb-GkPcCy2JYS10hF)

How many documents contain the particular term?

In [16]:
def get_df(idx_to_token, X_train):
    """Compute the document frequency of every vocabulary term.

    Args:
        idx_to_token: Mapping from vocabulary index to token.
        X_train: DataFrame with a 'token_count' column holding one
            token -> count mapping per document.

    Returns:
        A list of plain ints, one per vocabulary index, giving the number of
        documents whose 'token_count' contains that token.
    """
    # One pass over the documents instead of one row-wise DataFrame.apply per
    # vocabulary term; each document contributes at most 1 per token.
    docs_containing = Counter()
    for token_count in X_train['token_count']:
        docs_containing.update(token_count.keys())

    df = [0] * len(idx_to_token)
    for idx, token in idx_to_token.items():
        df[idx] = docs_containing[token]
    return df

df = get_df(idx_to_token, X_train)

print('df =', df)
print()
print(pd.DataFrame.from_records(data=np.array([df]), columns=vocabulary))

df = [1, 2, 1, 1, 2]

   get  gift  go  thank  think
0    1     2   1      1      2


![](https://drive.google.com/uc?id=1HskZsW95dI-gukuyKCTF1yHibmNx0Ngz)

![](https://drive.google.com/uc?id=1p2WuKQ_ukDaoDaRFm9B06GsGHqvkUWuS)

n stands for number of documents

In [17]:
def get_idf(df, X_train, idx_to_token):
    """Compute the smoothed inverse document frequency per vocabulary term.

    Args:
        df: Document frequencies, indexable by vocabulary index.
        X_train: Training documents; only its length (the number of
            documents n) is used.
        idx_to_token: Mapping whose keys are the vocabulary indices.

    Returns:
        A list where entry i equals ln((n + 1) / (df[i] + 1)) + 1.
    """
    doc_total = len(X_train)
    smoothed = [0] * len(idx_to_token)
    for position in idx_to_token:
        ratio = (doc_total + 1) / (df[position] + 1)
        smoothed[position] = math.log(ratio) + 1
    return smoothed

idf = get_idf(df, X_train, idx_to_token)

print(idf)
print()
print(pd.DataFrame.from_records(data=np.array([idf]), columns=vocabulary))

[1.4054651081081644, 1.0, 1.4054651081081644, 1.4054651081081644, 1.0]

        get  gift        go     thank  think
0  1.405465   1.0  1.405465  1.405465    1.0


![](https://drive.google.com/uc?id=113SUGovyxN0kSjiz-ZjwNtIz9CS1CSa5)

![](https://drive.google.com/uc?id=1ccYIZQkB5cxhy5BWi6xHc7RWq9Ncwbv7)

In [18]:
def get_tf_idf_raw(row, idf):
    """Weight a document's raw term counts by the idf values.

    Args:
        row: Mapping-like row whose 'tf_raw' entry lists the raw counts.
        idf: Idf weights in the same vocabulary order.

    Returns:
        A list of element-wise products of raw count and idf, truncated to
        the shorter of the two inputs.
    """
    weighted = []
    for count, weight in zip(row['tf_raw'], idf):
        weighted.append(count * weight)
    return weighted

X_train['tf-idf_raw'] = X_train.apply(lambda row: get_tf_idf_raw(row, idf) , axis=1)

print(X_train['tf-idf_raw'])
print()
print(pd.DataFrame.from_records(data=X_train['tf-idf_raw'].values, columns=vocabulary))

0             [0.0, 1.0, 0.0, 1.4054651081081644, 3.0]
1    [1.4054651081081644, 1.0, 2.8109302162163288, ...
Name: tf-idf_raw, dtype: object

        get  gift       go     thank  think
0  0.000000   1.0  0.00000  1.405465    3.0
1  1.405465   1.0  2.81093  0.000000    1.0


![](https://drive.google.com/uc?id=1eStnC3zaaNRRmeMa4219t8CysH62MRjK)
![](https://drive.google.com/uc?id=1eK3Z0v2DYAci9YLYXx-ZIMG9YZzGFwLy)

In [19]:
def get_tf_idf_divider(row):
    """Return the Euclidean (L2) norm of a document's raw tf-idf vector.

    Args:
        row: Mapping-like row whose 'tf-idf_raw' entry lists the raw tf-idf
            values.

    Returns:
        The square root of the sum of squared values; 0.0 for an empty
        vector.
    """
    # Accumulate from 0.0 so an empty vector yields 0.0; reduce() without an
    # initial value raised TypeError in that case. A plain left-to-right loop
    # keeps the summation order of the original.
    squared_sum = 0.0
    for value in row['tf-idf_raw']:
        squared_sum += math.pow(value, 2)
    return math.sqrt(squared_sum)

def get_tf_idf(row):
    """Normalise a document's raw tf-idf vector by its divider.

    Args:
        row: Mapping-like row with 'tf-idf_raw' (list of raw tf-idf values)
            and 'tf-idf_divider' (the value to divide by).

    Returns:
        A list of floats: each raw value divided by the divider, or all
        zeros when the divider is 0.
    """
    divider = row['tf-idf_divider']
    if divider == 0:
        # Return floats (not int 0) so every row has the same element type.
        return [0.0] * len(row['tf-idf_raw'])
    return [value / divider for value in row['tf-idf_raw']]

X_train['tf-idf_divider'] = X_train.apply(lambda row: get_tf_idf_divider(row), axis=1)
X_train['tf-idf'] = X_train.apply(lambda row: get_tf_idf(row), axis=1)

print(X_train['tf-idf'])
print()
pd.DataFrame.from_records(data=X_train['tf-idf'].values, columns=vocabulary)

0    [0.0, 0.2889723007058112, 0.0, 0.4061404858517...
1    [0.4078241041497786, 0.29017020899133733, 0.81...
Name: tf-idf, dtype: object



Unnamed: 0,get,gift,go,thank,think
0,0.0,0.288972,0.0,0.40614,0.866917
1,0.407824,0.29017,0.815648,0.0,0.29017


## Test

**Uses the vocabulary and IDF from training!**<br>
Everything else should be recalculated using the test data.

In [20]:
# init
d1 = 'He thinks he will go!'
d2 = 'They don’t know what to buy!'
X_test = pd.DataFrame({'text': [d1, d2]})
print(X_test['text'])

0           He thinks he will go!
1    They don’t know what to buy!
Name: text, dtype: object


In [21]:
# preprocess
X_test['preprocessed'] = X_test.apply(lambda row: preprocess_text(row['text']), axis=1)
print(X_test['preprocessed'])

0    [think, go]
1    [know, buy]
Name: preprocessed, dtype: object


In [22]:
# token count
X_test['token_count'] = X_test.apply(lambda row: Counter(row['preprocessed']), axis=1)
print(X_test['token_count'])

0    {'think': 1, 'go': 1}
1    {'know': 1, 'buy': 1}
Name: token_count, dtype: object


In [23]:
# tf_raw = token count per document
X_test['tf_raw'] = X_test.apply(lambda row: get_tf_raw(row, idx_to_token), axis=1) # internally uses the vocabulary from training!

print(X_test['tf_raw'])
print()
print(pd.DataFrame.from_records(data=X_test['tf_raw'].values, columns=vocabulary))

0    [0, 0, 1, 0, 1]
1    [0, 0, 0, 0, 0]
Name: tf_raw, dtype: object

   get  gift  go  thank  think
0    0     0   1      0      1
1    0     0   0      0      0


In [24]:
# tf-idf raw
X_test['tf-idf_raw'] = X_test.apply(lambda row: get_tf_idf_raw(row, idf) , axis=1) # idf from training!

print(X_test['tf-idf_raw'])
print()
print(pd.DataFrame.from_records(data=X_test['tf-idf_raw'].values, columns=vocabulary))

0    [0.0, 0.0, 1.4054651081081644, 0.0, 1.0]
1                   [0.0, 0.0, 0.0, 0.0, 0.0]
Name: tf-idf_raw, dtype: object

   get  gift        go  thank  think
0  0.0   0.0  1.405465    0.0    1.0
1  0.0   0.0  0.000000    0.0    0.0


In [25]:
# tf-idf
X_test['tf-idf_divider'] = X_test.apply(lambda row: get_tf_idf_divider(row), axis=1)
X_test['tf-idf'] = X_test.apply(lambda row: get_tf_idf(row), axis=1)

print(X_test['tf-idf'])
print()
pd.DataFrame.from_records(data=X_test['tf-idf'].values, columns=vocabulary)

0    [0.0, 0.0, 0.8148024746671689, 0.0, 0.57973867...
1                                      [0, 0, 0, 0, 0]
Name: tf-idf, dtype: object



Unnamed: 0,get,gift,go,thank,think
0,0.0,0.0,0.814802,0.0,0.579739
1,0.0,0.0,0.0,0.0,0.0


# Custom TF-IDF vectorizer

## Custom Vectorizer

In [None]:
class MyTfidfVectorizer():
    """Minimal TF-IDF vectorizer built from the step-by-step helper functions.

    Attributes:
        vocabulary: Sorted list of tokens learned by fit().
        token_to_idx: Mapping token -> vocabulary index.
        idx_to_token: Mapping vocabulary index -> token.
        analyzer: Callable turning a text into a list of tokens.
        idf: List of idf weights learned by fit(); None until fitted.
    """

    def __init__(self, analyzer=None):
        """Create an unfitted vectorizer.

        Args:
            analyzer: Optional callable mapping a string to a list of tokens.
                When omitted, text is lowercased and split on runs of word
                characters.
        """
        # Keep learned state on the instance rather than in mutable
        # class-level attributes, which every instance would share.
        self.vocabulary = []
        self.token_to_idx = {}
        self.idx_to_token = {}
        self.idf = None
        self.analyzer = analyzer if analyzer is not None else self._default_analyzer

    def fit_transform(self, data: pd.Series):
        """Fit on `data` and return its tf-idf vectors (see transform)."""
        self.fit(data)
        return self.transform(data)

    def fit(self, train_data: pd.Series):
        """Learn the vocabulary and idf weights from the training texts.

        Args:
            train_data: Series of raw text documents.

        Returns:
            The vectorizer itself, so calls can be chained.
        """
        X_train = train_data.to_frame(name='text')

        X_train['preprocessed'] = X_train.apply(lambda row: self.analyzer(row['text']), axis=1)
        self.vocabulary, self.token_to_idx, self.idx_to_token = get_vocabulary(X_train)
        X_train['token_count'] = X_train.apply(lambda row: Counter(row['preprocessed']), axis=1)

        df = get_df(self.idx_to_token, X_train)
        self.idf = get_idf(df, X_train, self.idx_to_token)
        return self

    def transform(self, data: pd.Series):
        """Convert texts to tf-idf vectors using the fitted vocabulary and idf.

        Args:
            data: Series of raw text documents.

        Returns:
            A Series with one list per document: the raw tf-idf values
            divided by their Euclidean norm, in vocabulary order.

        Raises:
            ValueError: If called before fit().
        """
        if self.idf is None:
            # Fail with a clear message instead of an obscure TypeError from
            # zipping the counts with None further down.
            raise ValueError('MyTfidfVectorizer is not fitted yet; call fit() before transform().')

        X_test = data.to_frame(name='text')

        X_test['preprocessed'] = X_test.apply(lambda row: self.analyzer(row['text']), axis=1)
        X_test['token_count'] = X_test.apply(lambda row: Counter(row['preprocessed']), axis=1)
        # Counts are taken against the training vocabulary only.
        X_test['tf_raw'] = X_test.apply(lambda row: get_tf_raw(row, self.idx_to_token), axis=1)

        X_test['tf-idf_raw'] = X_test.apply(lambda row: get_tf_idf_raw(row, self.idf) , axis=1)
        X_test['tf-idf_divider'] = X_test.apply(lambda row: get_tf_idf_divider(row), axis=1)
        X_test['tf-idf'] = X_test.apply(lambda row: get_tf_idf(row), axis=1)
        return X_test['tf-idf']

    def _default_analyzer(self, text):
        """Lowercase `text` and split it into word tokens, dropping punctuation."""
        # Tokenise words while ignoring punctuation
        tokeniser = RegexpTokenizer(r'\w+')
        return tokeniser.tokenize(text.lower())

## Train

In [None]:
d1 = 'I thought, I thought of thinking of thanking you for the gift'
d2 = 'She was thinking of going to go and get you a GIFT!'

X_train = pd.DataFrame({'text': [d1, d2]})
print(X_train['text'])

0    I thought, I thought of thinking of thanking y...
1    She was thinking of going to go and get you a ...
Name: text, dtype: object


In [None]:
# Create an instance of TfidfVectorizer
vectorizer = MyTfidfVectorizer(analyzer=preprocess_text)
# Fit to the data and transform to feature matrix
vectorized = vectorizer.fit_transform(X_train['text'])

data_table.DataTable(pd.DataFrame.from_records(data=vectorized.values, columns=vectorizer.vocabulary))

Unnamed: 0,get,gift,go,thank,think
0,0.0,0.288972,0.0,0.40614,0.866917
1,0.407824,0.29017,0.815648,0.0,0.29017


## Test

In [None]:
d3 = 'He thinks he will go!'
d4 = 'They don’t know what to buy!'

X_test = pd.DataFrame({'text': [d3, d4]})
print(X_test['text'])

0           He thinks he will go!
1    They don’t know what to buy!
Name: text, dtype: object


In [None]:
# transform
vectorized = vectorizer.transform(X_test['text'])

data_table.DataTable(pd.DataFrame.from_records(data=vectorized.values, columns=vectorizer.vocabulary))

Unnamed: 0,get,gift,go,thank,think
0,0.0,0.0,0.814802,0.0,0.579739
1,0.0,0.0,0.0,0.0,0.0
