In [1]:
from IPython.display import clear_output

In [2]:
# %pip install numpy tqdm

%pip install datasets

clear_output()

# Conent

In this demo, we will custom implement a TF-IDF calculator/model

TF-IDF is a technique of converting words into numerical representations so they can then be further used by models to perform some task

The cool thing about TF-IDF is that they have the ability to see how important a word is and they assign important words a higher value than non-important words

This means, that stop words will get a lower value assigned to them which means we don't have to manually remove stop words or atleast that the need for it is reduced

In [3]:
import math
from collections import Counter
import string

import numpy as np
from tqdm import tqdm

from datasets import load_dataset

## Defining TF-IDF calculation functions

In [4]:
def calculate_tf(documents):

    doc_tfs = []

    for doc in tqdm(documents, desc='Calculating TFs'):
        tf_doc = {}
        word_count = len(doc.split())
        word_freq = Counter(doc.split())
        for word, freq in word_freq.items():
            tf_doc[word] = freq / word_count
        doc_tfs.append(tf_doc)

    return doc_tfs

def calculate_idf(tfs):

    token_to_idfs = {}
    doc_count = len(tfs)
    all_tokens = [token for tf in tfs for token in tf]
    for token in tqdm(all_tokens, desc='Calculating IDFs'):
        doc_containing_token = sum([1 for tf in tfs if token in tf])
        token_to_idfs[token] = math.log(doc_count / (1 + doc_containing_token))

    return token_to_idfs

def calculate_tfidf(documents):

    documents = [doc.lower() for doc in documents]

    doc_tfs = calculate_tf(documents)
    token_to_idfs = calculate_idf(doc_tfs)

    doc_tfidfs = []

    for doc_tf in tqdm(doc_tfs, desc='Calculating TF-IDFs'):

        doc_tfidf = {token: tf*token_to_idfs[token] for token, tf in doc_tf.items()}
        doc_tfidfs.append(doc_tfidf)

    return doc_tfidfs

def word_to_tf_idf_vector(word, doc_tfidfs):
    # A TF-IDF vector representation of a token/word is taken as a vector made of tf-idf of that token in all documents

    vector = [doc_tfidf.get(word.lower(), 0) for doc_tfidf in doc_tfidfs]
    vector = np.array(vector)

    return vector


## Downloading the dataset

we will train on IMDB movie reviews dataset

In [9]:
train_data = load_dataset('imdb', split='train')['text']
train_data = train_data[:500]  # shorten the data because all 25k rows take too long to train

In [10]:
# remove punctuations from the data

translator = str.maketrans('', '', string.punctuation)
train_data = list(map(lambda x: x.translate(translator), tqdm(train_data)))

100%|██████████| 500/500 [00:00<00:00, 25329.76it/s]


In [11]:
doc_tfidfs = calculate_tfidf(train_data)

Calculating TFs: 100%|██████████| 500/500 [00:00<00:00, 11820.60it/s]
Calculating IDFs: 100%|██████████| 67217/67217 [00:05<00:00, 12672.14it/s]
Calculating TF-IDFs: 100%|██████████| 500/500 [00:00<00:00, 24849.24it/s]


## Lets visualize some vectors

In [12]:
# word_to_tf_idf_vector("the", doc_tfidfs)  # for vector
print(f'Vector sum for the word "the": {word_to_tf_idf_vector("the", doc_tfidfs).sum()}')
print(f'Vector sum for the word "plot": {word_to_tf_idf_vector("plot", doc_tfidfs).sum()}')

Vector sum for the word "the": 0.16488058605906525
Vector sum for the word "plot": 1.2984830346401326


As we see above, the vector sum of the word 'the' is very smaller compared to the word 'plot' because 'the' is a stop word

we can use these vectors (without summing) as numerical representations of words to train models for other tasks