In [1]:
import nltk
import math
from collections import Counter
from nltk.corpus import treebank
nltk.download("treebank")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [2]:
from typing import Protocol
from nltk.tokenize import word_tokenize

# Introducing Interface of text preprocessor
class TextPreprocessorI(Protocol):
    @staticmethod
    def preprocess(text: str) -> list[str]:
        ...
        
        
# Inplementation of concrete text preprocessor
class TextPreprocessor:
    @staticmethod
    def preprocess(text: str) -> list[str]:
        return word_tokenize(text)

In [21]:
import numpy as np

class TFIDF:
    def __init__(self, preprocessor: TextPreprocessorI = TextPreprocessor):
        self.preprocessor = preprocessor
        self._vocab = {}
        self._inverse_vocab = {}
        self._word_idx = 0
        
        
    def _compute_tf(self, text: list[str]) -> dict:
        words = Counter(text)
        return {word: words[word] / len(text) for word in words}
    
    
    def _compute_idf(self, corpus: list[list[str]]) -> dict:
        idf = {}
        preprocessed_corpus = [set(self.preprocessor.preprocess(text)) for text in corpus]
        
        for word in set(word for text in preprocessed_corpus for word in text):
            self._vocab[word] = self._word_idx
            self._inverse_vocab[self._word_idx] = word
            self._word_idx += 1
            df = sum(1 for text in preprocessed_corpus if word in text)
            idf[word] = math.log(len(corpus) / (1 + df)) + 1  
        return idf
    
    
    def compute(self, corpus: list[list[str]]):
        idf = self._compute_idf(corpus)
        tfidf = []
        
        for text in corpus:
            transformed_sentese = [0] * len(self._vocab)
            text = self.preprocessor.preprocess(text)
            tf = self._compute_tf(text)
            
            for word in text:
                transformed_sentese[self._vocab[word]] = tf[word] * idf[word]
                
            tfidf.append(transformed_sentese)
            # {word: tf[word] * idf[word] for word in text}
            
        return tfidf
    
    
    def __repr__(self):
        return f'TFIDF(preprocessor={self.preprocessor})'
    
model = TFIDF()
sentences = [" ".join(sent) for sent in treebank.sents()[:50]] 
tfidf_vectors  = model.compute(sentences)
np_tfidf = np.array(tfidf_vectors)

In [27]:
sentences[0], sentences[1], sentences[40]

('Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .',
 'Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .',
 "Nevertheless , said *T*-1 Brenda Malizia Negus , editor of Money Fund Report , yields `` may blip up again before they blip down '' because of recent rises in short-term interest rates .")

In [26]:
import numpy 

np.dot(np_tfidf[0], np_tfidf[1]) / np.sqrt(np.sum((np_tfidf[0] ** 2))) * np.sqrt(np.sum((np_tfidf[1] ** 2)))


0.10370626536282739

In [28]:
np.dot(np_tfidf[0], np_tfidf[40]) / np.sqrt(np.sum((np_tfidf[0] ** 2))) * np.sqrt(np.sum((np_tfidf[40] ** 2)))

0.012428576863757396