In [2]:
import nltk
import math
from collections import Counter
from nltk.corpus import treebank
# Fetch every NLTK resource this notebook needs, so a fresh environment can
# run it top to bottom. Each call is a no-op when the package is already cached.
nltk.download("punkt")      # tokenizer data required by word_tokenize (used below)
nltk.download("punkt_tab")  # NOTE(review): newer NLTK releases look for this variant instead - confirm against the installed version
nltk.download("treebank")   # Penn Treebank sample used as the demo corpus

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/nazarlenisin/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [5]:
from typing import Protocol
from nltk.tokenize import word_tokenize

# Interface (structural Protocol) that a text preprocessor must satisfy
class TextPreprocessorI(Protocol):
    """Structural interface for objects that turn raw text into tokens."""

    @staticmethod
    def preprocess(text: str) -> list[str]:
        """Split ``text`` into a list of string tokens (stub; no implementation here)."""
        ...
        
        
# Implementation of a concrete text preprocessor
class TextPreprocessor:
    """Preprocessor that tokenises text with NLTK's ``word_tokenize``."""

    @staticmethod
    def preprocess(text: str) -> list[str]:
        """Return the tokens that ``word_tokenize`` produces for ``text``."""
        tokens = word_tokenize(text)
        return tokens

In [39]:
class TFIDF:
    """Compute TF-IDF weights for a corpus of raw text documents.

    Tokenisation is delegated to ``preprocessor.preprocess``. Weights are::

        tf(w, d)  = count of w in d / number of tokens in d
        idf(w)    = ln(N / (1 + df(w))) + 1
        tfidf     = tf * idf

    where ``N`` is the number of documents and ``df(w)`` is the number of
    documents containing ``w``. Only the denominator is smoothed, so a word
    that occurs in every document gets an IDF slightly below 1.
    """

    def __init__(self, preprocessor: TextPreprocessorI = TextPreprocessor):
        """Store the object whose ``preprocess(text)`` turns a document into tokens."""
        self.preprocessor = preprocessor

    def _compute_tf(self, text: list[str]) -> dict:
        """Return ``{word: relative frequency}`` for one tokenised document.

        An empty token list yields an empty dict (no division is performed).
        """
        counts = Counter(text)
        total = len(text)
        return {word: count / total for word, count in counts.items()}

    def _idf_from_tokens(self, tokenized_corpus: list[list[str]]) -> dict:
        """Return ``{word: idf}`` for a corpus that has already been tokenised."""
        n_docs = len(tokenized_corpus)

        # Document frequency in a single pass: each document contributes at
        # most 1 per distinct word. dict.fromkeys de-duplicates while keeping
        # a deterministic (first-occurrence) order.
        doc_freq = Counter()
        for tokens in tokenized_corpus:
            doc_freq.update(dict.fromkeys(tokens))

        return {word: math.log(n_docs / (1 + df)) + 1 for word, df in doc_freq.items()}

    def _compute_idf(self, corpus: list[str]) -> dict:
        """Return ``{word: idf}`` for a corpus of raw (untokenised) documents."""
        tokenized_corpus = [self.preprocessor.preprocess(text) for text in corpus]
        return self._idf_from_tokens(tokenized_corpus)

    def compute(self, corpus: list[str]) -> list[dict]:
        """Return one ``{word: tf-idf weight}`` dict per document in ``corpus``.

        ``corpus`` is a sized sequence of raw text documents. Each document is
        tokenised exactly once, and the tokens are reused for both the IDF
        table and the per-document term frequencies.
        """
        tokenized_corpus = [self.preprocessor.preprocess(text) for text in corpus]
        idf = self._idf_from_tokens(tokenized_corpus)

        tfidf = []
        for tokens in tokenized_corpus:
            tf = self._compute_tf(tokens)
            # Iterate over the distinct terms instead of every token occurrence.
            tfidf.append({word: freq * idf[word] for word, freq in tf.items()})

        return tfidf

    def __repr__(self):
        """Return a debug representation naming the configured preprocessor."""
        return f'TFIDF(preprocessor={self.preprocessor})'
    
# Demo: weight the first 50 Treebank sentences (tokens re-joined with spaces
# into plain strings) and display the vector of the first sentence.
model = TFIDF()
sentences = list(map(" ".join, treebank.sents()[:50]))
tfidf_vectors = model.compute(sentences)
tfidf_vectors[0]

{'Pierre': 0.2343819902704556,
 'Vinken': 0.21185615093111312,
 ',': 0.14761156299689288,
 '61': 0.2343819902704556,
 'years': 0.1733479742333384,
 'old': 0.19587381357268085,
 'will': 0.19587381357268085,
 'join': 0.2343819902704556,
 'the': 0.08034928347935663,
 'board': 0.2343819902704556,
 'as': 0.21185615093111312,
 'a': 0.10931022368120587,
 'nonexecutive': 0.21185615093111312,
 'director': 0.21185615093111312,
 'Nov.': 0.2343819902704556,
 '29': 0.2343819902704556,
 '.': 0.054455409594656674}