In [3]:
from typing import Protocol
from nltk.tokenize import word_tokenize

# Interface (structural protocol) that every text preprocessor must satisfy
class TextPreprocessorI(Protocol):
    """Structural interface for objects that turn a raw text into tokens."""

    @staticmethod
    def preprocess(text: str) -> list[str]:
        """Split ``text`` into a list of string tokens."""
        ...
        
        
# Implementation of a concrete text preprocessor
class TextPreprocessor:
    """Concrete preprocessor that delegates tokenization to ``word_tokenize``."""

    @staticmethod
    def preprocess(text: str) -> list[str]:
        """Return the tokens that ``word_tokenize`` produces for ``text``."""
        tokens = word_tokenize(text)
        return tokens

In [4]:
# Toy corpus: every string is one text to be vectorized.
corpus: list[str] = [
    "The dog barks.",
    "The cat meows.",
    "The dog and cat are friends.",
]

In [5]:
class BOW:
    """Bag-of-words vectorizer over a token vocabulary learned from a corpus.

    ``fit`` gives every distinct token an index in order of first appearance;
    ``transform`` maps a text to a vector of per-token occurrence counts.
    """

    def __init__(self, preprocessor: TextPreprocessorI = TextPreprocessor):
        """Store the object whose ``preprocess`` method tokenizes texts."""
        self.preprocessor = preprocessor

    def fit(self, corpus: list[str]) -> None:
        """Build ``vocab`` (token -> index) and ``inverse_vocab`` (index -> token).

        Both mappings are reset at the start of every call.
        """
        self.vocab = {}
        self.inverse_vocab = {}

        for document in corpus:
            for token in self.preprocessor.preprocess(document):
                if token in self.vocab:
                    continue
                # Indices are handed out consecutively from 0, so the next
                # free index always equals the current vocabulary size.
                position = len(self.vocab)
                self.vocab[token] = position
                self.inverse_vocab[position] = token

    def transform(self, text: str) -> list[int]:
        """Count how often each vocabulary token occurs in ``text``.

        Tokens that are not in the vocabulary are ignored. ``fit`` must have
        been called first, because the vector length is ``len(self.vocab)``.
        """
        counts = [0] * len(self.vocab)

        for token in self.preprocessor.preprocess(text):
            position = self.vocab.get(token)
            if position is not None:
                counts[position] += 1

        return counts

    def fit_transform(self, corpus: list[str]) -> list[list[int]]:
        """Fit on ``corpus`` and return the count vector of each of its texts."""
        self.fit(corpus)
        return [self.transform(document) for document in corpus]

    def __repr__(self) -> str:
        return f'BOW(preprocessor={self.preprocessor})'
    
    
# Demo: learn the vocabulary from `corpus` and display one count vector per text.
bow_model = BOW()
bow_model.fit_transform(corpus)

[[1, 1, 1, 1, 0, 0, 0, 0, 0],
 [1, 0, 0, 1, 1, 1, 0, 0, 0],
 [1, 1, 0, 1, 1, 0, 1, 1, 1]]

In [None]:
class NGramBOW(BOW):
    """Bag-of-words vectorizer whose vocabulary entries are token n-grams.

    An n-gram is a tuple of ``n`` consecutive tokens. ``fit`` indexes every
    distinct n-gram in order of first appearance; ``transform`` counts how often
    each indexed n-gram occurs in a text. ``fit_transform`` is inherited from
    ``BOW``.
    """

    def __init__(self, n: int = 2, preprocessor: TextPreprocessorI = TextPreprocessor):
        """Store the n-gram size and the preprocessor.

        Raises:
            ValueError: if ``n`` is smaller than 1. Such a window size would
                otherwise yield empty or wrapped-around slices instead of
                n-grams.
        """
        if n < 1:
            raise ValueError(f'n must be a positive integer, got {n}')

        self.preprocessor = preprocessor
        self.n = n

    def _iter_n_grams(self, tokens: list[str]):
        """Yield each window of ``self.n`` consecutive tokens as a tuple."""
        # A negative window count (text shorter than n) gives an empty range.
        for start in range(len(tokens) - self.n + 1):
            yield tuple(tokens[start:start + self.n])

    def _build_n_grams(self, text: list[str]) -> None:
        """Add the not-yet-seen n-grams of the token list ``text`` to the vocabulary."""
        for n_gram in self._iter_n_grams(text):
            if n_gram not in self.vocab:
                self.vocab[n_gram] = self._word_idx
                self.inverse_vocab[self._word_idx] = n_gram
                self._word_idx += 1

    def fit(self, corpus: list[str]) -> None:
        """Build ``vocab`` (n-gram -> index) and ``inverse_vocab`` (index -> n-gram).

        Both mappings and the running index are reset on every call.
        """
        self.vocab = {}
        self.inverse_vocab = {}
        self._word_idx = 0

        for text in corpus:
            self._build_n_grams(self.preprocessor.preprocess(text))

    def transform(self, text: str) -> list[int]:
        """Count how often each vocabulary n-gram occurs in ``text``.

        N-grams that are not in the vocabulary are ignored. ``fit`` must have
        been called first, because the vector length is ``len(self.vocab)``.
        """
        transformed_text = [0] * len(self.vocab)
        preprocessed_text = self.preprocessor.preprocess(text)

        for n_gram in self._iter_n_grams(preprocessed_text):
            if n_gram in self.vocab:
                transformed_text[self.vocab[n_gram]] += 1

        return transformed_text

    def __repr__(self) -> str:
        return f'NGramBOW(n={self.n}, preprocessor={self.preprocessor})'
    
# Demo: bigram (n=2) bag-of-words vectors for the texts in `corpus`.
model = NGramBOW(n=2)
model.fit_transform(corpus)

[[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]

In [7]:
import math
from collections import defaultdict


class PPMI:
    """Positive pointwise mutual information (PPMI) between words of a corpus.

    ``fit`` collects token counts and within-text co-occurrence counts;
    ``compute`` turns them into ``max(log2(joint / marginals), 0)`` for a pair
    of words.
    """

    def __init__(self, preprocessor: TextPreprocessorI = TextPreprocessor):
        """Store the object whose ``preprocess`` method tokenizes texts."""
        self.preprocessor = preprocessor

    def _get_pairs(self, text: list[str], idx: int = 0) -> None:
        """Update the counters with the tokens of one text.

        Every token position from ``idx`` on increments ``word_counts`` once.
        Every pair of positions ``i < j`` (both >= ``idx``) increments
        ``co_occurrence`` under both orderings of the two tokens, so the table
        stays symmetric.
        """
        for first in range(idx, len(text)):
            self.word_counts[text[first]] += 1

            for second in range(first + 1, len(text)):
                left, right = text[first], text[second]
                self.co_occurrence[(left, right)] += 1
                self.co_occurrence[(right, left)] += 1

    def fit(self, corpus: list[str]) -> None:
        """Reset all counters and rebuild them from ``corpus``.

        Sets ``co_occurrence`` ((word, word) -> count), ``word_counts``
        (word -> count) and ``paragraphs`` (number of texts in ``corpus``).
        """
        self.co_occurrence = defaultdict(int)
        self.word_counts = defaultdict(int)
        self.paragraphs = len(corpus)

        for text in corpus:
            self._get_pairs(self.preprocessor.preprocess(text))

    def compute(self, word_1: str, word_2: str) -> float:
        """Return the PPMI score of ``word_1`` and ``word_2``.

        Returns ``0.0`` when the two words never co-occur in the fitted corpus
        or when the log-ratio is negative. ``fit`` must have been called first.
        """
        # `.get` is used throughout so that a lookup never inserts a key into
        # the defaultdict counters.
        pair_count = self.co_occurrence.get((word_1, word_2), 0)
        if not pair_count:
            pair_count = self.co_occurrence.get((word_2, word_1), 0)
        if not pair_count:
            return 0.0

        count_1 = self.word_counts.get(word_1, 0)
        count_2 = self.word_counts.get(word_2, 0)
        if not count_1 or not count_2:
            return 0.0

        # NOTE(review): the joint term is normalised by the number of texts,
        # the marginal term by the vocabulary size squared. Kept as written;
        # confirm this is the intended probability estimate.
        numerator = pair_count / self.paragraphs
        # The marginal term is the product of BOTH words' counts.
        denominator = (count_1 * count_2) / len(self.word_counts) ** 2

        return max(math.log2(numerator / denominator), 0.0)

    def __repr__(self) -> str:
        return f'PPMI(preprocessor={self.preprocessor})'
    
# Demo: fit on `corpus`, then display the score of the pair ('friends', 'are').
model = PPMI()
model.fit(corpus)
model.compute('friends', 'are')

4.754887502163468