* Task 0 | Task 1


In [1]:
from typing import Protocol
from nltk.tokenize import word_tokenize

# Interface that every text preprocessor must implement
class TextPreprocessorI(Protocol):
    """Structural interface for objects that turn raw text into tokens.

    Any class exposing a compatible static ``preprocess`` satisfies this
    protocol; explicit inheritance is not required.
    """

    @staticmethod
    def preprocess(text: str) -> list[str]:
        """Convert ``text`` into a list of string tokens (stub, no behavior)."""
        
        
# Implementation of a concrete text preprocessor
class TextPreprocessor:
    """Default preprocessor that delegates tokenization to ``word_tokenize``."""

    @staticmethod
    def preprocess(text: str) -> list[str]:
        """Return the tokens that ``word_tokenize`` produces for ``text``."""
        tokens = word_tokenize(text)
        return tokens


In [2]:
# Three short passages about machine learning, one string per document.
corpus = [
    (
        'Machine learning is revolutionizing industries by enabling computers '
        'to learn from data and make intelligent decisions'
    ),
    (
        'Ever wondered how Netflix knows what to recommend? '
        'That’s supervised learning! '
        'But how does an algorithm find patterns in customer behavior without labels? '
        'That’s unsupervised learning. '
        'Explore the differences and their real-world applications!'
    ),
    (
        # The leading space is part of the original data and is kept as-is.
        ' Neural networks mimic the human brain, enabling AI to recognize speech, '
        'translate languages, and even generate realistic images. '
        'Deep learning is unlocking new possibilities in healthcare, finance, and entertainment. '
        'Are you keeping up with the AI revolution?'
    ),
]

In [27]:
from collections import defaultdict

class NGramsModel:
    """Greedy next-word generator built on n-gram counts.

    The last ``n`` tokens of a text form the context. The text is continued
    with the token that most often followed that context in the fitted corpus.
    Ties are resolved in favour of the continuation seen first in the corpus,
    so results are reproducible across runs.

    Attributes set by ``fit``:
        vocab: every token seen in the corpus plus the ``'<eos>'`` marker.
        context: counts of every ``n``-token window.
        phrases: counts of every ``n + 1``-token window (context + next token).
    """

    # Marker appended to every tokenized document to signal its end.
    _EOS = '<eos>'

    def __init__(self, n: int = 2, text_preprocessor: TextPreprocessorI = TextPreprocessor):
        """Store the context size and the tokenizer.

        Args:
            n: number of trailing tokens used as context; must be >= 1.
            text_preprocessor: object whose ``preprocess(text)`` returns tokens.

        Raises:
            ValueError: if ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f'n has to be a positive integer, got {n}')
        self.n = n
        self.text_preprocessor = text_preprocessor

    def _build_vocab(self, corpus: list[str]) -> None:
        """Tokenize every document, filling ``vocab`` and ``_tokenized_text``."""
        self.vocab = set()
        self._tokenized_text = []

        for text in corpus:
            processed_text = self.text_preprocessor.preprocess(text)
            self.vocab.update(processed_text)
            self._tokenized_text.append(processed_text)

        self.vocab.add(self._EOS)

    @staticmethod
    def _build_n_grams(n: int, corpus: list[list[str]]) -> dict[tuple[str, ...], int]:
        """Count every ``n``-token window of each document followed by ``'<eos>'``.

        The input lists are left untouched: the end marker is added to a copy,
        so calling this repeatedly on the same corpus gives the same counts.

        Args:
            n: window size.
            corpus: tokenized documents.

        Returns:
            A ``defaultdict(int)`` mapping each window (as a tuple) to its count.
        """
        n_grams = defaultdict(int)

        for text in corpus:
            # Copy instead of ``text.append`` so the caller's list is not mutated.
            tokens = [*text, NGramsModel._EOS]
            for start in range(len(tokens) - n + 1):
                n_grams[tuple(tokens[start:start + n])] += 1

        return n_grams

    def _next_word(self, text: list[str]) -> str:
        """Return the most frequent continuation of the last ``n`` tokens.

        Dividing each continuation count by the (shared) context count does not
        change which continuation is the largest, so raw counts are compared.

        Args:
            text: tokens generated so far.

        Returns:
            The best continuation, or ``'<eos>'`` when the context was never
            followed by anything in the fitted corpus.

        Raises:
            ValueError: if ``text`` holds fewer than ``n`` tokens.
        """
        if len(text) < self.n:
            raise ValueError(
                f'Too short sentence, has to be at least of length: {self.n}'
            )

        candidates = self._continuations.get(tuple(text[-self.n:]))
        if not candidates:
            return self._EOS

        # ``max`` keeps the first maximal key, i.e. the first one seen in the corpus.
        return max(candidates, key=candidates.get)

    def fit(self, corpus: list[str]) -> None:
        """Build the vocabulary and the n-gram counts from raw documents.

        Args:
            corpus: raw text documents.
        """
        self._build_vocab(corpus)
        self.context = self._build_n_grams(self.n, self._tokenized_text)
        self.phrases = self._build_n_grams(self.n + 1, self._tokenized_text)
        del self._tokenized_text

        # Index phrases by their context so prediction is a single lookup;
        # dict insertion order preserves first-seen order of continuations.
        self._continuations = {}
        for phrase, count in self.phrases.items():
            self._continuations.setdefault(phrase[:-1], {})[phrase[-1]] = count

    def generate(self, text: str, max_len: int = 20):
        """Extend ``text`` greedily, one token at a time.

        Generation stops after ``max_len`` appended tokens or as soon as
        ``'<eos>'`` is produced (the marker is kept in the returned string).

        Args:
            text: prompt; must tokenize to at least ``n`` tokens.
            max_len: maximum number of tokens to append.

        Returns:
            The prompt tokens and the generated tokens joined by single spaces.

        Raises:
            ValueError: if the prompt tokenizes to fewer than ``n`` tokens.
        """
        tokens = list(self.text_preprocessor.preprocess(text))
        if len(tokens) < self.n:
            raise ValueError(
                f'Too short sentence, has to be at least of length: {self.n}'
            )

        limit = len(tokens) + max_len
        while len(tokens) < limit:
            next_word = self._next_word(tokens)
            tokens.append(next_word)
            if next_word == self._EOS:
                break

        return ' '.join(tokens)

    def __repr__(self) -> str:
        """Show the context size and the preprocessor this instance uses."""
        return f"NGramsModel(n = {self.n}, text_preprocessor = {self.text_preprocessor!r})"
        
# Build a model that uses the last two tokens as context and fit it on `corpus`.
model = NGramsModel(n=2)
model.fit(corpus)

In [28]:
# Continue the prompt; the second argument (max_len) caps the number of appended tokens at 5.
model.generate('Machine learning is', 5)

'Machine learning is revolutionizing industries by enabling computers'