In [1]:
import re
from typing import List

In [8]:
from abc import ABC, abstractmethod
from typing import List, Any

class LabPredictor(ABC):
    """ Abstract base class for the lab predictors.

    Concrete labs subclass this and provide both ``predict`` and ``train``;
    the class itself cannot be instantiated.
    """

    def __init__(self, model: Any = None) -> None:
        """ keep a reference to the (optional) model object

        Args:
            model (Any, optional): stored unchanged on ``self.model``. Defaults to None.
        """
        self.model = model

    @abstractmethod
    def train(self) -> None:
        """ the main training function. subclasses train the model here
        with the data chosen in each lab
        """
        raise NotImplementedError()

    @abstractmethod
    def predict(self, text: str) -> List[str]:
        """ the main predictor function. subclasses return the list of strings
        that will be visible in the frontend keyboard

        Args:
            text (str): the input text from the frontend keyboard
        """
        raise NotImplementedError()


In [9]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import TextCollection

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from nltk import *
from nltk.corpus.reader.util import *
import nltk.collocations

In [24]:
# pylint: disable=pointless-string-statement
""" Welcome to the first lab!"""

' Welcome to the first lab!'

In [26]:
class NgramModel():
    """ The main class for all n-gram models
    Here you will create your model (based on N)
    and complete the predict method to return the most likely words.

    Instance attributes (all set in ``__init__``):
        n_gram: order of the model (2=bigram, 3=trigram, ...)
        words_to_return: upper bound on the number of candidates ``predict`` returns
        model: the underlying model object; ``None`` until one is implemented
    """
    def __init__(self, n_gram=1) -> None:
        """ the init method should load/train your model
        Args:
            n_gram (int, optional): 2=bigram, 3=trigram, ... Defaults to 1.
        """
        print(f"Loading {n_gram}-gram model...")
        self.n_gram = n_gram
        self.words_to_return = 4  # how many words to show in the UI
        # Start without a model. The base class must not point at one of its
        # own subclasses: that name is not defined yet when this cell is run
        # on a fresh kernel, and a class object is not a trained model anyway.
        self.model = None  # TODO: implement the model using built-in NLTK methods
        # take a look at the nltk.collocations module
        # https://www.nltk.org/howto/collocations.html

    def _context(self, tokens: List[str]) -> List[str]:
        """ return the last n-1 tokens, i.e. the history an n-gram model conditions on
        Args:
            tokens (List[str]): preprocessed tokens
        Returns:
            List[str]: at most ``n_gram - 1`` trailing tokens; empty for a unigram model
        """
        width = self.n_gram - 1
        if width <= 0:
            # tokens[-0:] would be the WHOLE list, so a unigram needs an explicit guard
            return []
        return list(tokens[-width:])

    def predict(self, tokens: List[str]) -> List[str]:
        """ given a list of tokens, return the most likely next words
        Args:
            tokens (List[str]): preprocessed tokens from the LabPredictor
        Returns:
            List[str]: selected candidates for next-word prediction,
                at most ``words_to_return`` of them (currently always empty,
                see the TODOs below)
        """
        # we're only interested in the last n-1 words.
        # e.g. for a bigram model,
        # we're only interested in the last word to predict the next
        n_tokens = self._context(tokens)

        probabilities = []  # TODO: find the probabilities for the next word(s) given n_tokens

        # TODO: apply some filtering to only select the words
        # here you're free to select your filtering methods
        # a simple approach is to simply sort them by probability
        best_matches = []  # TODO: sort/filter to your liking

        # then return as many words as you've defined above
        return best_matches[:self.words_to_return]

In [35]:
class BigramModel(NgramModel):
    """ Bigram (n=2) model.

    ``corpustext`` is kept as given. The helper methods accept either one raw
    string or an iterable of sentences, where a sentence is a string or an
    iterable of word tokens (e.g. what an NLTK corpus reader's ``sents()`` yields).
    """
    def __init__(self, corpustext) -> None:
        """
        Args:
            corpustext: the text to build the model from (see class docstring)
        """
        super().__init__(n_gram=2)
        self.corpustext = corpustext

    def _sentences(self):
        """ view ``corpustext`` as a list of token lists, one list per sentence """
        if isinstance(self.corpustext, str):
            return [self.corpustext.split()]
        return [
            sentence.split() if isinstance(sentence, str) else list(sentence)
            for sentence in self.corpustext
        ]

    def collocations(self, min_freq=7, top_n=20):
        """ find the strongest bigram collocations in the corpus text
        Args:
            min_freq (int, optional): bigrams seen fewer times than this are ignored. Defaults to 7.
            top_n (int, optional): how many bigrams to return. Defaults to 20.
        Returns:
            list: up to ``top_n`` ``(word, word)`` tuples, ranked by PMI
        """
        # The finder works on one flat word sequence, so bigrams that straddle
        # a sentence boundary are counted as well.
        words = [word for sentence in self._sentences() for word in sentence]
        finder = BigramCollocationFinder.from_words(words)
        bigram_measures = nltk.collocations.BigramAssocMeasures()  # Measures unusual frequent bigram associations
        finder.apply_freq_filter(min_freq)  # Add to get the most frequent expressions
        return finder.nbest(bigram_measures.pmi, top_n)  # Top collocations

    def find_tfidf(self):
        """ print raw counts and TF-IDF scores of all bigrams, one row per sentence
        Returns:
            tuple: ``(features, scores)`` - the bigram feature names and the dense
                TF-IDF matrix whose columns follow ``features``
        """
        # the vectorizers expect one string per document; each sentence is a document here
        documents = [" ".join(sentence) for sentence in self._sentences()]

        count_vectorizer = CountVectorizer(ngram_range=(2, 2))
        X1 = count_vectorizer.fit_transform(documents)
        # NOTE(review): toarray() densifies the matrix, which can get large on a
        # full corpus - consider printing a slice instead
        print("\n\nX1 : \n", X1.toarray())

        # Applying TFIDF
        # You can still get n-grams here
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
        X2 = tfidf_vectorizer.fit_transform(documents)
        features = tfidf_vectorizer.get_feature_names_out()
        scores = X2.toarray()
        print("\n\nScores : \n", scores)
        return features, scores
        
    

In [28]:
class TrigramModel(NgramModel):
    """ Trigram (n=3) model. """
    # class-level default, shadowed per instance by __init__
    corpustext = None

    def __init__(self, corpus: StreamBackedCorpusView) -> None:
        """
        Args:
            corpus (StreamBackedCorpusView): the text to build the model from,
                kept on ``self.corpustext``
        """
        super().__init__(n_gram=3)
        # keep the corpus instead of dropping it, under the same attribute name BigramModel uses
        self.corpustext = corpus

In [34]:
class Lab1(LabPredictor):
    """ Lab 1 predictor: a trigram model with a bigram backoff model,
    both built from an NLTK Gutenberg text.
    """
    def __init__(self):
        super().__init__()
        self.corpora = nltk.corpus.gutenberg  # TODO: load a corpus from NLTK

        # Filled in by train(); declared here so predict() can tell an
        # untrained lab apart instead of failing on a missing attribute.
        self.backoff_model = None
        self.sents = None
        self.tokens = None

        # Define a strategy to select the first words (when there's no input)
        # TODO: this should not be a predefined list
        self.start_words = ["the", "a", "an", "this", "that", "these", "those"]

    @staticmethod
    def preprocess(text: Any) -> List[str]:
        """
        Preprocess the input text as you see fit, return a list of tokens.
        - should you consider parentheses, punctuation?
        - lowercase?
        - find inspiration from the course literature :-)

        Args:
            text: either a raw string (keyboard input) or an iterable of
                sentences, each sentence a token string or an iterable of tokens
        Returns:
            List[str]: lowercased alphabetic tokens longer than two characters
                that are not English stop words
        """
        # TODO: filters here
        stopwords = set(nltk.corpus.stopwords.words('english'))
        if isinstance(text, str):
            # Iterating a str yields single characters, which the length filter
            # below would all reject - split it into words first.
            sentences = [re.findall(r"[^\W\d_]+", text)]
        else:
            sentences = text
        filtered = []
        for sentence in sentences:
            if isinstance(sentence, str):
                sentence = [sentence]  # a bare token, not a list of tokens
            for word in sentence:
                lowered = word.lower()
                # compare the lowercased form: the stop-word list is lowercase
                if word.isalpha() and lowered not in stopwords and len(word) > 2:
                    filtered.append(lowered)
        return filtered  # Tokenized and filtered

    def predict(self, input_text):
        """ return next-word candidates for the given input
        Args:
            input_text: text from the frontend keyboard
        Returns:
            List[str]: the start words when there is no input, otherwise the
                candidates from the trigram model (or the bigram backoff model
                when there are too few tokens for a trigram context)
        Raises:
            RuntimeError: if there is input but train() has not been called yet
        """
        if not bool(input_text):  # if there's no input...
            print("No input, using start words")
            return self.start_words

        if self.model is None or self.backoff_model is None:
            raise RuntimeError("Lab1 models are not trained yet; call train() first")

        tokens = self.preprocess(input_text)

        # make use of the backoff model (e.g. bigram):
        # an n-gram model needs n-1 tokens of context, counted in tokens, not characters
        too_few = len(tokens) < self.model.n_gram - 1

        # select the correct model based on the condition
        model = self.backoff_model if too_few else self.model
        # alternatively, you can switch between the tri- and bigram models
        # based on the output probabilities. This is 100% optional!
        return model.predict(tokens)

    def train(self, fileid: str = "chesterton-brown.txt") -> None:
        """ train or load the models
        add parameters as you like, such as the corpora you selected.

        Args:
            fileid (str, optional): file of ``self.corpora`` to train on.
                Defaults to "chesterton-brown.txt".
        """
        print("Training models...")
        self.sents = self.corpora.sents(fileid)
        self.model = TrigramModel(self.sents)  # TODO: add needed parameters
        self.backoff_model = BigramModel(self.sents)  # TODO: add needed parameters
        # tokenize the corpus; predict() would return predictions, not tokens
        self.tokens = self.preprocess(self.sents)