In [1]:
!pip install ipynb



In [27]:
from nltk.lm.preprocessing import pad_both_ends
import random
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize
import collections 
from ipynb.fs.full.corpus import *
import copy
import itertools

In [47]:
def get_ngrams(tokens, n):
    """Return all n-grams of ``tokens`` after padding both ends.

    The token sequence is padded with ``n - 1`` copies of the string
    ``'None'`` on each side, so every real token appears in every position
    of some n-gram.  The padding marker is inserted directly; input tokens
    are never rewritten, so a token that happens to be ``"<s>"`` or
    ``"</s>"`` is kept as an ordinary token.

    Args:
        tokens: iterable of tokens (typically a list of strings).
        n: size of the n-grams.

    Returns:
        A list of ``n``-tuples in sequence order.  The list is empty when
        ``n < 1``, or when ``n == 1`` and ``tokens`` is empty.
    """
    # There is no meaningful window of size zero or less.
    if n < 1:
        return []
    # The padding marker is the *string* 'None', not the None object.
    pad = ['None'] * (n - 1)
    padded = pad + list(tokens) + pad
    # Slide a window of width n over the padded sequence.
    return [tuple(padded[start:start + n])
            for start in range(len(padded) - n + 1)]
#get_ngrams(lines)

In [48]:
def normalize(word_counts):
    """Convert a mapping of counts into a probability distribution.

    Args:
        word_counts: dict mapping each word to its (numeric) count.

    Returns:
        A new dict with the same keys, in the same order, where each value
        is the word's count divided by the sum of all counts.  An empty
        input yields an empty dict.
    """
    # Denominator shared by every entry.
    total = sum(word_counts.values())
    # Each word's share of the total count.
    return {word: count / total for word, count in word_counts.items()}
normalize({'the':2, 'cat':1, 'dog':1, 'runs':2})

{'the': 0.3333333333333333,
 'cat': 0.16666666666666666,
 'dog': 0.16666666666666666,
 'runs': 0.3333333333333333}

In [39]:
def sample(distribution):
    """Draw one key from ``distribution`` at random, weighted by its value.

    Args:
        distribution: dict mapping each candidate to its weight/probability.

    Returns:
        A single key of ``distribution``; keys with larger values are
        returned proportionally more often.
    """
    candidates = []
    weights = []
    # Build parallel lists so each candidate lines up with its weight.
    for word, weight in distribution.items():
        candidates.append(word)
        weights.append(weight)
    # random.choices returns a list of k draws; unpack the single one.
    (picked,) = random.choices(candidates, weights=weights, k=1)
    return picked

In [40]:
class LanguageModel:
    """Count-based n-gram language model.

    Training records, for every ``(n - 1)``-token context, how often each
    token follows it.  ``p_next`` and ``generate`` work from the follower
    counts summed over *all* contexts, i.e. a context-independent
    distribution over tokens.

    NOTE(review): ``p_next`` does not condition on a context; it takes
    training sequences.  Confirm this is the intended contract.
    """

    def __init__(self, n):
        """Create an untrained model over n-grams of size ``n``."""
        self.n = n
        # Maps each context tuple (first n-1 items of an n-gram) to a
        # {following_token: count} dict.
        self.counts = {}
        # Distinct tokens taken from the training sequences.
        self.vocabulary = set()
        # Every n-gram seen so far, in training order.
        self.all_ngrams = []
        # Argument most recently passed to p_next().
        self.tokens = []
        # Unused by the model; retained so existing attribute access works.
        self.token_result = []
        self.dict = {}
        self.normalized_results = {}

    def train(self, token_sequences):
        """Add the n-grams of ``token_sequences`` to the model.

        Training is cumulative: calling ``train`` again adds to the counts
        already stored.

        Args:
            token_sequences: iterable of token sequences (e.g. a list of
                lists of strings).

        Returns:
            ``self.counts``: dict mapping each context tuple to a
            ``{following_token: count}`` dict.
        """
        for sequence in token_sequences:
            # Materialise once so one-shot iterables can be read twice.
            sequence = list(sequence)
            self.vocabulary.update(sequence)
            for ngram in get_ngrams(sequence, self.n):
                self.all_ngrams.append(ngram)
                context, word = ngram[:-1], ngram[-1]
                # Each context owns its own follower dict, so counts can
                # never leak from one context into another.
                followers = self.counts.setdefault(context, {})
                followers[word] = followers.get(word, 0) + 1
        return self.counts

    def _next_token_distribution(self):
        """Return normalized follower counts summed over every context."""
        totals = {}
        for followers in self.counts.values():
            for word, count in followers.items():
                # Sum (not overwrite) when a word follows several contexts.
                totals[word] = totals.get(word, 0) + count
        return normalize(totals)

    def p_next(self, tokens):
        """Train on ``tokens`` and return the resulting token distribution.

        Args:
            tokens: token sequences, in the same form ``train`` accepts.
                They are stored on ``self.tokens`` and added to the model,
                so calling this repeatedly keeps increasing the counts.

        Returns:
            dict mapping each token (including the ``'None'`` padding
            marker) to its probability, computed from the follower counts
            summed over all contexts.  Empty if nothing has been trained.
        """
        self.tokens = tokens
        self.train(tokens)
        return self._next_token_distribution()

    def generate(self):
        """Sample a token list from the model's current distribution.

        Tokens are drawn independently from the context-independent
        distribution.  Sampling stops after the ``'None'`` marker is drawn
        (it is included in the result) or after as many draws as there are
        distinct tokens in the distribution, whichever comes first.  The
        model is not retrained, so generating does not alter the counts.

        Returns:
            list of sampled tokens; empty if the model is untrained.
        """
        distribution = self._next_token_distribution()
        generated = []
        for _ in range(len(distribution)):
            word = sample(distribution)
            generated.append(word)
            # 'None' is the padding marker and acts as end-of-sequence.
            if word == 'None':
                break
        return generated

In [50]:
#lm = LanguageModel(3)
#lm.train([[ ' the ' , ' cat ' , ' runs ' ],[' the ' , ' dog ' , ' runs ']])
#lm.p_next(([[ ' the ' , ' cat ' , ' runs ' ],[' the ' , ' dog ' , ' runs ']]))
#lm.generate()