In [15]:
import re
import os 
import sys
import numpy as np 
import time
import random

In [16]:
lang  = 'sw' # 'cwe'
sw = open(os.path.join('data',lang+'-train.txt'), 'r').read().lower()
n_gram_value = 5

In [17]:
chars = ' !"\'(),-.0123456789:;?abcdefghijklmnopqrstuvwxyz'

In [18]:
character_list = re.findall(r"[%s]"%chars, sw)

##  The reference for this implementation is: 
1. https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0
2. https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0

In [19]:

def tokenize(text: str) -> list[str]:
    """
    :param text: Takes input sentence
    :return: tokenized sentence
    """
    for punct in string.punctuation:
        text = text.replace(punct, ' '+punct+' ')
    t = text.split()
    return t

def get_ngrams(n: int, tokens: list) -> list:
    """
    :param n: n-gram size
    :param tokens: tokenized sentence
    :return: list of ngrams
    ngrams of tuple form: ((previous wordS!), target word)
    """
    tokens = (n-1)*['<START>'] + tokens
    l = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i]) for i in range(n-1, len(tokens))]
    return l

In [74]:
class NgramModel(object):

    def __init__(self, n):
        self.n = n

        # dictionary that keeps list of candidate words given context
        self.context = {}

        # keeps track of how many times ngram has appeared in the text before
        self.ngram_counter = {}

    def update(self, sentence: str) -> None:
        """
        Updates Language Model
        :param sentence: input text
        """
        n = self.n
        ngrams = []
        
        ngram_choices = np.arange(1, n-1, 2, dtype=int).tolist()
        ngram_choices.append(n)
        for i in ngram_choices:
            ngrams += get_ngrams(i, sentence)
            
        for ngram in ngrams:
            if ngram in self.ngram_counter:
                self.ngram_counter[ngram] += 1.0
            else:
                self.ngram_counter[ngram] = 1.0

            prev_words, target_word = ngram
            if prev_words in self.context:
                self.context[prev_words].append(target_word)
            else:
                self.context[prev_words] = [target_word]
                
    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context
        :return: conditional probability
        """
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
            result = count_of_token / count_of_context

        except KeyError:
            result = 0.0
        return result
    
    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence
        :param context:
        :return:
        """
        r = random.random()
        map_to_probs = {}
        if context in self.context.keys(): 
            token_of_interest = self.context[context] # this can be all the tokens 
        else: 
            while context not in self.context.keys():
                context = context[-len(context)+1:]
            token_of_interest = self.context[context]
        temp_prob_list = []
        for token in token_of_interest:
            map_to_probs[token] = self.prob(context, token)

        map_to_probs = {k: v for k, v in sorted(map_to_probs.items(), key=lambda item: item[1])}
        token = np.random.choice(list(map_to_probs.keys()))
        prob = map_to_probs[token]
        return token, prob
    
    def check_all_gram(self, context, token, token_of_interest):
        if token not in token_of_interest: 
            context = context[-len(context)+1:]
            while context not in self.context.keys():
                context = context[-len(context)+1:]
                self.check_all_gram(context,token, token_of_interest)
        else: 
            return context, token
        
    def get_token_prob(self, context, token):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence
        :param context:
        :return:
        """
        r = random.random()
        map_to_probs = {}
        
        if context in self.context.keys(): 
            token_of_interest = self.context[context] # this can be all the tokens 
            self.check_all_gram(context,token, token_of_interest)
                                 
        else: 
            while context not in self.context.keys():
                context = context[-len(context)+1:]
            token_of_interest = self.context[context]
            context, token = self.check_all_gram(context,token, token_of_interest)
                    
        prob = self.prob(context, token)
        return token, prob
            
            

    def generate_text(self, token_count: int, history):
        """
        :param token_count: number of words to be produced
        :return: generated text
        """
        if type(history)==str:
            history = list(history)
        n = self.n
#         context_queue = (n - len(history) -1 ) * ['<START>'] + list(history)
        context_queue = (n -1 ) * ['<START>'] 

        result = []
        for _ in range(token_count):
            obj, prob = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                context_queue.pop(0)
                if obj == '.':
                    context_queue = (n - 1) * ['<START>']
                else:
                    context_queue.append(obj)

        return ''.join(result)

In [75]:
def create_ngram_model(n, text_list):
    m = NgramModel(n)
    m.update(text_list)
    return m

In [76]:
start = time.time()
lang  = 'sw' # 'sw'

sw = open(os.path.join('data',lang+'-train.txt'), 'r').read().lower()   
sw = list(sw)


from_scratch_model = create_ngram_model(n_gram_value, sw)

print (f'Language Model creating time: {time.time() - start}')
print()
start = time.time()
random.seed(5)

print(from_scratch_model.generate_text(100,'s'))

Language Model creating time: 275.0068016052246

kuisitu,mtayris evas ila lyellai.kuise.kuisakshot, mpanggilarancescau '.kuisem,a kwwli tz,je bink, t


In [None]:
from_scratch_model = create_ngram_model(n_gram_value, sw)


In [80]:
def evaluate_one(lang):
    testfile = open(os.path.join('data',lang+'-test.txt'), 'r')
    max_history = n_gram_value
#     history = (n_gram_value-1) * ['<START>']
    history = list(testfile.read(n_gram_value-1))
    loss_anything_goes = 0
    loss_from_scratch = 0
    count = 0
    states = None
    total_loss_list = []
    while count<n_gram_value:
        c = testfile.read(1)
        if not c:
            break
        count += 1 
        t, prob = from_scratch_model.get_token_prob(tuple(history),c)
        loss_from_scratch -= np.log2(prob)
        if len(history) == max_history:
            history.pop(0)
        history.append(c)
    return loss_from_scratch/count
     

In [81]:
evaluate_one(lang)  # The result for 5-gram

2.453924448850197

In [82]:
n_gram_value = 7 
start = time.time()

sw = open(os.path.join('data',lang+'-train.txt'), 'r').read().lower()   
sw = list(sw)


from_scratch_model = create_ngram_model(n_gram_value, sw)

print (f'Language Model creating time: {time.time() - start}')
print()
start = time.time()
random.seed(5)

print(from_scratch_model.generate_text(100,'s'))
evaluate_one(lang)

Language Model creating time: 474.08321380615234

kuishi" ivi keshi juu ynu nyinyiemu nimfate mwngne unalotuwezekufannanishtakiwa.kuishi!!! bonye.kuis


1.3444301558482612

In [83]:
n_gram_value = 9 
start = time.time()

sw = open(os.path.join('data',lang+'-train.txt'), 'r').read().lower()   
sw = list(sw)


from_scratch_model = create_ngram_model(n_gram_value, sw)

print (f'Language Model creating time: {time.time() - start}')
print()
start = time.time()
random.seed(5)

print(from_scratch_model.generate_text(100,'s'))
evaluate_one(lang)

Language Model creating time: 558.5557587146759

kuishi vema.kuishi vitandani,anaenda icu bila tanzana wenzie job ndugai: wewe kafarnaumu, unashangil


0.6103501391295509