In [21]:
import re
import os 
import sys
import numpy as np 
import time
import random

In [22]:
lang = 'cwe'  # 'sw'
# Read the whole training corpus and lower-case it. The context manager closes
# the file handle as soon as the text has been read.
with open(os.path.join('data', lang + '-train.txt'), 'r') as corpus_file:
    cwe = corpus_file.read().lower()

In [23]:
# Alphabet of characters kept from the corpus: space, basic punctuation,
# digits and lowercase a-z. It is interpolated into a regex character class below.
chars = ' !"\'(),-.0123456789:;?abcdefghijklmnopqrstuvwxyz'

In [24]:
# Keep only the characters listed in `chars`, in corpus order.
# re.escape makes every character literal inside the class; without it the
# ",-." run is parsed as a range and only works because those three
# characters happen to be adjacent in ASCII.
character_list = re.findall("[%s]" % re.escape(chars), cwe)

## Reference
[Text generation using n-gram model (Towards Data Science)](https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0)

In [25]:

def tokenize(text: str) -> list[str]:
    """
    Split a sentence into whitespace-separated tokens, with every ASCII
    punctuation character emitted as a token of its own.

    :param text: Takes input sentence
    :return: tokenized sentence
    """
    # Imported here because `string` is not among the notebook's top-level
    # imports; without it the function raises NameError.
    import string

    # Surround each punctuation character with spaces in a single pass so that
    # split() separates it from neighbouring word characters.
    padding_table = {ord(punct): ' ' + punct + ' ' for punct in string.punctuation}
    return text.translate(padding_table).split()

def get_ngrams(n: int, tokens: list) -> list:
    """
    Build all n-grams of a token sequence, left-padded with '<START>' markers
    so that the first real token also gets a full-length context.

    :param n: n-gram size (must be >= 1)
    :param tokens: tokenized sentence; any iterable of tokens is accepted, so a
        plain string is treated as a sequence of characters
    :return: list of ngrams
    ngrams of tuple form: ((previous wordS!), target word)
    :raises ValueError: if n is smaller than 1
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # list() copies the input (the caller's sequence is never mutated) and lets
    # strings/tuples be concatenated with the padding list.
    padded = (n - 1) * ['<START>'] + list(tokens)

    # For every real token at index i, the context is the n-1 tokens before it.
    return [(tuple(padded[i - n + 1:i]), padded[i]) for i in range(n - 1, len(padded))]

In [26]:
class NgramModel(object):
    """
    Count-based n-gram language model.

    The model records, for every context (a tuple of the n-1 preceding tokens),
    which tokens followed it and how often, and samples new tokens from those
    relative frequencies.
    """

    def __init__(self, n):
        """
        :param n: n-gram size; each token is conditioned on the n-1 tokens before it
        """
        self.n = n

        # Maps a context tuple to the list of every token observed right after
        # it. Repeats are kept, so the list length equals the number of times
        # the context was seen.
        self.context = {}

        # Maps (context, token) to the number of times that n-gram has appeared
        # in the text so far (stored as a float).
        self.ngram_counter = {}

    def update(self, sentence) -> None:
        """
        Updates Language Model
        :param sentence: input text as a sequence of tokens (e.g. a list of
            characters); a plain string is treated as a sequence of characters
        """
        # get_ngrams concatenates a padding list with its input, so make sure
        # it receives a list even when a string or tuple is passed in.
        tokens = sentence if isinstance(sentence, list) else list(sentence)
        for ngram in get_ngrams(self.n, tokens):
            self.ngram_counter[ngram] = self.ngram_counter.get(ngram, 0.0) + 1.0

            prev_words, target_word = ngram
            self.context.setdefault(prev_words, []).append(target_word)

    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context
        :param context: tuple of preceding tokens
        :param token: candidate next token
        :return: conditional probability (0.0 when the context or the
            context/token pair was never observed)
        """
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
        except KeyError:
            return 0.0
        return count_of_token / count_of_context

    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence

        One uniform random number is drawn and compared against the cumulative
        probabilities of the candidate tokens, visited in sorted order.

        :param context: tuple of preceding tokens
        :return: the sampled token
        :raises KeyError: if the context was never observed
        """
        r = random.random()
        # Distinct follower tokens in sorted order, so a given seed always
        # walks the candidates the same way.
        candidates = sorted(set(self.context[context]))

        summ = 0
        for token in candidates:
            summ += self.prob(context, token)
            if summ > r:
                return token

        # Floating-point rounding can leave the accumulated total marginally
        # below 1.0 (ten probabilities of 0.1 add up to 0.9999999999999999), so
        # a draw very close to 1.0 would otherwise fall through and return
        # None. The last candidate is the one such a draw belongs to.
        return candidates[-1]

    def generate_text(self, token_count: int, history):
        """
        :param token_count: number of words to be produced
        :param history: accepted for interface compatibility but not used;
            generation always begins from the all-'<START>' context
        :return: generated text
        :raises KeyError: if the start context itself is unknown (e.g. the
            model has not been updated with any text)
        """
        n = self.n
        start_context = (n - 1) * ['<START>']
        context_queue = list(start_context)

        result = []
        for _ in range(token_count):
            if tuple(context_queue) not in self.context:
                # Dead end: this context only occurs where no token follows it
                # (the very end of the training text). Restart from the start
                # context, exactly as is done after a '.'.
                context_queue = list(start_context)
            obj = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                if obj == '.':
                    # A full stop ends the sentence; begin the next one afresh.
                    context_queue = list(start_context)
                else:
                    # Slide the window: drop the oldest token, add the new one.
                    context_queue.pop(0)
                    context_queue.append(obj)
        return ''.join(result)

In [27]:
# Alphabet of characters kept from the corpus (same value as the earlier
# definition of `chars` in this notebook).
chars = ' !"\'(),-.0123456789:;?abcdefghijklmnopqrstuvwxyz'

In [28]:
def create_ngram_model(n, text_list):
    """
    Build an NgramModel of size n and train it on one token sequence.

    :param n: n-gram size passed to NgramModel
    :param text_list: token sequence handed to NgramModel.update
    :return: the trained model
    """
    model = NgramModel(n)
    model.update(text_list)
    return model

In [29]:
if __name__ == "__main__":
    start = time.time()
    lang = 'cwe'  # 'sw'
    # Read the corpus inside a context manager so the file handle is closed.
    with open(os.path.join('data', lang + '-train.txt'), 'r') as corpus_file:
        cwe = corpus_file.read().lower()
    chars = ' !"\'(),-.0123456789:;?abcdefghijklmnopqrstuvwxyz'
    # re.escape keeps every character literal inside the class (notably '-').
    character_list = re.findall("[%s]" % re.escape(chars), cwe)

    # Character-level model: each character is conditioned on the previous 14.
    m = create_ngram_model(15, character_list)

    print (f'Language Model creating time: {time.time() - start}')
    print()
    # Fixed seed so the generated sample is reproducible.
    random.seed(5)

    # NOTE: generate_text does not use its second argument; generation always
    # begins from the all-'<START>' context.
    print(m.generate_text(100, 'a'))

Language Model creating time: 1.3443078994750977

chikale vinogile fana viya wanhu wose wowakalile womulavila nhosa chiya waja wowosang'hanila mbuli z


In [None]:
if __name__ == "__main__":
    start = time.time()
    lang = 'cwe'  # 'sw'
    # Read the corpus inside a context manager so the file handle is closed.
    with open(os.path.join('data', lang + '-train.txt'), 'r') as corpus_file:
        cwe = corpus_file.read().lower()
    chars = ' !"\'(),-.0123456789:;?abcdefghijklmnopqrstuvwxyz'
    # re.escape keeps every character literal inside the class (notably '-').
    character_list = re.findall("[%s]" % re.escape(chars), cwe)

    # Character-level model: each character is conditioned on the previous 4.
    m = create_ngram_model(5, character_list)

    print (f'Language Model creating time: {time.time() - start}')
    print()
    # Fixed seed so the generated sample is reproducible.
    random.seed(5)

    # NOTE: generate_text does not use its second argument; generation always
    # begins from the all-'<START>' context.
    print(m.generate_text(100,'s'))

In [75]:
# Sample the first token from the start-of-text context. The context length is
# derived from the model (n - 1) rather than hard-coded, so this works for
# whichever model `m` currently refers to.
print(m.random_token(tuple((m.n - 1) * ['<START>'])), 'v')

{'c': 1.0}
1.0
1.0
c v
