In [54]:
from nltk.probability import (FreqDist, ConditionalFreqDist, ConditionalProbDist, MLEProbDist, SimpleGoodTuringProbDist)
from nltk.util import ngrams

In [55]:
def ml_estimator(freqdist):
    return MLEProbDist(freqdist)


def goodturing_estimator(freqdist):
    return SimpleGoodTuringProbDist(freqdist)

def read_file(file_path):
    file_path = file_path
    words = []
    with open(file_path, "r", encoding='utf-8') as file:
        for line in file:
            line_tokens = line.lower().split(" ")
            words += line_tokens
    return words

def generate_text(ngram, n, length=100):
    context = tuple([ngram._start_symbol] * (n - 1))
    result = list(context)
    for i in range(length):
        if context in ngram._counter:
            prob_dist = ngram[context]
            word = prob_dist.generate()
        else:
            word = ngram._end_symbol
            
        result.append(word)
        
        if word == ngram._end_symbol:
            break
        context = tuple(result[-(n-1):])
    return ' '.join(result)

In [56]:
class BasicNgram(ConditionalProbDist):
    """
    Define and train an Ngram Model over the corpus represented by the list words. 
    Given an BasicNgram instance ngram and a (n-1)-gram context (i.e., a tuple of n-1 strings), 
    a call to ngram[context] returns a nltk.probability.ProbDistI object representing the Probability distribution P(.|context) over possible values for the next word. 
    Be aware that context has to be a tuple, even if context is a unigram (see example below)
    
    >>> corpus=['a','b','b','a']
    >>> bigram=BasicNgram(2,corpus)
    >>> bigram.contexts()
    [('<$>',), ('a',), ('b',)]
    >>> p_b=bigram[('b',)] #not bigram['b']!!!
    >>> p_b.prob('a')
    0.5
    >>> p_b.prob('b')
    0.5
    
    :param n: the dimension of the n-grams (i.e. the size of the context+1).
    :type n: int
    :param corpus: 
    :type corpus: list(Str)
    
    other parameters are optional and may be omitted. They define whether to add artificial symbols before or after the word list, 
    and whether to use another estimation methods than maximum likelihood.
    """

    def __init__(self, n, words, start_symbol="<$>", end_symbol="</$>", pad_left=True, pad_right=False,
                 estimator=ml_estimator):
        assert (n > 0)
        self._n = n
        self._words = words
        self._counter = ConditionalFreqDist()
        self._start_symbol = start_symbol
        self._end_symbol = end_symbol
        self._pad_left = pad_left
        self._pad_right = pad_right
        self._train()
        super().__init__(self._counter, estimator)

    def _train(self):
        _ngrams = self.generate_ngrams()
        for ngram in _ngrams:
            context = ngram[0:-1]
            outcome = ngram[-1]
            self._counter[context][outcome] += 1

    """
    returns an iterable over the ngrams of the word corpus
    """

    def generate_ngrams(self):
        return ngrams(self._words, self._n, pad_left=self._pad_left, pad_right=self._pad_right,
                      left_pad_symbol=self._start_symbol,
                      right_pad_symbol=self._end_symbol)

    """                                                                                                                                                                                                                                                                                                                                                               
    Return the list of contexts                                                                                                                                                                                                                                                                                                                                       
    """

    def contexts(self):
        return list(self.conditions())

In [57]:
file_path = './kingjamesbible_tokenized.txt'

corpus = read_file(file_path)

print(corpus[:15])

['in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.\n', 'and', 'the', 'earth', 'was']


In [58]:
bigram = BasicNgram(2, corpus)

bigram.contexts()

[('<$>',),
 ('in',),
 ('the',),
 ('beginning',),
 ('god',),
 ('created',),
 ('heaven',),
 ('and',),
 ('earth',),
 ('.\n',),
 ('was',),
 ('without',),
 ('form',),
 (',',),
 ('void',),
 (';',),
 ('darkness',),
 ('upon',),
 ('face',),
 ('of',),
 ('deep',),
 ('.',),
 ('spirit',),
 ('moved',),
 ('waters',),
 ('said',),
 ('let',),
 ('there',),
 ('be',),
 ('light',),
 (':',),
 ('saw',),
 ('that',),
 ('it',),
 ('good',),
 ('divided',),
 ('from',),
 ('called',),
 ('day',),
 ('he',),
 ('night',),
 ('evening',),
 ('morning',),
 ('were',),
 ('first',),
 ('a',),
 ('firmament',),
 ('midst',),
 ('divide',),
 ('made',),
 ('which',),
 ('under',),
 ('above',),
 ('so',),
 ('second',),
 ('gathered',),
 ('together',),
 ('unto',),
 ('one',),
 ('place',),
 ('dry',),
 ('land',),
 ('appear',),
 ('gathering',),
 ('seas',),
 ('bring',),
 ('forth',),
 ('grass',),
 ('herb',),
 ('yielding',),
 ('seed',),
 ('fruit',),
 ('tree',),
 ('after',),
 ('his',),
 ('kind',),
 ('whose',),
 ('is',),
 ('itself',),
 ('brought',),

In [59]:
generate_text(bigram, 2, 100)

101


'<$> in spirit .\n brethren pitched in believing , why leap , with ten shekels , he had gathered together , and be astonished .\n but one of israel , nethaneel the lord of the deaf , and from jerusalem one of josiah , and he hath sent messengers again , i will punish the lord for a familiar spirits more .\n pray thee glad when we are not eaten the lord ; neither shall be changed .\n him with my bowels : they in the linen clothes also after his house of lebanon under ashdothpisgah , or , that thou'

In [60]:
trigram = BasicNgram(3, corpus)

trigram.contexts()

[('<$>', '<$>'),
 ('<$>', 'in'),
 ('in', 'the'),
 ('the', 'beginning'),
 ('beginning', 'god'),
 ('god', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the'),
 ('the', 'earth'),
 ('earth', '.\n'),
 ('.\n', 'and'),
 ('earth', 'was'),
 ('was', 'without'),
 ('without', 'form'),
 ('form', ','),
 (',', 'and'),
 ('and', 'void'),
 ('void', ';'),
 (';', 'and'),
 ('and', 'darkness'),
 ('darkness', 'was'),
 ('was', 'upon'),
 ('upon', 'the'),
 ('the', 'face'),
 ('face', 'of'),
 ('of', 'the'),
 ('the', 'deep'),
 ('deep', '.'),
 ('.', 'and'),
 ('the', 'spirit'),
 ('spirit', 'of'),
 ('of', 'god'),
 ('god', 'moved'),
 ('moved', 'upon'),
 ('the', 'waters'),
 ('waters', '.\n'),
 ('and', 'god'),
 ('god', 'said'),
 ('said', ','),
 (',', 'let'),
 ('let', 'there'),
 ('there', 'be'),
 ('be', 'light'),
 ('light', ':'),
 (':', 'and'),
 ('and', 'there'),
 ('there', 'was'),
 ('was', 'light'),
 ('light', '.\n'),
 ('god', 'saw'),
 ('saw', 'the'),
 ('the', 'light'),
 ('light', ',

In [61]:
generate_text(trigram, 3 ,100)

102


'<$> <$> in the heavens shall give her him to death , written and engraven in stones , and of treasures by a whirlwind , that men pray every where to lay upon him , concerning whom the king an answer unto rehum the chancellor and shimshai the scribe told the words of god ; and the son of shelemiah , shemariah ,\n shallum , amariah , and girded him with the king .\n now the levites , which have not charity , and said unto him , why are ye troubled ? and why art thou , o great hailstones ,'

In [62]:
four_gram = BasicNgram(4, corpus)

four_gram.contexts()

[('<$>', '<$>', '<$>'),
 ('<$>', '<$>', 'in'),
 ('<$>', 'in', 'the'),
 ('in', 'the', 'beginning'),
 ('the', 'beginning', 'god'),
 ('beginning', 'god', 'created'),
 ('god', 'created', 'the'),
 ('created', 'the', 'heaven'),
 ('the', 'heaven', 'and'),
 ('heaven', 'and', 'the'),
 ('and', 'the', 'earth'),
 ('the', 'earth', '.\n'),
 ('earth', '.\n', 'and'),
 ('.\n', 'and', 'the'),
 ('the', 'earth', 'was'),
 ('earth', 'was', 'without'),
 ('was', 'without', 'form'),
 ('without', 'form', ','),
 ('form', ',', 'and'),
 (',', 'and', 'void'),
 ('and', 'void', ';'),
 ('void', ';', 'and'),
 (';', 'and', 'darkness'),
 ('and', 'darkness', 'was'),
 ('darkness', 'was', 'upon'),
 ('was', 'upon', 'the'),
 ('upon', 'the', 'face'),
 ('the', 'face', 'of'),
 ('face', 'of', 'the'),
 ('of', 'the', 'deep'),
 ('the', 'deep', '.'),
 ('deep', '.', 'and'),
 ('.', 'and', 'the'),
 ('and', 'the', 'spirit'),
 ('the', 'spirit', 'of'),
 ('spirit', 'of', 'god'),
 ('of', 'god', 'moved'),
 ('god', 'moved', 'upon'),
 ('moved',

In [63]:
generate_text(four_gram, 4, 100)

103


'<$> <$> <$> in the beginning of months : it shall not come nigh the vessels of the altar with thy finger , and put them in your vessels , and the word of the lord ,\n and to them that are turned back , and saw a woman in the valley of elah , even kenaz .\n and the priest shall look on him , that he bless himself in the field .\n for the day .\n he hath fenced up my way that i can not pass ;\n turn from him , and his anger was kindled against jacob , and'