In [1]:
from nltk.probability import (FreqDist, ConditionalFreqDist, ConditionalProbDist, MLEProbDist, SimpleGoodTuringProbDist)
from nltk.util import ngrams

In [2]:
def ml_estimator(freqdist):
    return MLEProbDist(freqdist)

def goodturing_estimator(freqdist):
    return SimpleGoodTuringProbDist(freqdist)

def read_file(file_path):
    words = []
    with open(file_path, "r", encoding='utf-8') as file:
        for line in file:
            line_tokens = line.lower().split(" ")
            words += line_tokens
    return words

# # use the generate method from the NLTK class ProbDistI to generate the next random word
def generate_text(ngram, n, length=100):
    # # add the padding start symbol to the init context
    context = tuple([ngram._start_symbol] * (n - 1))
    result = list(context)
    for i in range(length):
        if context in ngram._counter:
            prob_dist = ngram[context]
            # # predict the next word
            word = prob_dist.generate()
        else:
            word = ngram._end_symbol
        
        result.append(word)
        
        if word == ngram._end_symbol:
            break
        # # update the context    
        context = tuple(result[-(n-1):])
        
    return ' '.join(result)

In [3]:
class BasicNgram(ConditionalProbDist):
    """
    Define and train an Ngram Model over the corpus represented by the list words. 
    Given an BasicNgram instance ngram and a (n-1)-gram context (i.e., a tuple of n-1 strings), 
    a call to ngram[context] returns a nltk.probability.ProbDistI object representing the Probability distribution P(.|context) over possible values for the next word. 
    Be aware that context has to be a tuple, even if context is a unigram (see example below)
    
    >>> corpus=['a','b','b','a']
    >>> bigram=BasicNgram(2,corpus)
    >>> bigram.contexts()
    [('<$>',), ('a',), ('b',)]
    >>> p_b=bigram[('b',)] #not bigram['b']!!!
    >>> p_b.prob('a')
    0.5
    >>> p_b.prob('b')
    0.5
    
    :param n: the dimension of the n-grams (i.e. the size of the context+1).
    :type n: int
    :param corpus: 
    :type corpus: list(Str)
    
    other parameters are optional and may be omitted. They define whether to add artificial symbols before or after the word list, 
    and whether to use another estimation methods than maximum likelihood.
    """

    def __init__(self, n, words, start_symbol="<$>", end_symbol="</$>", pad_left=True, pad_right=False,
                 estimator=ml_estimator):
        assert (n > 0)
        self._n = n
        self._words = words
        self._counter = ConditionalFreqDist()
        self._start_symbol = start_symbol
        self._end_symbol = end_symbol
        self._pad_left = pad_left
        self._pad_right = pad_right
        self._train()
        super().__init__(self._counter, estimator)

    def _train(self):
        _ngrams = self.generate_ngrams()
        for ngram in _ngrams:
            context = ngram[0:-1]
            outcome = ngram[-1]
            self._counter[context][outcome] += 1

    """
    returns an iterable over the ngrams of the word corpus
    """

    def generate_ngrams(self):
        return ngrams(self._words, self._n, pad_left=self._pad_left, pad_right=self._pad_right,
                      left_pad_symbol=self._start_symbol,
                      right_pad_symbol=self._end_symbol)

    """                                                                                                                                                                                                                                                                                                                                                               
    Return the list of contexts                                                                                                                                                                                                                                                                                                                                       
    """

    def contexts(self):
        return list(self.conditions())

In [4]:
file_path = './kingjamesbible_tokenized.txt'

corpus = read_file(file_path)

print(corpus[:15])

['in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.\n', 'and', 'the', 'earth', 'was']


In [5]:
# # 2-gram
bigram = BasicNgram(2, corpus)

bigram.contexts()[:10]

[('<$>',),
 ('in',),
 ('the',),
 ('beginning',),
 ('god',),
 ('created',),
 ('heaven',),
 ('and',),
 ('earth',),
 ('.\n',)]

In [6]:
generate_text(bigram, 2, 100)

'<$> in the people : the medes and escaped as ye weigh silver .\n thus saith the people that they have kept the fear , that she doted : and the people with sweet incense , and twenty thousand , which were put your lords : so .\n if these sayings were porters ; from the sapphire , and upon his holy garments , and they went in the renowned , lo , like wool and the wilderness , how long will stretch out .\n and your work of the thunderings , that is of his arms , when he hath'

In [7]:
# # 3-gram
trigram = BasicNgram(3, corpus)

trigram.contexts()[:10]

[('<$>', '<$>'),
 ('<$>', 'in'),
 ('in', 'the'),
 ('the', 'beginning'),
 ('beginning', 'god'),
 ('god', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the')]

In [8]:
generate_text(trigram, 3 ,100)

'<$> <$> in the waters which run among the manassites , from the evil that i shall smite all the city , and the seven heads and worshipped ,\n and the tortoise after his kind .\n but without a cause to inherit .\n o god , who was over the men of babylon .\n one board .\n and moses went up against you , and on that side , much more than others ? do that which i command thee this day unto these my two sons .\n and the word of the heathen .\n and moses said unto the wise more'

In [9]:
# # 4-gram
four_gram = BasicNgram(4, corpus)

four_gram.contexts()[:10]

[('<$>', '<$>', '<$>'),
 ('<$>', '<$>', 'in'),
 ('<$>', 'in', 'the'),
 ('in', 'the', 'beginning'),
 ('the', 'beginning', 'god'),
 ('beginning', 'god', 'created'),
 ('god', 'created', 'the'),
 ('created', 'the', 'heaven'),
 ('the', 'heaven', 'and'),
 ('heaven', 'and', 'the')]

In [10]:
generate_text(four_gram, 4, 100)

'<$> <$> <$> in the beginning of the world , and the amorites , in whose heart the lord opened the eyes of the lord god be witness against you this day .\n therefore the wild beasts of the field which the lord sware unto thy fathers , therefore he giveth you rest from all his works .\n verily i say unto you , saying , we have piped unto you , that my joy might remain in you , which drave them out from his presence , as in a day , that he that sprinkleth the water of gall : for'