In [None]:
from nltk.probability import (FreqDist, ConditionalFreqDist, ConditionalProbDist, MLEProbDist, SimpleGoodTuringProbDist)
from nltk.util import ngrams

In [None]:
def ml_estimator(freqdist):
    """Build a maximum-likelihood probability distribution from ``freqdist``.

    :param freqdist: frequency distribution holding the observed counts.
    :return: an ``MLEProbDist`` wrapping ``freqdist``.
    """
    mle_distribution = MLEProbDist(freqdist)
    return mle_distribution

def goodturing_estimator(freqdist):
    """Build a Simple Good-Turing probability distribution from ``freqdist``.

    :param freqdist: frequency distribution holding the observed counts.
    :return: a ``SimpleGoodTuringProbDist`` wrapping ``freqdist``.
    """
    smoothed_distribution = SimpleGoodTuringProbDist(freqdist)
    return smoothed_distribution

def read_file(file_path):
    """Read a whitespace-tokenized text file into a flat list of lowercase tokens.

    Each line is lower-cased and split on runs of whitespace, so tokens never
    carry a trailing newline, and repeated spaces or blank lines do not produce
    empty-string tokens.

    :param file_path: path of the UTF-8 encoded text file to read.
    :type file_path: str
    :return: all tokens of the file, in reading order.
    :rtype: list(str)
    """
    words = []
    with open(file_path, "r", encoding='utf-8') as file:
        for line in file:
            # split() without a separator drops the line terminator and
            # collapses consecutive blanks; split(" ") would keep the newline
            # glued to the last word and emit empty strings.
            words.extend(line.lower().split())
    return words

# # use the generate method from the NLTK class ProbDistI to generate the next random word
def generate_text(ngram, n, length=100):
    """Sample a word sequence from an n-gram model, one word at a time.

    Generation starts from a context made of ``n - 1`` start symbols and
    repeatedly draws the next word from ``ngram[context]``. It stops after
    ``length`` draws, or earlier once the end symbol is produced. The end
    symbol is also emitted when the current context is not among the contexts
    counted by the model.

    :param ngram: model exposing ``_start_symbol``, ``_end_symbol``,
        ``_counter`` (container of known contexts) and ``ngram[context]``
        returning an object with a ``generate()`` method.
    :param n: order of the model (context size + 1).
    :type n: int
    :param length: maximum number of words to draw.
    :type length: int
    :return: the start padding followed by the drawn words, joined by spaces.
    :rtype: str
    """
    # Seed the context with the padding start symbol.
    context = tuple([ngram._start_symbol] * (n - 1))
    result = list(context)
    for _ in range(length):
        if context in ngram._counter:
            # Draw the next word from P(. | context).
            word = ngram[context].generate()
        else:
            word = ngram._end_symbol

        result.append(word)

        if word == ngram._end_symbol:
            break
        # Slide the context window. A unigram model has an empty context and
        # must be special-cased: result[-0:] is the whole list, not an empty
        # slice, which would make every later context lookup fail.
        context = tuple(result[-(n - 1):]) if n > 1 else ()

    return ' '.join(result)

In [None]:
class BasicNgram(ConditionalProbDist):
    """
    N-gram language model trained over a corpus given as a list of words.

    For a BasicNgram instance ngram and a (n-1)-gram context (a tuple of n-1 strings),
    ngram[context] yields a nltk.probability.ProbDistI object describing the
    probability distribution P(.|context) over the possible next words.
    The context must always be a tuple, even when it holds a single word (see the example).

    >>> corpus=['a','b','b','a']
    >>> bigram=BasicNgram(2,corpus)
    >>> bigram.contexts()
    [('<$>',), ('a',), ('b',)]
    >>> p_b=bigram[('b',)] #not bigram['b']!!!
    >>> p_b.prob('a')
    0.5
    >>> p_b.prob('b')
    0.5

    :param n: the dimension of the n-grams (i.e. the size of the context+1).
    :type n: int
    :param words: the corpus, as a flat list of tokens.
    :type words: list(str)

    The remaining parameters are optional. They select the artificial symbols
    and whether they are added before (pad_left) or after (pad_right) the word
    list, and which estimator is used instead of maximum likelihood.
    """

    def __init__(self, n, words, start_symbol="<$>", end_symbol="</$>", pad_left=True, pad_right=False,
                 estimator=ml_estimator):
        assert n > 0
        self._n, self._words = n, words
        self._start_symbol, self._end_symbol = start_symbol, end_symbol
        self._pad_left, self._pad_right = pad_left, pad_right
        self._counter = ConditionalFreqDist()
        # The counts have to be filled in before they are handed to ConditionalProbDist.
        self._train()
        super().__init__(self._counter, estimator)

    def _train(self):
        """Count each n-gram's last word under its leading (n-1)-word context."""
        for gram in self.generate_ngrams():
            history, next_word = gram[:-1], gram[-1]
            self._counter[history][next_word] += 1

    def generate_ngrams(self):
        """Return an iterable over the n-grams of the word corpus."""
        padding = {
            "pad_left": self._pad_left,
            "pad_right": self._pad_right,
            "left_pad_symbol": self._start_symbol,
            "right_pad_symbol": self._end_symbol,
        }
        return ngrams(self._words, self._n, **padding)

    def contexts(self):
        """Return the list of contexts."""
        return [*self.conditions()]

In [None]:
# Path of the corpus file, given relative to the current directory.
file_path = './kingjamesbible_tokenized.txt'

# Load the whole file as one flat list of lower-cased tokens.
corpus = read_file(file_path)

# Print the first 15 tokens as a quick sanity check.
print(corpus[:15])

In [None]:
# 2-gram model: each context is a 1-tuple holding the previous word.
bigram = BasicNgram(2, corpus)

# Display only the first ten contexts.
bigram.contexts()[:10]

In [None]:
# Sample at most 100 words from the bigram model; n=2 is the model's order.
generate_text(bigram, 2, 100)

In [None]:
# 3-gram model: each context is a tuple of the two previous words.
trigram = BasicNgram(3, corpus)

# Displays every context (no slicing, unlike the bigram cell).
trigram.contexts()

In [None]:
# Sample at most 100 words from the trigram model; n=3 is the model's order.
generate_text(trigram, 3 ,100)

In [None]:
# 4-gram model: each context is a tuple of the three previous words.
four_gram = BasicNgram(4, corpus)

# Displays every context (no slicing, unlike the bigram cell).
four_gram.contexts()

In [None]:
# Sample at most 100 words from the 4-gram model; n=4 is the model's order.
generate_text(four_gram, 4, 100)