In [1]:
from nltk.probability import (FreqDist, ConditionalFreqDist, ConditionalProbDist, MLEProbDist, SimpleGoodTuringProbDist)
from nltk.util import ngrams

In [2]:
def ml_estimator(freqdist):
    return MLEProbDist(freqdist)


def goodturing_estimator(freqdist):
    return SimpleGoodTuringProbDist(freqdist)


def read_file(file_path):
    words = []
    with open(file_path, "r", encoding='utf-8') as file:
        for line in file:
            line_tokens = line.lower().split(" ")
            words += line_tokens
    return words


# # use the generate method from the NLTK class ProbDistI to generate the next random word
def generate_text(ngram, n, length=100):
    # # add the padding start symbol to the init context
    context = tuple([ngram._start_symbol] * (n - 1))
    result = list(context)
    for i in range(length):
        if context in ngram._counter:
            prob_dist = ngram[context]
            # # predict the next word
            word = prob_dist.generate()
        else:
            word = ngram._end_symbol

        result.append(word)

        if word == ngram._end_symbol:
            break
        # # update the context    
        context = tuple(result[-(n - 1):])

    return ' '.join(result)

In [3]:
class BasicNgram(ConditionalProbDist):
    """
    Define and train an Ngram Model over the corpus represented by the list words. 
    Given an BasicNgram instance ngram and a (n-1)-gram context (i.e., a tuple of n-1 strings), 
    a call to ngram[context] returns a nltk.probability.ProbDistI object representing the Probability distribution P(.|context) over possible values for the next word. 
    Be aware that context has to be a tuple, even if context is a unigram (see example below)
    
    >>> corpus=['a','b','b','a']
    >>> bigram=BasicNgram(2,corpus)
    >>> bigram.contexts()
    [('<$>',), ('a',), ('b',)]
    >>> p_b=bigram[('b',)] #not bigram['b']!!!
    >>> p_b.prob('a')
    0.5
    >>> p_b.prob('b')
    0.5
    
    :param n: the dimension of the n-grams (i.e. the size of the context+1).
    :type n: int
    :param corpus: 
    :type corpus: list(Str)
    
    other parameters are optional and may be omitted. They define whether to add artificial symbols before or after the word list, 
    and whether to use another estimation methods than maximum likelihood.
    """

    def __init__(self, n, words, start_symbol="<$>", end_symbol="</$>", pad_left=True, pad_right=False,
                 estimator=ml_estimator):
        assert (n > 0)
        self._n = n
        self._words = words
        self._counter = ConditionalFreqDist()
        self._start_symbol = start_symbol
        self._end_symbol = end_symbol
        self._pad_left = pad_left
        self._pad_right = pad_right
        self._train()
        super().__init__(self._counter, estimator)

    def _train(self):
        _ngrams = self.generate_ngrams()
        for ngram in _ngrams:
            context = ngram[0:-1]
            outcome = ngram[-1]
            self._counter[context][outcome] += 1

    """
    returns an iterable over the ngrams of the word corpus
    """

    def generate_ngrams(self):
        return ngrams(self._words, self._n, pad_left=self._pad_left, pad_right=self._pad_right,
                      left_pad_symbol=self._start_symbol,
                      right_pad_symbol=self._end_symbol)

    """                                                                                                                                                                                                                                                                                                                                                               
    Return the list of contexts                                                                                                                                                                                                                                                                                                                                       
    """

    def contexts(self):
        return list(self.conditions())

In [4]:
file_path = './kingjamesbible_tokenized.txt'

corpus = read_file(file_path)

# # avoid corpus is null
print(corpus[:15])

['in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.\n', 'and', 'the', 'earth', 'was']


In [5]:
# # 2-gram
bigram = BasicNgram(2, corpus)

bigram.contexts()[:10]

[('<$>',),
 ('in',),
 ('the',),
 ('beginning',),
 ('god',),
 ('created',),
 ('heaven',),
 ('and',),
 ('earth',),
 ('.\n',)]

In [6]:
generate_text(bigram, 2, 100)

'<$> in an high priest upon one man .\n and so many prophets four , cease .\n and the ground : yet learned wisdom of babylon shall be told you .\n the heaven ; and thy wife and with him to the daughter of arimathaea , by esaias prophesy again .\n for for they shall lift up axes and the disciples were both .\n for john saw jesus : then judgment seat , who can not one that the grass faileth for stubble .\n now and took every good cheer up to another of the son is a servant ; as'

# 2-gram

## **1.Coherence**:
**The 2-gram model produces output which is quite disjointed and lacks grammatical coherence. The system selects each word based on only one preceding word, which often leads to nonsensical or fragmented sentences.**

## **2.Creativity**:
**The text feels fragmented, with phrases and partial ideas rather than complete sentences. There are abrupt topic shifts, like going from "high priest" to "prophets" and then to "judgment seat."**

## **3.Quality**:
**Very low quality with unreadable text.**

In [7]:
# # 3-gram
trigram = BasicNgram(3, corpus)

trigram.contexts()[:10]

[('<$>', '<$>'),
 ('<$>', 'in'),
 ('in', 'the'),
 ('the', 'beginning'),
 ('beginning', 'god'),
 ('god', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the')]

# 3-gram
## **1.Coherence**:
**The 3-gram model shows some improvement in coherence. Here are more syntactically valid sequences and recognizable phrases.**

## **2.Creativity**:
**The output feels less random than the 2-gram model, but still displays some unexpected transitions.**

## **3.Quality**:
**While some phrases make sense individually, the text as a whole lacks logical flow. This model is closer to producing human-readable sentences, although the generated text still feels strange.**

In [8]:
generate_text(trigram, 3, 100)

'<$> <$> in the midst of thee .\n a man , named gamaliel , and his return from me .\n and he overlaid the bars thereof .\n and when joab heard the voice of the gershonites , in the city shall they be visited .\n then said jesus unto the going down to gihon .\n and moses said unto them ;\n thou shalt save thy people israel didst thou set the sea ; and in judgment .\n neither shall it be marvellous in mine own body , ye rejoice with all that their power is given unto it , and on him'

In [9]:
# # 4-gram
four_gram = BasicNgram(4, corpus)

four_gram.contexts()[:10]

[('<$>', '<$>', '<$>'),
 ('<$>', '<$>', 'in'),
 ('<$>', 'in', 'the'),
 ('in', 'the', 'beginning'),
 ('the', 'beginning', 'god'),
 ('beginning', 'god', 'created'),
 ('god', 'created', 'the'),
 ('created', 'the', 'heaven'),
 ('the', 'heaven', 'and'),
 ('heaven', 'and', 'the')]

In [10]:
generate_text(four_gram, 4, 100)

'<$> <$> <$> in the beginning of barley harvest .\n and naomi said unto her , weep not : behold , i will break the staff of the bread of the increase of thy kine , and tied them to the border of arnon , but came not within the days appointed , and the sinite ,\n and the sockets thereof , and joined the foundations .\n be it known unto you , i have not proved them . and the king sent jehucal the son of jabesh in samaria , and unto all riches of the glory of the lord , and'

# 4-gram
## **1.Coherence**:
**The 4-gram model produces text that closely resembles human language in terms of sentence structure. Sequences like "and naomi said unto her, weep not" and "i will break the staff of the bread of the increase of thy kine" are almost plausible biblical phrases, albeit somewhat verbose.**

## **2.Creativity**:
**This model sacrifices some randomness, resulting in more readable text.**

## **3.Quality**:
**The quality of the generated text is the highest among these three n-gram model, with sentences that are more grammatically and semantically plausible.**