In [1]:
from IPython.display import clear_output

In [2]:
# Uncomment the next line to install the third-party packages this notebook imports (nltk, tqdm).
# %pip install nltk tqdm

# Clear this cell's output area (presumably to hide the installer log when the line above is enabled).
clear_output()

# Content:

In this demo, we will build a probabilistic N-gram language model.

We will use the NLTK library to download the dataset and handle our text.

In [4]:
import random

import nltk
from nltk.util import ngrams as build_ngrams
from nltk.tokenize import word_tokenize
from collections import defaultdict
from tqdm import tqdm

# Fetch the 'punkt' package through NLTK's downloader (skipped when it is already up to date).
# NOTE(review): presumably required by `word_tokenize`, which is imported above but not called in the visible cells — confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Downloading the dataset

In [5]:
# Download NLTK's `movie_reviews` corpus (referred to by the author as the IMDB dataset),
# then import its corpus reader so the following cells can read sentences from it.
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [7]:
# `sents()` returns individual sentences rather than full reviews (which can be very long).
# The sentences are already tokenized, so no further tokenization is needed.
tokenized_data = movie_reviews.sents()

## Building the model

In [8]:
# Sentence-boundary markers.
# '<SOS>' (start of sentence) is prepended to every sentence so that the choice of the first real token is also probabilistic, based on the corpus.
# '<EOS>' (end of sentence) indicates that a sentence has ended and generation should stop.
sos_token, eos_token = '<SOS>', '<EOS>'

In [9]:
class NGramLanguageModel():
    """Count-based N-gram language model with back-off to shorter contexts.

    For every context of 1 to ``n - 1`` consecutive tokens seen during training,
    the model stores how often each token followed that context. Sentences are
    generated by repeatedly sampling the next token in proportion to those counts.

    The class relies on the module-level ``sos_token`` / ``eos_token`` markers.
    """

    def __init__(self, n):
        """Create an untrained model.

        Args:
            n: Order of the model, i.e. the longest n-gram that is counted.
                Contexts therefore span at most ``n - 1`` tokens.

        Raises:
            ValueError: If ``n`` is smaller than 2. At least one context token plus
                the predicted token are needed to form a (context, next token) pair.
        """
        if n < 2:
            raise ValueError('n must be at least 2')

        self.n = n
        self.word_freqs = defaultdict(dict) # 1 to n-1 grams: {dict of possible words: frequency}

    def train(self, sentences):
        """Accumulate next-token counts from tokenized sentences.

        May be called more than once; new counts are added to the existing ones.

        Args:
            sentences: Iterable of sentences, each an iterable of string tokens.
        """
        for sentence in tqdm(sentences, desc='Processing Sentences'):

            # Pad once per sentence (it does not depend on the gram size).
            # The eos_token has to be added manually at the end of each sentence.
            padded = [sos_token, *sentence, eos_token]

            for gram_size in range(2, self.n+1):  # 2 because a key (gram[:-1]) / value (gram[-1]) pair needs at least 2 tokens

                for ngram in build_ngrams(padded, gram_size):

                    key = ngram[:-1]
                    value = ngram[-1]

                    followers = self.word_freqs[key]
                    followers[value] = followers.get(value, 0)+1  # unseen pairs start from a frequency of 0

    def generate_sentence(self, starting_state=None, max_length=50):
        """Sample a sentence token by token from the trained counts.

        Args:
            starting_state: ``None`` (or an empty value) to start from scratch, a
                string that is split on whitespace, or any iterable of tokens.
                ``sos_token`` is prepended when it is not already the first token.
            max_length: Sampling continues while the sentence, counting the leading
                ``sos_token`` and the starting tokens, holds at most this many tokens.
                The result therefore has at most ``max_length + 1`` tokens, unless
                the starting state itself was already longer.

        Returns:
            The list of tokens, beginning with ``sos_token``. It ends with
            ``eos_token`` unless generation was cut short.

        Raises:
            ValueError: If the last starting token was never seen as a context.
        """
        if starting_state is None:
            generated_sentence = []
        elif isinstance(starting_state, str):
            generated_sentence = starting_state.split()
        else:
            generated_sentence = list(starting_state)

        # Also covers an empty starting state, which would otherwise fail on [0].
        if not generated_sentence or generated_sentence[0] != sos_token:
            generated_sentence = [sos_token]+generated_sentence

        # Only the last token is validated: the back-off below can always fall back to a 1-token context.
        if not self.word_freqs.get(tuple(generated_sentence[-1:])):
            raise ValueError('Invalid starting state')

        max_key_len = self.n-1

        while len(generated_sentence) <= max_length:

            next_word_freqs = None

            # Back-off: if the latest (n-1)-token context was never seen, try the latest n-2 tokens, and so on.
            # Slicing with a key_len bigger than the list simply yields the whole list.
            for key_len in range(max_key_len, 0, -1):
                last_tokens = generated_sentence[-key_len:]
                # .get() rather than [] so a miss does not insert an empty entry into the defaultdict
                next_word_freqs = self.word_freqs.get(tuple(last_tokens))
                if next_word_freqs:
                    break

            if not next_word_freqs:
                break  # no known continuation for any context: return what we have

            words, freqs = zip(*next_word_freqs.items())  # (words...), (freqs...)

            # choices() accepts raw frequencies as weights, so no normalisation to probabilities is needed
            next_word = random.choices(words, weights=freqs, k=1)[0]
            generated_sentence.append(next_word)

            if next_word == eos_token:
                break

        return generated_sentence


In [17]:
# n=5: count n-grams of up to 5 tokens, i.e. predict from contexts of up to 4 preceding tokens.
# The bigger the value of n, the better the output generally is, but the bigger the model becomes.
model = NGramLanguageModel(n=5)

In [18]:
# Fill the model's frequency table from every sentence of the corpus (a progress bar is shown while it runs).
model.train(tokenized_data)

Processing Sentences: 100%|██████████| 71532/71532 [00:10<00:00, 6587.67it/s]


## Let's see the results

In [19]:
# 5 sentences sampled from scratch (no starting state).
# Seed the RNG first so the sampled sentences are reproducible on Restart & Run All.
random.seed(42)
for _ in range(5):
    sentence_tokens = model.generate_sentence()
    # The <SOS>/<EOS> markers are printed as well; a sentence without <EOS> was cut off by max_length.
    print(' '.join(sentence_tokens))
    print('-'*20)

<SOS> instead , he turns out to be a bad thing ? <EOS>
--------------------
<SOS> jeff has a good job , and the cast was left to carry the movie , which is frequently on display in lines like ` i ' m gonna start drinking again after this lame - ass movie . <EOS>
--------------------
<SOS> having not read the novel by thomas hardy , " jude the obscure , " his final novel ( final because this film created such an outrage that he never wrote again - see : i did do some research on it ) about a man who was being typecast
--------------------
<SOS> there are several subplots involving alan . <EOS>
--------------------
<SOS> nothing is what it seems in this reality , it ' s a rewarding experience . <EOS>
--------------------


In [24]:
# Continue a sentence from a given prefix; the string is split on whitespace into tokens.
# NOTE(review): the sampled sentences shown earlier are all lowercase, so the capitalised 'I' presumably never matches a trained context — confirm whether the prompt should be lowercased.
model.generate_sentence(starting_state='I felt like')

['<SOS>',
 'I',
 'felt',
 'like',
 'a',
 'slap',
 'in',
 'the',
 'face',
 'to',
 'be',
 'hand',
 '-',
 'fed',
 'a',
 'theme',
 'in',
 'such',
 'a',
 'simplistic',
 'way',
 '.',
 '<EOS>']