# N-Gram Language Modeling

Let's dive into n-gram language modeling with common smoothing techniques.

In [None]:
!pip install nltk

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords

## Prepare the corpus

In [None]:
# Read the play into memory: one list entry per line of the file, with the
# trailing newline characters kept.
with open("./materials/hamlet.txt", "r") as f:
    corpa = list(f)

In [None]:
def preprocess(sent: str):
    """Tokenize one line of raw text into lowercase, stop-word-free words.

    The line is lower-cased, every character other than an ASCII letter,
    digit or whitespace is replaced by a space, and the result is split on
    whitespace. Tokens found in NLTK's English stop-word list are dropped.

    Args:
        sent: A raw line of text.

    Returns:
        The surviving tokens in their original order; an empty list when
        nothing survives.
    """
    special_symbols_pattern = re.compile(r'[^a-zA-Z0-9\s]')

    # Fetch the stop words once per call and keep them in a set. Calling
    # `stopwords.words(...)` inside the filter below would fetch the list
    # again for every single word and scan it linearly each time.
    stop_words = set(stopwords.words('english'))

    # `split()` with no argument splits on any run of whitespace, so tabs
    # that the pattern above deliberately keeps cannot end up inside a token,
    # and no empty strings are produced.
    tokens = special_symbols_pattern.sub(' ', sent.lower()).split()

    return [word for word in tokens if word not in stop_words]

In [None]:
# Tokenize every line and keep only the lines that still contain tokens.
clean_corpa = [tokens for tokens in map(preprocess, corpa) if tokens]

In [None]:
# Number of non-empty tokenized lines.
len(clean_corpa)

## Let's do n-gram now

In [None]:
class NGram:
    """Dense, count-based n-gram language model with optional smoothing.

    Counts live in an ``n``-dimensional array indexed by word ids, so memory
    grows as ``len(vocab) ** n``.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        corpa: The tokenized corpus, an iterable of lists of words.
        vocab: Set of distinct words found in ``corpa``.
        w2i / i2w: Word -> index and index -> word mappings.
        cnt: ``uint16`` array of raw n-gram counts, filled by ``train``.
        prob: Probability table produced by ``train`` (``None`` before).
    """

    def __init__(self, n, corpa) -> None:
        """Collect the vocabulary and allocate an all-zero count table.

        Args:
            n: Order of the model; must be at least 1.
            corpa: Tokenized corpus (iterable of lists of words). It is
                iterated here and again in ``train``.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")

        self.n = n
        self.corpa = corpa

        self.vocab = {word for sentence in self.corpa for word in sentence}

        # uint16 keeps the dense table small; a single n-gram count above
        # 65535 would wrap around.
        self.cnt = np.zeros((len(self.vocab),) * self.n, dtype=np.uint16)
        self.prob = None
        self.w2i, self.i2w = self._build_vocab()

    def cal_prob(self, eta):
        """Return counts normalised over the last axis.

        Args:
            eta: Constant added to every denominator; a small positive value
                avoids 0/0 for contexts that were never observed.

        Returns:
            Float array with the same shape as ``cnt``.
        """
        return self.cnt / (np.sum(self.cnt, axis=-1, keepdims=True) + eta)

    def _build_vocab(self):
        """Build the word <-> index mappings.

        Words are indexed in sorted order so that the layout of ``cnt`` is
        reproducible; plain set iteration order can differ between runs.
        """
        word2index = {word: index
                      for index, word in enumerate(sorted(self.vocab))}
        index2word = {index: word for word, index in word2index.items()}

        return word2index, index2word

    @staticmethod
    def _safe_divide(num, den):
        """Divide ``num`` by ``den`` elementwise, giving 0 where ``den`` is 0."""
        return np.divide(num, den,
                         out=np.zeros_like(num, dtype=np.float64),
                         where=den != 0)

    def _laplace_sm(self, alpha):
        """Additive (Laplace) smoothing: add ``alpha`` to every count.

        The raw counts in ``cnt`` are left untouched; the smoothed values are
        computed in floating point, so fractional ``alpha`` works as well.
        """
        smoothed = self.cnt.astype(np.float64) + alpha
        self.prob = self._safe_divide(
            smoothed, smoothed.sum(axis=-1, keepdims=True))

    def _interpolation_sm(self, lbd):
        """Linear interpolation with the next lower-order model.

        ``prob = (1 - lbd) * P_n + lbd * P_(n-1)``, where the lower-order
        estimate is obtained by summing the counts over the first (oldest)
        context word. For ``n == 1`` the lower-order model is the uniform
        distribution over the vocabulary. Contexts that were never observed
        fall back entirely to the lower-order estimate.
        """
        counts = self.cnt.astype(np.float64)
        context = counts.sum(axis=-1, keepdims=True)
        higher = self._safe_divide(counts, context)

        if self.n == 1:
            lower = np.full(counts.shape, 1.0 / max(len(self.vocab), 1))
        else:
            lower_cnt = counts.sum(axis=0, keepdims=True)
            lower_ctx = lower_cnt.sum(axis=-1, keepdims=True)
            lower = self._safe_divide(lower_cnt, lower_ctx)

        mixed = (1.0 - lbd) * higher + lbd * lower
        self.prob = np.where(context > 0, mixed, lower)

    def _good_turing(self):
        """Good-Turing estimate for every n-gram cell.

        For a cell seen ``r`` times (``r < r_max``) the estimate is
        ``(r + 1) * N_(r+1) / N_r / N``; for the most frequent cells it is
        ``(r_max - 0.5) / N``. ``N_r`` is the number of cells with count
        ``r`` and ``N`` is the total number of counted n-grams.

        Raises:
            ZeroDivisionError: If some count below ``r_max`` is not attained
                by any cell (``N_r == 0``). ``prob`` is left as all zeros in
                that case.
        """
        self.prob = np.zeros(self.cnt.shape, dtype=np.float64)

        total = int(self.cnt.sum())
        if total == 0:
            # Nothing was counted, so there is nothing to redistribute.
            return

        max_cnt = int(self.cnt.max())
        # freq_of_freq[r] == N_r, computed in a single pass over the table.
        freq_of_freq = np.bincount(self.cnt.ravel(), minlength=max_cnt + 1)

        gaps = np.flatnonzero(freq_of_freq[:max_cnt] == 0)
        if gaps.size:
            i = int(gaps[0])
            raise ZeroDivisionError(
                f"No word occurs {i} times. Error, divided by zero.")

        adjusted = np.empty(max_cnt + 1, dtype=np.float64)
        ranks = np.arange(max_cnt)
        adjusted[:max_cnt] = \
            (ranks + 1) * freq_of_freq[1:] / freq_of_freq[:max_cnt]
        adjusted[max_cnt] = max_cnt - 0.5

        # Look up the adjusted count of each cell by its raw count; this
        # works for any model order.
        self.prob = adjusted[self.cnt] / total

    def train(self, smooth=None, laplace_alpha=1, lbd=0.5):
        """Count the n-grams of the corpus and fill ``prob``.

        Args:
            smooth: ``"laplace"``, ``"interpolation"`` or ``"good-turing"``.
                Any other value gives the unsmoothed relative frequencies.
            laplace_alpha: Additive constant for Laplace smoothing.
            lbd: Weight of the lower-order model for interpolation.

        Raises:
            ZeroDivisionError: From Good-Turing smoothing, see
                ``_good_turing``.
        """
        # Start from zero so that calling train() again does not double the
        # counts.
        self.cnt.fill(0)

        for sentence in self.corpa:
            indices = [self.w2i[word] for word in sentence]
            # The range is empty for sentences shorter than n.
            for start in range(len(indices) - self.n + 1):
                self.cnt[tuple(indices[start:start + self.n])] += 1

        if smooth == "laplace":
            self._laplace_sm(alpha=laplace_alpha)
        elif smooth == "interpolation":
            self._interpolation_sm(lbd=lbd)
        elif smooth == "good-turing":
            self._good_turing()
        else:
            print("No valid smoothing is provided. "
                  "Straightforward calculation.")
            self.prob = self.cal_prob(1e-8)

In [None]:
# Fit a bigram model on the cleaned corpus using Good-Turing smoothing.
bigram = NGram(n=2, corpa=clean_corpa)
bigram.train(smooth="good-turing")

Oops, we encountered a division-by-zero error. In Good-Turing smoothing:

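$$
\begin{aligned}
r^* &= \frac{(r + 1) \times N_{r+1}}{N_r}, \qquad P_r = \frac{r^*}{N} \\
P_{max} &= \frac{r_{max} - 1/2}{N}
\end{aligned}
$$

where $N_r$ is the number of n-grams observed exactly $r$ times, $N$ is the total number of observed n-grams, and $r_{max}$ is the largest observed count.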

When $N_r = 0$ for some count $r$, the estimate is undefined. Handling this situation usually requires interpolating the missing $N_r$ from higher-count events or backing off to lower-order language models.

In [None]:
# Display the model's probability table.
bigram.prob

In [None]:
# One row of indices per table cell whose probability is greater than zero.
a = np.transpose(np.nonzero(bigram.prob > 0))

Using a NumPy array to represent a language model is costly in memory, because the count table is very sparse. More memory-efficient solutions should therefore be used in practice. Please refer to [KenLM](https://github.com/kpu/kenlm) and [SRILM](http://www.speech.sri.com/projects/srilm/).