# Exercise 1
1) Tìm hiểu phương pháp smoothing sử dụng trong mô hình ngôn ngữ, tập trung phương pháp Laplace

2) áp dụng mô hình ngôn ngữ 2-gram và 3-gram 

- Huấn luyện trên 1 tập corpus (tự thu thập)

- Tính xác suất của 1 câu đưa vào (có sử dụng smoothing)

Smoothing methods in language models are used to reduce the phenomenon of "zero probability" for words that the model has never seen during training.

## Laplace smoothing
Idea: To add a constant value to all the word counts in the dictionary. Specifically, we add one to all word counts, and then divide the numerator by the total number of possible words, including both the added words and the observed ones.

$$ P(w_i \mid \text{class}) = \frac{\text{freq}(w_i, \text{class} ) + 1}{(N_{\text{class}} + V)} $$

* N class = frequency of all words in class
* V = number of unique words in vocabulary


## Import Lib

In [127]:
# Lib
from utils import process_tweet, build_freqs
import nltk

import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from collections import Counter


### Note:
- nltk.FreqDist is the same Counter

## Preprocessing


In [128]:
SOS = "<s> "
EOS = "</s>"
UNK = "<UNK>"

def add_sentence_tokens(sentences, n):
    """Wrap each sentence in SOS and EOS tokens.

    For n >= 2, n-1 SOS tokens are added, otherwise only one is added.

    Args:
        sentences (list of str): the sentences to wrap.
        n (int): order of the n-gram model which will use these sentences.
    Returns:
        List of sentences with SOS and EOS tokens wrapped around them.

    """
    sos = SOS * (n-1) if n > 1 else SOS
    return ['{}{} {}'.format(sos, s, EOS) for s in sentences]

def replace_singletons(tokens):
    """Replace tokens which appear only once in the corpus with <UNK>.
    
    Args:
        tokens (list of str): the tokens comprising the corpus.
    Returns:
        The same list of tokens with each singleton replaced by <UNK>.
    
    """
    vocab = nltk.FreqDist(tokens)
    return [token if vocab[token] > 1 else UNK for token in tokens]

def preprocess_1(sentences, n): # it have function replace singletons 
    """Add SOS/EOS/UNK tokens to given sentences and tokenize.

    Args:
        sentences (list of str): the sentences to preprocess.
        n (int): order of the n-gram model which will use these sentences.
    Returns:
        The preprocessed sentences, tokenized by words.

    """
    sentences = add_sentence_tokens(sentences, n)
    tokens = ' '.join(sentences).split(' ')
    tokens = replace_singletons(tokens)
    return tokens

def preprocess_2(sentences, n): #It haven't function replace sinletons
    """Add SOS/EOS/UNK tokens to given sentences and tokenize.

    Args:
        sentences (list of str): the sentences to preprocess.
        n (int): order of the n-gram model which will use these sentences.
    Returns:
        The preprocessed sentences, tokenized by words.

    """
    sentences = add_sentence_tokens(sentences, n)
    tokens = ' '.join(sentences).split(' ')
    return tokens

In [129]:
def generate_ngrams(tokens, n):
    """Generate n-grams from a list of tokens."""
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngram = ' '.join(tokens[i:i+n])
        ngrams.append(ngram)
    return ngrams

In [130]:
def build_ngram_model(tokens: list, n = 2): ## token , n -gram
    
    ngrams = list(nltk.ngrams(tokens, n))
    ngram_counts = Counter(ngrams)
    total_ngrams = len(ngram_counts)
    # Calculate Voca
    V = nltk.FreqDist(tokens)
    V_len = len(V)
    # print(ngram_counts)
    # Calculate probabilities
    ngram_probabilities = {ngram: (count+ 1) / (total_ngrams + V_len) for ngram, count in ngram_counts.items()}
    
    sorted_ngram_model = sorted(ngram_probabilities.items(), key=lambda x: x[1], reverse=True)
    return sorted_ngram_model 

def printNgram(ngram_model):
    for ngram, probability in ngram_model:
        print(f"{ngram}: {probability:.5f}")

## Load data

In [131]:
# path = "data/train.txt"
path = "train.txt"
with open(path, 'r') as f:
    train = [l.strip() for l in f.readlines()]

### Review

In [132]:
# train

In [133]:
# result_2gram = preprocess(train, 2)
# result_2gram
result_2gram = add_sentence_tokens(train, 2)
result_2gram

['<s> liberty all star usa sets initial payout </s>',
 '<s> we are being accused of not implementing this agreement </s>',
 '<s> entregrowth closed at 135 dlrs and options at 55 cents </s>',
 '<s> usda forecast south african 1986 87 corn exports at 210 mln tonnes vs 300 mln tonnes last month and 1985 86 exports at 275 mln tonnes vs 275 mln tonnes last month </s>',
 '<s> norgolds issued capital will be 2405 mln shares of which 63 pct will be held by nbh after 89 mln are issued to shareholders to raise 196 mln dlrs it said </s>',
 '<s> the april 6 sale to be evenly divided between the three and six month issues will result in a paydown of 165 billion dlrs as maturing bills total 1485 billion dlrs </s>',
 '<s> waste managements tender offer announced before the opening today expires march 25 </s>',
 '<s> he earlier estimated the damage from the us raid at about 500 mln dlrs </s>',
 '<s> brougher bigi to sell 40 pct of subsidiary </s>',
 '<s> that was not the case two years ago </s>',
 '<s

In [134]:
tokens = ' '.join(result_2gram).split(' ')
# tokens

In [135]:
vocab = nltk.FreqDist(tokens)
vocab

FreqDist({'<s>': 22, '</s>': 22, 'the': 22, 'mln': 12, 'and': 11, 'in': 10, 'of': 8, 'to': 8, 'not': 7, 'dlrs': 7, ...})

In [136]:
result_cleaned = preprocess_2(train, 2)
len(result_cleaned)
# result_cleaned

473

In [137]:
n_grams = nltk.ngrams(result_cleaned, 2)
n_vocab = nltk.FreqDist(n_grams)

m_grams = nltk.ngrams(result_cleaned, 1)
m_vocab = nltk.FreqDist(m_grams)

In [138]:
m_vocab

FreqDist({('<s>',): 22, ('</s>',): 22, ('the',): 22, ('mln',): 12, ('and',): 11, ('in',): 10, ('of',): 8, ('to',): 8, ('not',): 7, ('dlrs',): 7, ...})

In [139]:
n_vocab

FreqDist({('</s>', '<s>'): 21, ('<s>', 'the'): 6, ('mln', 'tonnes'): 4, ('will', 'be'): 3, ('exports', 'at'): 2, ('tonnes', 'vs'): 2, ('tonnes', 'last'): 2, ('last', 'month'): 2, ('275', 'mln'): 2, ('mln', 'dlrs'): 2, ...})

## Run 
### Module 2 gram

In [140]:
voca = nltk.FreqDist(result_cleaned)
type(voca)

nltk.probability.FreqDist

In [141]:
ngram_counts = Counter(result_cleaned)
len(ngram_counts)

263

In [142]:
ngram_model = build_ngram_model(result_cleaned, n = 2)
# # Print the 2-gram model
# ngram_model
printNgram(ngram_model)

('</s>', '<s>'): 0.03207
('<s>', 'the'): 0.01020
('mln', 'tonnes'): 0.00729
('will', 'be'): 0.00583
('exports', 'at'): 0.00437
('tonnes', 'vs'): 0.00437
('tonnes', 'last'): 0.00437
('last', 'month'): 0.00437
('275', 'mln'): 0.00437
('mln', 'dlrs'): 0.00437
('it', 'said'): 0.00437
('billion', 'dlrs'): 0.00437
('dlrs', '</s>'): 0.00437
('the', 'us'): 0.00437
('on', 'the'): 0.00437
('in', 'an'): 0.00437
('vs', 'not'): 0.00437
('not', 'given'): 0.00437
('given', 'net'): 0.00437
('corp', 'said'): 0.00437
('said', 'it'): 0.00437
('in', 'the'): 0.00437
('the', 'company'): 0.00437
('<s>', 'liberty'): 0.00292
('liberty', 'all'): 0.00292
('all', 'star'): 0.00292
('star', 'usa'): 0.00292
('usa', 'sets'): 0.00292
('sets', 'initial'): 0.00292
('initial', 'payout'): 0.00292
('payout', '</s>'): 0.00292
('<s>', 'we'): 0.00292
('we', 'are'): 0.00292
('are', 'being'): 0.00292
('being', 'accused'): 0.00292
('accused', 'of'): 0.00292
('of', 'not'): 0.00292
('not', 'implementing'): 0.00292
('implementing',

### Module 3 gram

In [143]:
ngram_model = build_ngram_model(result_cleaned, n = 3)
# # Print the 2-gram model
printNgram(ngram_model)

('</s>', '<s>', 'the'): 0.00972
('mln', 'tonnes', 'vs'): 0.00417
('mln', 'tonnes', 'last'): 0.00417
('tonnes', 'last', 'month'): 0.00417
('275', 'mln', 'tonnes'): 0.00417
('dlrs', '</s>', '<s>'): 0.00417
('vs', 'not', 'given'): 0.00417
('not', 'given', 'net'): 0.00417
('corp', 'said', 'it'): 0.00417
('<s>', 'the', 'company'): 0.00417
('<s>', 'liberty', 'all'): 0.00278
('liberty', 'all', 'star'): 0.00278
('all', 'star', 'usa'): 0.00278
('star', 'usa', 'sets'): 0.00278
('usa', 'sets', 'initial'): 0.00278
('sets', 'initial', 'payout'): 0.00278
('initial', 'payout', '</s>'): 0.00278
('payout', '</s>', '<s>'): 0.00278
('</s>', '<s>', 'we'): 0.00278
('<s>', 'we', 'are'): 0.00278
('we', 'are', 'being'): 0.00278
('are', 'being', 'accused'): 0.00278
('being', 'accused', 'of'): 0.00278
('accused', 'of', 'not'): 0.00278
('of', 'not', 'implementing'): 0.00278
('not', 'implementing', 'this'): 0.00278
('implementing', 'this', 'agreement'): 0.00278
('this', 'agreement', '</s>'): 0.00278
('agreement',

## Write OOP

In [144]:
# path = "data/train.txt"
path = "train.txt"
with open(path, 'r') as f:
    train = [l.strip() for l in f.readlines()]

In [145]:
class NGramLanguageModel(object):
    """An n-gram language model trained on a given corpus.
    
    For a given n and given training corpus, constructs an n-gram language
    model for the corpus by:
    1. preprocessing the corpus (adding SOS/EOS/UNK tokens)
    2. calculating (smoothed) probabilities for each n-gram

    Also contains methods for calculating the perplexity of the model
    against another corpus, and for generating sentences.

    Args:
        train_data (list of str): list of sentences comprising the training corpus.
        n (int): the order of language model to build (i.e. 1 for unigram, 2 for bigram, etc.).
        laplace (int): lambda multiplier to use for laplace smoothing (default 1 for add-1 smoothing).

    """
    def __init__(self, train_data, n, laplace=1) -> None:
        self.n = n
        self.laplace = laplace
        self.tokens = preprocess_2(train_data, n)
        self.vocab = nltk.FreqDist(self.tokens)  # word, times
        self.model = self._create_model()
    
    def _smooth(self):
        """Apply Laplace smoothing to n-gram frequency distribution.
        
        Here, n_grams refers to the n-grams of the tokens in the training corpus,
        while m_grams refers to the first (n-1) tokens of each n-gram.

        Returns:
            dict: Mapping of each n-gram (tuple of str) to its Laplace-smoothed 
            probability (float).

        """
        vocab_size = len(self.vocab)
        
        n_grams = nltk.ngrams(self.tokens, self.n)
        n_vocab = nltk.FreqDist(n_grams)
        
        m_grams = nltk.ngrams(self.tokens, self.n - 1)
        m_vocab = nltk.FreqDist(m_grams)
        
        def smoothed_count(n_gram, n_count):
            m_gram = n_gram[:-1]
            m_count = m_vocab[m_gram]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)
        
        return {n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items()}
        
    def _create_model(self):       
        """Create a probability distribution for the vocabulary of the training corpus.
        
        If building a unigram model, the probabilities are simple relative frequencies
        of each token with the entire corpus.

        Otherwise, the probabilities are Laplace-smoothed relative frequencies.

        Returns:
            A dict mapping each n-gram (tuple of str) to its probability (float).

        """
        if self.n == 1:
            num_tokens = len(self.tokens)
            return {(unigram,): count/ num_tokens for unigram, count in self.vocab.items()}
        else: 
            return self._smooth()
    
    

## Run code 
### Model 2, 3 Gram
- calcualte probabblity of corpus

In [146]:
lm_2gram = NGramLanguageModel(train, 2, 1)

lm_2gram.model


{('<s>', 'liberty'): 0.007017543859649123,
 ('liberty', 'all'): 0.007575757575757576,
 ('all', 'star'): 0.007547169811320755,
 ('star', 'usa'): 0.007575757575757576,
 ('usa', 'sets'): 0.007575757575757576,
 ('sets', 'initial'): 0.007575757575757576,
 ('initial', 'payout'): 0.007575757575757576,
 ('payout', '</s>'): 0.007575757575757576,
 ('</s>', '<s>'): 0.07719298245614035,
 ('<s>', 'we'): 0.007017543859649123,
 ('we', 'are'): 0.007575757575757576,
 ('are', 'being'): 0.007518796992481203,
 ('being', 'accused'): 0.007575757575757576,
 ('accused', 'of'): 0.007575757575757576,
 ('of', 'not'): 0.007380073800738007,
 ('not', 'implementing'): 0.007407407407407408,
 ('implementing', 'this'): 0.007575757575757576,
 ('this', 'agreement'): 0.007547169811320755,
 ('agreement', '</s>'): 0.007547169811320755,
 ('<s>', 'entregrowth'): 0.007017543859649123,
 ('entregrowth', 'closed'): 0.007575757575757576,
 ('closed', 'at'): 0.007575757575757576,
 ('at', '135'): 0.007434944237918215,
 ('135', 'dlrs'

#### 3 Gram

In [147]:
lm_3gram = NGramLanguageModel(train, 3, 1)

lm_3gram.model
 

{('<s>', '<s>', 'liberty'): 0.007017543859649123,
 ('<s>', 'liberty', 'all'): 0.007575757575757576,
 ('liberty', 'all', 'star'): 0.007575757575757576,
 ('all', 'star', 'usa'): 0.007575757575757576,
 ('star', 'usa', 'sets'): 0.007575757575757576,
 ('usa', 'sets', 'initial'): 0.007575757575757576,
 ('sets', 'initial', 'payout'): 0.007575757575757576,
 ('initial', 'payout', '</s>'): 0.007575757575757576,
 ('payout', '</s>', '<s>'): 0.007575757575757576,
 ('</s>', '<s>', '<s>'): 0.07746478873239436,
 ('<s>', '<s>', 'we'): 0.007017543859649123,
 ('<s>', 'we', 'are'): 0.007575757575757576,
 ('we', 'are', 'being'): 0.007575757575757576,
 ('are', 'being', 'accused'): 0.007575757575757576,
 ('being', 'accused', 'of'): 0.007575757575757576,
 ('accused', 'of', 'not'): 0.007575757575757576,
 ('of', 'not', 'implementing'): 0.007575757575757576,
 ('not', 'implementing', 'this'): 0.007575757575757576,
 ('implementing', 'this', 'agreement'): 0.007575757575757576,
 ('this', 'agreement', '</s>'): 0.0075