In [1]:
import re
import math
import random
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def get_data_sentences():

    with open(r'/content/drive/MyDrive/Colab Notebooks/clean_data.txt', 'r+', encoding='utf-8') as file:
        data = file.readlines()
        file.close()
        
    return data

In [4]:
def get_tokenized_data(sentences):

    tokenized_data = []

    for sentence in sentences:
        tokenized_data.append(sentence.split(' '))
        
    return tokenized_data

In [5]:
def count_words(tokenized_sentences):   

    word_counts = {}

    for sentence in tokenized_sentences:

        for token in sentence:

            if token not in word_counts:
                word_counts[token] = 1
            else:
                word_counts[token] += 1
                
    return word_counts

In [6]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):

    closed_vocab = []
    word_counts = count_words(tokenized_sentences)

    for word, cnt in word_counts.items():

        if cnt >= count_threshold:
            closed_vocab.append(word)
            
    return closed_vocab

In [37]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):

    vocabulary = set(vocabulary)
    new_tokenized_sentences = []

    for sentence in tokenized_sentences:

        new_sentence = []
        for token in sentence:

            if token in vocabulary:

                new_sentence.append(token)

            else:

                new_sentence.append(unknown_token)

        new_tokenized_sentences.append(new_sentence)

    return new_tokenized_sentences

In [38]:
def preprocess_data(tokenized_data, count_threshold=1, unknown_token="<unk>"):

    vocabulary = get_words_with_nplus_frequency(tokenized_data, count_threshold)

    train_data = replace_oov_words_by_unk(tokenized_data, vocabulary, unknown_token)

    return train_data, vocabulary

In [9]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):

    n_grams = {}

    for sentence in data:

        sentence = [start_token]*n + sentence + [end_token]
        sentence = tuple(sentence)
        m = len(sentence) if n == 1 else len(sentence) - n + 1

        for i in range(m): 

            n_gram = sentence[i:i+n]
            if n_gram in n_grams.keys():
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
                
    return n_grams

In [10]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1):

    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0

    denominator = previous_n_gram_count + k * vocabulary_size

    n_plus1_gram = previous_n_gram + (word,) 
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0

    numerator = n_plus1_gram_count + k

    probability = numerator / denominator

    return probability

In [11]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>",  k=1):

    previous_n_gram = tuple(previous_n_gram)    
    vocabulary = vocabulary + [end_token, unknown_token]    
    vocabulary_size = len(vocabulary)   

    probabilities = {}

    for word in vocabulary:

        probability = estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities

In [12]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, start_token='<s>', end_token = '<e>', k=1):

    n = len(list(n_gram_counts.keys())[0]) 

    sentence = [start_token] * n + sentence + [end_token]
    sentence = tuple(sentence)

    N = len(sentence)
    product_pi = 1.0

    for t in range(n, N):

        n_gram = sentence[t-n:t]
        word = sentence[t]
        probability = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k)
        product_pi *= (1 / probability)

    perplexity = (product_pi)**(1/N)
    
    return perplexity

In [13]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>", k=1, start_with=None):

    n = len(list(n_gram_counts.keys())[0]) 

    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)

    suggestion = None
    max_prob = 0

    for word, prob in probabilities.items():

        if start_with:
            if not word.startswith(start_with):
                continue  
                
        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob

In [14]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1, start_with=None):

    model_counts = len(n_gram_counts_list)
    suggestions = []

    for i in range(model_counts-1):

        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k, start_with=start_with)
        suggestions.append(suggestion)

    return suggestions

In [26]:
def print_suggetions(suggestions):

  print("Suggestion: ")

  for word in suggestions:
    
    print(word)

In [40]:
data = get_data_sentences()
tokenized_data = get_tokenized_data(data)

random.shuffle(tokenized_data)

train_data, vocabulary = preprocess_data(tokenized_data, 10)

In [21]:
n_gram_counts_list = []

for n in range(1, 7):

    print("n-grams =", n)
    
    n_model_counts = count_n_grams(train_data, n)
    n_gram_counts_list.append(n_model_counts)

n-grams = 1
n-grams = 2
n-grams = 3
n-grams = 4
n-grams = 5
n-grams = 6


In [28]:
tokens_1 = ["đường", "ống", "nord"]
suggest_1 = get_suggestions(tokens_1, n_gram_counts_list[:len(tokens_1) + 1], vocabulary, k=1)

print("Previous words:", tokens_1)
print_suggetions(suggest_1)

for n in range(len(tokens_1)):
  
  perplexity = calculate_perplexity(tokens_1, n_gram_counts_list[n], n_gram_counts_list[n + 1], len(vocabulary), k=1)

  print(f"Perplexity with n-grams = {n + 1}: {perplexity}")

Previous words: ['đường', 'ống', 'nord']
Suggestion: 
('stream', 0.046193835870776086)
('stream', 0.018302956042272937)
('stream', 0.018227770544535497)
Perplexity with n-grams = 1: 134.43086588964763
Perplexity with n-grams = 2: 113.06874118862112
Perplexity with n-grams = 3: 125.05429531292415


In [29]:
tokens_2 = ["vũ", "khí"]
suggest_2 = get_suggestions(tokens_2, n_gram_counts_list[:len(tokens_2) + 1], vocabulary, k=1)

print("Previous words:", tokens_2)
print_suggetions(suggest_2)

for n in range(len(tokens_2)):
  
  perplexity = calculate_perplexity(tokens_2, n_gram_counts_list[n], n_gram_counts_list[n + 1], len(vocabulary), k=1)

  print(f"Perplexity with n-grams = {n + 1}: {perplexity}")

Previous words: ['vũ', 'khí']
Suggestion: 
('đốt', 0.0658965896589659)
('hạt', 0.026947984123668268)
Perplexity with n-grams = 0: 178.29285477102198
Perplexity with n-grams = 1: 132.80030280497326


In [30]:
tokens_3 = ["từ", "trường"]
suggest_3 = get_suggestions(tokens_3, n_gram_counts_list[:len(tokens_3) + 1], vocabulary, k=1)

print("Previous words:", tokens_3)
print_suggetions(suggest_3)

for n in range(len(tokens_3)):
  
  perplexity = calculate_perplexity(tokens_3, n_gram_counts_list[n], n_gram_counts_list[n + 1], len(vocabulary), k=1)

  print(f"Perplexity with n-grams = {n + 1}: {perplexity}")

Previous words: ['từ', 'trường']
Suggestion: 
('hợp', 0.05799509156126109)
('trái', 0.0019337871287128713)
Perplexity with n-grams = 1: 202.51201140107563
Perplexity with n-grams = 2: 95.38858738420113


In [31]:
tokens_4 = ["thổ", 'nhĩ']
suggest_4 = get_suggestions(tokens_4, n_gram_counts_list[:len(tokens_4) + 1], vocabulary, k=1)

print("Previous words:", tokens_4)
print_suggetions(suggest_4)

for n in range(len(tokens_4)):
  
  perplexity = calculate_perplexity(tokens_4, n_gram_counts_list[n], n_gram_counts_list[n + 1], len(vocabulary), k=1)

  print(f"Perplexity with n-grams = {n + 1}: {perplexity}")

Previous words: ['thổ', 'nhĩ']
Suggestion: 
('kỳ', 0.08266685540377947)
('kỳ', 0.08010910918096333)
Perplexity with n-grams = 1: 193.95820780900857
Perplexity with n-grams = 2: 136.97050638722203


In [32]:
tokens_5 = ["nơi", "dễ", "tổn"]
suggest_5 = get_suggestions(tokens_5, n_gram_counts_list[:len(tokens_5) + 1], vocabulary, k=1)

print("Previous words:", tokens_5)
print_suggetions(suggest_5)

for n in range(len(tokens_5)):
  
  perplexity = calculate_perplexity(tokens_5, n_gram_counts_list[n], n_gram_counts_list[n + 1], len(vocabulary), k=1)

  print(f"Perplexity with n-grams = {n + 1}: {perplexity}")

Previous words: ['nơi', 'dễ', 'tổn']
Suggestion: 
('thương', 0.02998016018811081)
('thương', 0.001713796058269066)
('trong', 7.803964413922273e-05)
Perplexity with n-grams = 1: 606.9389615801686
Perplexity with n-grams = 2: 416.3179354514162
Perplexity with n-grams = 3: 175.82915457033627
