In [3]:
from math import log, exp
import pandas as pd
import nltk
from n_gram import *

Importing and Processing the Test and Train Data

In [4]:
# Load the training split; the path is relative to the notebook's working directory.
filepath = "Dataset/Training/train_data.csv"
df_train = pd.read_csv(filepath_or_buffer=filepath)
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the row count.
df_train.size

263000

In [5]:
# Load the held-out test split; the path is relative to the notebook's working directory.
filepath = "Dataset/Testing/test_data.csv"
df_test = pd.read_csv(filepath_or_buffer=filepath)
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the row count.
df_test.size

65750

In [6]:
# Train Corpus
# Maps a 0-based positional sentence id to that sentence's token list.
# Each entry of the 'Value' column is tokenized with NLTK and only purely
# alphabetic tokens are kept (str.isalpha drops punctuation and numbers).
# enumerate() replaces the hand-maintained counter, and the comprehension
# replaces the "empty list += generator" accumulation.
train_corpus = {
    sentence_id: [
        token for token in nltk.word_tokenize(sentence) if token.isalpha()
    ]
    for sentence_id, sentence in enumerate(df_train['Value'])
}

In [7]:
# Test Corpus
# Maps a 0-based positional sentence id to that sentence's token list.
# Each entry of the 'Value' column is tokenized with NLTK and only purely
# alphabetic tokens are kept (str.isalpha drops punctuation and numbers).
# enumerate() replaces the hand-maintained counter, and the comprehension
# replaces the "empty list += generator" accumulation.
test_corpus = {
    sentence_id: [
        token for token in nltk.word_tokenize(sentence) if token.isalpha()
    ]
    for sentence_id, sentence in enumerate(df_test['Value'])
}

In [8]:
# Defining Vocabulary
# `vocabulary` comes from the project-local n_gram module; it receives both the
# training and the test corpus here.
# NOTE(review): presumably it returns the collection of distinct tokens across
# both corpora, which means test-set words contribute to V — confirm this is intended.
Vocab = vocabulary(train_corpus, test_corpus)

# V (the vocabulary size) is what the smoothed models below receive as `vocab` / `Vocabulary`.
V = len(Vocab)
print(V)

72977


Unigram Model

In [9]:
# Unsmoothed unigram model: fit on the training corpus (no smoothing arguments
# are passed), then score the test corpus.
prob_unigram = train_n_gram(train_corpus, 1)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_unigram = test_n_gram(
    test_data=test_corpus,
    n=1,
    prob_words=prob_unigram,
)
print("Perplexity for Unigram model without smoothing:", pp_unigram)

Perplexity for Unigram model without smoothing: 10981259472.415056


Unigram Model with Laplace Smoothing

In [10]:
# Unigram model with Laplace smoothing; the vocabulary size V is supplied to
# both the training and the evaluation call.
prob_unigram_sm = train_n_gram(
    train_corpus,
    1,
    vocab=V,
    smoothing=True,
    how='Laplace',
)

# The training corpus is handed to the evaluator as `processed_corpus`.
# NOTE(review): presumably used to smooth n-grams unseen in training — confirm in n_gram.test_n_gram.
pp, pp_unigram_sm = test_n_gram(
    test_data=test_corpus,
    n=1,
    prob_words=prob_unigram_sm,
    Vocabulary=V,
    smoothing=True,
    how='Laplace',
    processed_corpus=train_corpus,
)
print("Perplexity for Unigram model with Laplace smoothing:", pp_unigram_sm)

Perplexity for Unigram model with Laplace smoothing: 1288.081447441166


Unigram Model with Add-k Smoothing

In [12]:
# Optimized k from k_optimization notebook
# Bound once so the training call, the evaluation call and the printed report
# cannot drift apart when k is re-tuned.
K_UNIGRAM = 9.61

# Unigram model with Add-k smoothing, fit on the training corpus.
prob_unigram_sm_addk = train_n_gram(
    train_corpus,
    1,
    vocab=V,
    smoothing=True,
    how='Add_k',
    k=K_UNIGRAM,
)

# Score the test corpus with the same k; the training corpus is passed as `processed_corpus`.
pp, pp_unigram_sm_addk = test_n_gram(
    test_data=test_corpus,
    n=1,
    prob_words=prob_unigram_sm_addk,
    Vocabulary=V,
    smoothing=True,
    how='Add_k',
    k=K_UNIGRAM,
    processed_corpus=train_corpus,
)
print("Perplexity for Unigram model with Add-k smoothing:", pp_unigram_sm_addk, "at k =", K_UNIGRAM)

Perplexity for Unigram model with Add-k smoothing: 1090.37671485695 at k = 9.61


Unigram Model with Good-Turing Smoothing

In [13]:
# Unigram model with Good-Turing smoothing, selected through `how`.
# NOTE(review): unlike the Laplace / Add-k cells, `smoothing=True` is not passed
# here — confirm that n_gram picks Good-Turing from `how` alone.
prob_unigram_GT = train_n_gram(
    train_corpus,
    n=1,
    how='Good Turing',
)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_unigram_GT = test_n_gram(
    test_data=test_corpus,
    n=1,
    prob_words=prob_unigram_GT,
    how='Good Turing',
)
print("Perplexity for Unigram model with Good Turing smoothing:", pp_unigram_GT)

Perplexity for Unigram model with Good Turing smoothing: 954.394309948336


Bigram Model

In [14]:
# Unsmoothed bigram model: fit on the training corpus (no smoothing arguments
# are passed), then score the test corpus.
prob_bigram = train_n_gram(train_corpus, 2)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_bigram = test_n_gram(
    test_data=test_corpus,
    n=2,
    prob_words=prob_bigram,
)
print("Perplexity for Bigram model without smoothing:", pp_bigram)

Perplexity for Bigram model without smoothing: 41282723511.56102


Bigram Model with Laplace Smoothing

In [15]:
# Bigram model with Laplace smoothing; the vocabulary size V is supplied to
# both the training and the evaluation call.
prob_bigram_sm = train_n_gram(
    train_corpus,
    2,
    vocab=V,
    smoothing=True,
    how='Laplace',
)

# The training corpus is handed to the evaluator as `processed_corpus`.
# NOTE(review): presumably used to smooth n-grams unseen in training — confirm in n_gram.test_n_gram.
pp, pp_bigram_sm = test_n_gram(
    test_data=test_corpus,
    n=2,
    prob_words=prob_bigram_sm,
    Vocabulary=V,
    smoothing=True,
    how='Laplace',
    processed_corpus=train_corpus,
)
print("Perplexity for Bigram model with Laplace smoothing:", pp_bigram_sm)

Perplexity for Bigram model with Laplace smoothing: 1926.0717416683576


Bigram Model with Add-k Smoothing

In [29]:
# Add-k constant for the bigram model, bound once so the training and
# evaluation calls always use the same value.
# NOTE(review): presumably tuned in the k_optimization notebook like the unigram k — confirm.
K_BIGRAM = 0.001

# Bigram model with Add-k smoothing, fit on the training corpus.
prob_bigram_sm_addk = train_n_gram(
    train_corpus,
    2,
    vocab=V,
    smoothing=True,
    how='Add_k',
    k=K_BIGRAM,
)

# Score the test corpus with the same k; the training corpus is passed as `processed_corpus`.
pp, pp_bigram_sm_addk = test_n_gram(
    test_data=test_corpus,
    n=2,
    prob_words=prob_bigram_sm_addk,
    Vocabulary=V,
    smoothing=True,
    how='Add_k',
    k=K_BIGRAM,
    processed_corpus=train_corpus,
)
print("Perplexity for Bigram model with Add-k smoothing:", pp_bigram_sm_addk)

Perplexity for Bigram model with Add-k smoothing: 480.4042686537863


Bigram Model with Good-Turing Smoothing

In [17]:
# Bigram model with Good-Turing smoothing, selected through `how`.
# NOTE(review): unlike the Laplace / Add-k cells, `smoothing=True` is not passed
# here — confirm that n_gram picks Good-Turing from `how` alone.
prob_bigram_GT = train_n_gram(
    train_corpus,
    n=2,
    how='Good Turing',
)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_bigram_GT = test_n_gram(
    test_data=test_corpus,
    n=2,
    prob_words=prob_bigram_GT,
    how='Good Turing',
)
print("Perplexity for Bigram model with Good Turing smoothing:", pp_bigram_GT)

Perplexity for Bigram model with Good Turing smoothing: 25909.976748837806


Trigram Model

In [18]:
# Unsmoothed trigram model: fit on the training corpus (no smoothing arguments
# are passed), then score the test corpus.
prob_trigram = train_n_gram(train_corpus, 3)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_trigram = test_n_gram(
    test_data=test_corpus,
    n=3,
    prob_words=prob_trigram,
)
print("Perplexity for Trigram model without smoothing:", pp_trigram)

Perplexity for Trigram model without smoothing: 52285906912.19435


Trigram Model with Laplace Smoothing

In [19]:
# Trigram model with Laplace smoothing; the vocabulary size V is supplied to
# both the training and the evaluation call.
prob_trigram_sm = train_n_gram(
    train_corpus,
    3,
    vocab=V,
    smoothing=True,
    how='Laplace',
)

# The training corpus is handed to the evaluator as `processed_corpus`.
# NOTE(review): presumably used to smooth n-grams unseen in training — confirm in n_gram.test_n_gram.
pp, pp_trigram_sm = test_n_gram(
    test_data=test_corpus,
    n=3,
    prob_words=prob_trigram_sm,
    Vocabulary=V,
    smoothing=True,
    how='Laplace',
    processed_corpus=train_corpus,
)
print("Perplexity for Trigram model with Laplace smoothing:", pp_trigram_sm)

Perplexity for Trigram model with Laplace smoothing: 4859.205008548903


Trigram Model with Add-k Smoothing

In [28]:
# Add-k constant for the trigram model, bound once so the training and
# evaluation calls always use the same value.
# NOTE(review): presumably tuned in the k_optimization notebook like the unigram k — confirm.
K_TRIGRAM = 2.3e-5

# Trigram model with Add-k smoothing, fit on the training corpus.
prob_trigram_sm_addk = train_n_gram(
    train_corpus,
    3,
    vocab=V,
    smoothing=True,
    how='Add_k',
    k=K_TRIGRAM,
)

# Score the test corpus with the same k; the training corpus is passed as `processed_corpus`.
pp, pp_trigram_sm_addk = test_n_gram(
    test_data=test_corpus,
    n=3,
    prob_words=prob_trigram_sm_addk,
    Vocabulary=V,
    smoothing=True,
    how='Add_k',
    k=K_TRIGRAM,
    processed_corpus=train_corpus,
)
print("Perplexity for Trigram model with Add-k smoothing:", pp_trigram_sm_addk)

Perplexity for Trigram model with Add-k smoothing: 881.7829049234289


Trigram Model with Good-Turing Smoothing

In [21]:
# Trigram model with Good-Turing smoothing, selected through `how`.
# NOTE(review): unlike the Laplace / Add-k cells, `smoothing=True` is not passed
# here — confirm that n_gram picks Good-Turing from `how` alone.
prob_trigram_GT = train_n_gram(
    train_corpus,
    n=3,
    how='Good Turing',
)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_trigram_GT = test_n_gram(
    test_data=test_corpus,
    n=3,
    prob_words=prob_trigram_GT,
    how='Good Turing',
)
print("Perplexity for Trigram model with Good Turing smoothing:", pp_trigram_GT)

Perplexity for Trigram model with Good Turing smoothing: 64541.13782619178


Quadgram Model

In [22]:
# Unsmoothed 4-gram model: fit on the training corpus (no smoothing arguments
# are passed), then score the test corpus.
prob_quadgram = train_n_gram(train_corpus, 4)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_quadgram = test_n_gram(
    test_data=test_corpus,
    n=4,
    prob_words=prob_quadgram,
)
print("Perplexity for Quadgram model without smoothing:", pp_quadgram)

Perplexity for Quadgram model without smoothing: 272609968826.675


Quadgram Model with Laplace Smoothing

In [23]:
# 4-gram model with Laplace smoothing; the vocabulary size V is supplied to
# both the training and the evaluation call.
prob_quadgram_sm = train_n_gram(
    train_corpus,
    4,
    vocab=V,
    smoothing=True,
    how='Laplace',
)

# The training corpus is handed to the evaluator as `processed_corpus`.
# NOTE(review): presumably used to smooth n-grams unseen in training — confirm in n_gram.test_n_gram.
pp, pp_quadgram_sm = test_n_gram(
    test_data=test_corpus,
    n=4,
    prob_words=prob_quadgram_sm,
    Vocabulary=V,
    smoothing=True,
    how='Laplace',
    processed_corpus=train_corpus,
)
print("Perplexity for Quadgram model with Laplace smoothing:", pp_quadgram_sm)

Perplexity for Quadgram model with Laplace smoothing: 5467.091269501292


Quadgram Model with Add-k Smoothing

In [27]:
# Add-k constant for the 4-gram model, bound once so the training and
# evaluation calls always use the same value.
# NOTE(review): presumably tuned in the k_optimization notebook like the unigram k — confirm.
K_QUADGRAM = 5e-6

# 4-gram model with Add-k smoothing, fit on the training corpus.
prob_quadgram_sm_addk = train_n_gram(
    train_corpus,
    4,
    vocab=V,
    smoothing=True,
    how='Add_k',
    k=K_QUADGRAM,
)

# Score the test corpus with the same k; the training corpus is passed as `processed_corpus`.
pp, pp_quadgram_sm_addk = test_n_gram(
    test_data=test_corpus,
    n=4,
    prob_words=prob_quadgram_sm_addk,
    Vocabulary=V,
    smoothing=True,
    how='Add_k',
    k=K_QUADGRAM,
    processed_corpus=train_corpus,
)
print("Perplexity for Quadgram model with Add-k smoothing:", pp_quadgram_sm_addk)

Perplexity for Quadgram model with Add-k smoothing: 1753.4454231039801


Quadgram Model with Good-Turing Smoothing

In [25]:
# 4-gram model with Good-Turing smoothing, selected through `how`.
# NOTE(review): unlike the Laplace / Add-k cells, `smoothing=True` is not passed
# here — confirm that n_gram picks Good-Turing from `how` alone.
prob_quadgram_GT = train_n_gram(
    train_corpus,
    n=4,
    how='Good Turing',
)

# test_n_gram returns a pair; only the second element is reported in this cell.
pp, pp_quadgram_GT = test_n_gram(
    test_data=test_corpus,
    n=4,
    prob_words=prob_quadgram_GT,
    how='Good Turing',
)
print("Perplexity for Quadgram model with Good Turing smoothing:", pp_quadgram_GT)

Perplexity for Quadgram model with Good Turing smoothing: 90194.04162073249


Perplexities of different Models on training data

In [26]:

# Re-score the unsmoothed models on the corpus they were fitted on. Every
# n-gram in the evaluation data was seen during training, so these numbers
# show how closely each order fits its own training data.

# Unigram
pp, pp_unigram_train = test_n_gram(
    test_data=train_corpus,
    n=1,
    prob_words=prob_unigram,
)
print("Perplexity for Unigram model on training data:", pp_unigram_train)

# Bigram
pp, pp_bigram_train = test_n_gram(
    test_data=train_corpus,
    n=2,
    prob_words=prob_bigram,
)
print("Perplexity for Bigram model on training data:", pp_bigram_train)

# Trigram
pp, pp_trigram_train = test_n_gram(
    test_data=train_corpus,
    n=3,
    prob_words=prob_trigram,
)
print("Perplexity for Trigram model on training data:", pp_trigram_train)

# 4-gram
pp, pp_quadgram_train = test_n_gram(
    test_data=train_corpus,
    n=4,
    prob_words=prob_quadgram,
)
print("Perplexity for Quadgram model on training data:", pp_quadgram_train)

Perplexity for Unigram model on training data: 1238.2563332047428
Perplexity for Bigram model on training data: 76.79950776484705
Perplexity for Trigram model on training data: 8.636673482260596
Perplexity for Quadgram model on training data: 2.970771827821143
