In [156]:
from n_grams import *
import numpy as np
from math import log, exp
import pandas as pd
import nltk


In [None]:
filepath = "Dataset/Training/train_data.csv"
df = pd.read_csv(filepath)
df.head()

In [118]:
df.rename(columns = {df.columns[0]:"Value"},inplace=True)

In [119]:
processed_corpus = {} 
sentence_id = 0
for sentence in df['Value']:
    tokenized_sentence = []
    tokens = nltk.word_tokenize(sentence)
    tokens = (token for token in tokens if token.isalpha())
    tokenized_sentence += tokens
    processed_corpus[sentence_id] = tokenized_sentence
    sentence_id += 1


In [None]:
print(processed_corpus[0])

Unigram Model

In [121]:
unigram_counts = {}
unigram_counts["<s>"] = 0
unigram_counts["</s>"] = 0
for id in processed_corpus:
    for token in processed_corpus[id]:
        if token not in unigram_counts:
            unigram_counts[token] = 0
        unigram_counts[token] += 1
    unigram_counts["<s>"] += 1
    unigram_counts["</s>"] += 1


In [122]:
len(unigram_counts)

66112

In [None]:
filepath = "Dataset/Testing/test_data.csv"
df_test = pd.read_csv(filepath)
df_test.head()

In [None]:
df_test.rename(columns = {df_test.columns[0]:"Value"},inplace=True)
df_test

In [125]:
test_corpus = {}
test_tokens = set()
sentence_id = 0
num_test_words = 0
for sentence in df_test['Value']:
    tokenized_sentence = []
    tokens = nltk.word_tokenize(sentence)
    tokens = (token for token in tokens if token.isalpha())
    tokenized_sentence += tokens
    test_corpus[sentence_id] = tokenized_sentence
    num_test_words += 2 + len(tokenized_sentence)
    for token in tokenized_sentence:
        test_tokens.add(token)
    sentence_id += 1

In [126]:
N_train = 0
for token in unigram_counts:
    N_train += unigram_counts[token]

In [None]:
len(test_tokens)

In [128]:
# calculating perplexity

prob_uni_words = {}
prob_uni_words["<s>"] = unigram_counts["<s>"] / N_train
prob_uni_words["</s>"] = unigram_counts["</s>"] / N_train
for token in test_tokens:
        if token in unigram_counts:
            prob_uni_words[token] = unigram_counts[token]/N_train
        else:
            prob_uni_words[token] = 0

In [129]:
out_of_vocab = 0
for i in prob_uni_words:
    if(prob_uni_words[i]==0):
        out_of_vocab+=1
out_of_vocab

6865

In [130]:
perplexity = {}
ep=1e-15
pp = 0
# We are neglecting the words that does not appear in the train data
for id in test_corpus:
    N = 2 + len(test_corpus[id])
    temp = log(prob_uni_words["<s>"])
    for token in test_corpus[id]:
        if(prob_uni_words[token]==0):
            temp += log(prob_uni_words[token]+ep)
        else:
            temp += log(prob_uni_words[token])
    temp += log(prob_uni_words["</s>"])
    temp = (-temp)/N
    perplexity[id] = exp(temp)
    pp += perplexity[id]


In [None]:
maxi = 0
for i in range(len(perplexity)):

    maxi = max(perplexity[i],maxi)
    # print(perplexity[i])
print(maxi)


In [145]:
Unigram_pp = pp/len(test_corpus)
print("Perplexity of Unigram Model without smoothing is: ", Unigram_pp)

Perplexity of Unigram Model without smoothing is:  255029160.07513976


Unigram Model with Laplace Smoothing

In [133]:
vocab = set()
for id in processed_corpus:
    for token in processed_corpus[id]:
        vocab.add(token)
print(len(vocab))
for id in test_corpus:
    for token in test_corpus[id]:
        vocab.add(token)
print(len(vocab))
V = len(vocab) + 2

66110
72975


In [134]:
prob_uni_sm_words = {}
prob_uni_sm_words["<s>"] = (unigram_counts["<s>"] + 1) / (N_train + V)
prob_uni_sm_words["</s>"] = (unigram_counts["</s>"] + 1) / (N_train + V)
for token in test_tokens:
        if token in unigram_counts:
            prob_uni_sm_words[token] = (unigram_counts[token] + 1) / (N_train + V)
        else:
              prob_uni_sm_words[token] = 1 / (N_train + V)

In [154]:
perplexity_sm = {}
pp_sm = 0
# We are neglecting the words that does not appear in the train data
for id in test_corpus:
    N = 2 + len(test_corpus[id])
    temp = log(prob_uni_sm_words["<s>"])
    for token in test_corpus[id]:
        temp += log(prob_uni_sm_words[token])
    temp += log(prob_uni_sm_words["</s>"])
    temp=(-temp)/N
    perplexity_sm[id] = exp(temp)
    pp_sm += perplexity_sm[id]

In [155]:
Unigram_sm_pp = pp_sm/len(test_corpus)
print("Perplexity of Unigram Model with Laplace Smoothing is: ", Unigram_sm_pp)

Perplexity of Unigram Model with Laplace Smoothing is:  1277.93160029543


Bigram Model

In [137]:
bigram_counts = {}
for id in processed_corpus:
    tokens = processed_corpus[id]
    size = len(tokens)
    if(("<s> "+tokens[0]) not in bigram_counts):
        bigram_counts["<s> "+ tokens[0]] = 0
    bigram_counts["<s> "+tokens[0]] += 1
    for i in range(0,size-1):
        str = tokens[i] + " " + tokens[i+1]
        if(str not in bigram_counts):
            bigram_counts[str] = 0
        bigram_counts[str] += 1
    if ((tokens[size-1] + " </s>") not in bigram_counts):
        bigram_counts[tokens[size-1] + " </s>"] = 0
    bigram_counts[tokens[size-1] + " </s>"]+= 1
    

In [138]:
len(bigram_counts)

871874

In [139]:
prob_bi_words = {}
for id in test_corpus:
    tokens = ["<s>"]
    tokens += test_corpus[id]
    tokens.append("</s>")
    size = len(tokens)
    for i in range(0,size-1):
        key = tokens[i]+ " "+ tokens[i+1]
        if key in bigram_counts:
            prob_bi_words[key] = bigram_counts[key] / unigram_counts[tokens[i]]
        else:
            prob_bi_words[key] = 0

    

In [146]:
bi_perplexity = {}
bi_pp = 0
ep=1e-15
# We are neglecting the words that does not appear in the train data
for id in test_corpus:
    N = 2 + len(test_corpus[id])
    tokens = ["<s>"]
    tokens += test_corpus[id]
    tokens.append("</s>")
    size = len(tokens)
    temp = 0
    for i in range(0,size-1):
        str = tokens[i] + " " + tokens[i+1]
        if prob_bi_words[str]!=0:
            temp += log(prob_bi_words[str])
        else:
            temp += log(prob_bi_words[str]+ep)
    temp = (-temp)/N
    bi_perplexity[id] = exp(temp)
    bi_pp += bi_perplexity[id]

In [None]:

for i in range(len(bi_perplexity)):
    print(bi_perplexity[i])

In [148]:
Bigram_pp = bi_pp/len(test_corpus)
print("Perplexity of Bigram Model without smoothing is: ", Bigram_pp)

Perplexity of Bigram Model without smoothing is:  31069869118.23472


Bigram Model with Laplace Smoothing

In [151]:
prob_bi_sm_words = {}
for key in bigram_counts:
    w1,w2 = key.split(" ")
    if(key not in prob_bi_sm_words):
        prob_bi_sm_words[key] = (bigram_counts[key] + 1) / (unigram_counts[w1] + V)

In [152]:
bi_perplexity_sm = {}
bi_pp_sm = 0
# We are neglecting the words that does not appear in the train data
for id in test_corpus:
    N = 2 + len(test_corpus[id])
    tokens = ["<s>"]
    tokens += test_corpus[id]
    tokens.append("</s>")
    size = len(tokens)
    temp = 0
    for i in range(0,size-1):
        str = tokens[i] + " " + tokens[i+1]
        if(str in prob_bi_sm_words):
            temp += log(prob_bi_sm_words[str])
        elif (tokens[i] in unigram_counts):
            p = 1 / (unigram_counts[tokens[i]] + V)
            temp += log(p)
        else:
            p = 1/V
            temp += log(p)
    temp = (-temp)/N
    bi_perplexity_sm[id] = exp(temp)
    bi_pp_sm += bi_perplexity_sm[id]

In [153]:
Bigram_sm_pp = bi_pp_sm/len(test_corpus)
print("Perplexity of Bigram Model with smoothing is: ", Bigram_sm_pp)

Perplexity of Bigram Model with smoothing is:  1985.1815331914747


Trigram Model

In [159]:
trigram_counts = {}
for id in processed_corpus:
    tokens = ["<s>", "<s>"]
    tokens += processed_corpus[id] + ["</s>", "</s>"]
    size = len(tokens)
    for i in range(0,size-2):
        str = tokens[i] + " " + tokens[i+1] + " " + tokens[i+2]
        if(str not in trigram_counts):
            trigram_counts[str] = 0
        trigram_counts[str] += 1

In [160]:
print(len(trigram_counts))

2136765


In [161]:
prob_tri_words = {}
for key in trigram_counts:
    w1,w2,w3 = key.split(" ")
    prev = w1 + " " + w2
    if (prev in bigram_counts):
        prob_tri_words[key] = trigram_counts[key] / bigram_counts[prev]
    else:
        prob_tri_words[key] = trigram_counts[key] / unigram_counts[w2]
        

In [162]:
tri_perplexity = {}
tri_pp = 0
ep=1e-15
# We are neglecting the words that does not appear in the train data
for id in test_corpus:
    N = 4 + len(test_corpus[id])
    tokens = ["<s>", "<s>"]
    tokens += test_corpus[id] + ["</s>", "</s>"]
    size = len(tokens)
    temp = 0
    for i in range(0,size-2):
        str = tokens[i] + " " + tokens[i+1] + " " + tokens[i+2]
        if(str in prob_tri_words):
            temp += log(prob_tri_words[str])
        else:
            temp +=log(ep)
    temp = (-temp)/N
    tri_perplexity[id] = exp(temp)
    tri_pp += tri_perplexity[id]

In [163]:
Trigram_pp = tri_pp/len(test_corpus)
print("Perplexity of Trigram Model without smoothing is: ", Trigram_pp)

Perplexity of Trigram Model without smoothing is:  69139995101.5171


Trigram Model with Laplace Smoothing

In [174]:
prob_tri_sm_words = {}
for key in trigram_counts:
    w1,w2,w3 = key.split(" ")
    prev = w1 + " " + w2
    if (prev in bigram_counts):
        prob_tri_words[key] = (trigram_counts[key] + 1) / (bigram_counts[prev] + V)
    else:
        prob_tri_words[key] = (trigram_counts[key] + 1) / (unigram_counts[w2] + V)

In [175]:
tri_perplexity_sm = {}
tri_pp_sm = 0
# We are neglecting the words that does not appear in the train data
for id in test_corpus:
    N = 4 + len(test_corpus[id])
    tokens = ["<s>", "<s>"]
    tokens += test_corpus[id] + ["</s>", "</s>"]
    size = len(tokens)
    temp = 0
    for i in range(0,size-2):
        str = tokens[i] + " " + tokens[i+1] + " " + tokens[i+2]
        if(str in prob_tri_sm_words):
            temp += log(prob_tri_sm_words[str])
        else:
            prev = tokens[i] + " " + tokens[i+1]
            if (prev in bigram_counts):
                p = 1 /(bigram_counts[prev] + V)
                temp += log(p)
            elif (tokens[i+1] in unigram_counts):
                p = 1 /(unigram_counts[tokens[i+1]] + V)
                temp += log(p)
            else:
                p = 1/V
                temp += log(p)
    temp = (-temp)/N
    tri_perplexity_sm[id] = exp(temp)
    tri_pp_sm += tri_perplexity_sm[id]

In [176]:
Trigram_sm_pp = tri_pp_sm/len(test_corpus)
print("Perplexity of Trigram Model without smoothing is: ", Trigram_sm_pp)

Perplexity of Trigram Model without smoothing is:  20458.98937238894
