In [253]:
from math import log, exp
import pandas as pd
import nltk


In [254]:
# Read the training sentences from a CSV located relative to the working directory.
filepath = "Dataset/Training/train_data.csv"
df = pd.read_csv(filepath)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,0
0,seriously ask any woman they will tell you thi...
1,in us of taxpayers paid tax
2,haha russia go boom splat india go ahhh plop
3,can t say i m surprised
4,they use the religious label for social contro...


In [255]:
# Rename the first column of the training frame to "Value" (mutates df in place).
df.rename(columns = {df.columns[0]:"Value"},inplace=True)

In [256]:
# Tokenize each training sentence with NLTK and keep only tokens for which
# str.isalpha() is true. processed_corpus maps a running sentence index
# (starting at 0) to that sentence's list of kept tokens.
processed_corpus = {}
sentence_id = 0
for sentence in df['Value']:
    processed_corpus[sentence_id] = [
        word for word in nltk.word_tokenize(sentence) if word.isalpha()
    ]
    sentence_id += 1


In [257]:
# Sanity check: show the token list stored for the first training sentence.
print(processed_corpus[0])

['seriously', 'ask', 'any', 'woman', 'they', 'will', 'tell', 'you', 'this', 'is', 'all', 'very', 'common']


Unigram Model

In [258]:
# Count occurrences of every training token. Each sentence additionally
# contributes one start marker "<s>" and one end marker "</s>".
unigram_counts = {"<s>": 0, "</s>": 0}
for sent_tokens in processed_corpus.values():
    for word in sent_tokens:
        unigram_counts[word] = unigram_counts.get(word, 0) + 1
    unigram_counts["<s>"] += 1
    unigram_counts["</s>"] += 1


In [259]:
# Number of distinct unigram keys, including the "<s>" and "</s>" markers.
len(unigram_counts)

66112

In [260]:
# Read the test sentences; note that this rebinds the `filepath` name used for the training file.
filepath = "Dataset/Testing/test_data.csv"
df_test = pd.read_csv(filepath)
# Show the first rows as the cell output.
df_test.head()

Unnamed: 0,0
0,actually that s a really amazing idea mashaallah
1,the chinese fella was trying to take him on li...
2,i honestly think international troops will be ...
3,marry man s best friend if the man isn t good ...
4,if climate change will submerge all those plac...


In [261]:
# Rename the first column of the test frame to "Value" (mutates df_test in place).
df_test.rename(columns = {df_test.columns[0]:"Value"},inplace=True)
# Display the renamed frame as the cell output.
df_test

Unnamed: 0,Value
0,actually that s a really amazing idea mashaallah
1,the chinese fella was trying to take him on li...
2,i honestly think international troops will be ...
3,marry man s best friend if the man isn t good ...
4,if climate change will submerge all those plac...
...,...
65745,because the sky is high and so am i
65746,view link info feedback for savevideo donate d...
65747,fortunately the court did the right thing desp...
65748,yeah they kept improving man it was a nervous ...


In [262]:
# Tokenize the test sentences the same way as the training ones (NLTK
# tokenizer, alphabetic tokens only) and collect:
#   test_corpus    - sentence index -> list of kept tokens
#   test_tokens    - set of distinct test tokens
#   num_test_words - total token count, plus 2 per sentence for the markers
test_corpus = {}
test_tokens = set()
sentence_id = 0
num_test_words = 0
for sentence in df_test['Value']:
    kept = [word for word in nltk.word_tokenize(sentence) if word.isalpha()]
    test_corpus[sentence_id] = kept
    num_test_words += len(kept) + 2
    test_tokens.update(kept)
    sentence_id += 1

In [263]:
# Total number of unigram events in training (all token counts plus the
# "<s>" and "</s>" marker counts). Counts are ints, so sum() is exact.
N_train = sum(unigram_counts.values())

In [264]:
# Number of distinct tokens seen in the test sentences.
len(test_tokens)

35292

In [265]:
# calculating perplexity

# Unsmoothed unigram probabilities (count / N_train) for the two sentence
# markers and for every distinct test token. Test tokens never seen in
# training are stored with probability 0.
prob_uni_words = {
    "<s>": unigram_counts["<s>"] / N_train,
    "</s>": unigram_counts["</s>"] / N_train,
}
for word in test_tokens:
    seen = unigram_counts.get(word)
    prob_uni_words[word] = 0 if seen is None else seen / N_train

In [266]:
# Count the entries of prob_uni_words whose probability is exactly 0,
# i.e. test tokens that never occurred in training.
out_of_vocab = sum(1 for prob in prob_uni_words.values() if prob == 0)
out_of_vocab

6865

In [267]:
# Per-sentence perplexity under the unsmoothed unigram model.
# perplexity[i] = exp(-(sum of log-probabilities) / (sentence length + 2)),
# where the sum covers "<s>", every token and "</s>". pp accumulates the
# per-sentence values.
perplexity = {}
ep = 1e-15
pp = 0
for sent_id, sent_tokens in test_corpus.items():
    log_prob = log(prob_uni_words["<s>"])
    for word in sent_tokens:
        # A zero probability is replaced by ep so that log() stays defined.
        log_prob += log(prob_uni_words[word] or ep)
    log_prob += log(prob_uni_words["</s>"])
    perplexity[sent_id] = exp(-log_prob / (len(sent_tokens) + 2))
    pp += perplexity[sent_id]


In [268]:
# Reported figure is the arithmetic mean of the per-sentence perplexities
# (pp is their running sum), not a single corpus-level perplexity.
Unigram_pp = pp/len(test_corpus)
print("Perplexity of Unigram Model without smoothing is: ", Unigram_pp)

Perplexity of Unigram Model without smoothing is:  255029160.07513976


Unigram Model with Laplace Smoothing

In [269]:
# Build the vocabulary used for add-one smoothing. The first printed size is
# the training vocabulary alone; the second also includes every test token.
# V adds 2 for the "<s>" and "</s>" markers.
vocab = {word for sent_tokens in processed_corpus.values() for word in sent_tokens}
print(len(vocab))
vocab |= {word for sent_tokens in test_corpus.values() for word in sent_tokens}
print(len(vocab))
V = len(vocab) + 2

66110
72975


In [270]:
# Add-one smoothed unigram probabilities, (count + 1) / (N_train + V), for
# the sentence markers and every distinct test token. A token unseen in
# training has count 0 and therefore probability 1 / (N_train + V).
smoothed_total = N_train + V
prob_uni_sm_words = {
    marker: (unigram_counts[marker] + 1) / smoothed_total
    for marker in ("<s>", "</s>")
}
for word in test_tokens:
    prob_uni_sm_words[word] = (unigram_counts.get(word, 0) + 1) / smoothed_total

In [271]:
# Per-sentence perplexity under the add-one smoothed unigram model; the
# log-probability sum covers "<s>", every token and "</s>" and is
# normalised by sentence length + 2. pp_sm accumulates the per-sentence values.
perplexity_sm = {}
pp_sm = 0
for sent_id, sent_tokens in test_corpus.items():
    log_prob = log(prob_uni_sm_words["<s>"])
    for word in sent_tokens:
        log_prob += log(prob_uni_sm_words[word])
    log_prob += log(prob_uni_sm_words["</s>"])
    perplexity_sm[sent_id] = exp(-log_prob / (len(sent_tokens) + 2))
    pp_sm += perplexity_sm[sent_id]

In [272]:
# Arithmetic mean of the per-sentence smoothed-unigram perplexities
# (pp_sm is their running sum).
Unigram_sm_pp = pp_sm/len(test_corpus)
print("Perplexity of Unigram Model with Laplace Smoothing is: ", Unigram_sm_pp)

Perplexity of Unigram Model with Laplace Smoothing is:  1277.93160029543


Bigram Model

In [273]:
# Count bigrams over the training corpus. Each sentence is padded with one
# "<s>" and one "</s>", and every adjacent pair is counted under the key
# "<first> <second>".
#
# Padding first (instead of indexing tokens[0] / tokens[-1]) means a sentence
# whose tokens were all filtered out no longer raises IndexError; it simply
# contributes the single bigram "<s> </s>", consistent with unigram_counts,
# which counts both markers for every sentence. The loop also avoids
# rebinding the builtins `str` and `id` at notebook scope.
bigram_counts = {}
for sent_tokens in processed_corpus.values():
    padded = ["<s>", *sent_tokens, "</s>"]
    for first, second in zip(padded, padded[1:]):
        bigram = first + " " + second
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
    

In [274]:
# Number of distinct bigrams observed in training.
len(bigram_counts)

871874

In [275]:
# Unsmoothed bigram probabilities for every bigram occurring in the padded
# test sentences: count(first second) / count(first) when the bigram was
# seen in training, otherwise 0.
prob_bi_words = {}
for sent_tokens in test_corpus.values():
    padded = ["<s>", *sent_tokens, "</s>"]
    for first, second in zip(padded, padded[1:]):
        pair = first + " " + second
        if pair in bigram_counts:
            prob_bi_words[pair] = bigram_counts[pair] / unigram_counts[first]
        else:
            prob_bi_words[pair] = 0

    

In [276]:
# Per-sentence perplexity under the unsmoothed bigram model. The summed
# log-probability of all adjacent pairs in the padded sentence is normalised
# by the padded length (sentence length + 2). bi_pp accumulates the results.
bi_perplexity = {}
bi_pp = 0
ep = 1e-15
for sent_id, sent_tokens in test_corpus.items():
    padded = ["<s>", *sent_tokens, "</s>"]
    log_prob = 0
    for first, second in zip(padded, padded[1:]):
        # A zero probability is replaced by ep so that log() stays defined.
        log_prob += log(prob_bi_words[first + " " + second] or ep)
    bi_perplexity[sent_id] = exp(-log_prob / len(padded))
    bi_pp += bi_perplexity[sent_id]

In [277]:
# Arithmetic mean of the per-sentence unsmoothed-bigram perplexities
# (bi_pp is their running sum).
Bigram_pp = bi_pp/len(test_corpus)
print("Perplexity of Bigram Model without smoothing is: ", Bigram_pp)

Perplexity of Bigram Model without smoothing is:  31069869118.23472


Bigram Model with Laplace Smoothing

In [278]:
# Add-one smoothed probability for every bigram seen in training:
# (count(first second) + 1) / (count(first) + V).
prob_bi_sm_words = {}
for pair, pair_count in bigram_counts.items():
    context, _ = pair.split(" ")
    prob_bi_sm_words[pair] = (pair_count + 1) / (unigram_counts[context] + V)

In [279]:
# Per-sentence perplexity under the add-one smoothed bigram model,
# normalised by the padded sentence length. bi_pp_sm accumulates the results.
bi_perplexity_sm = {}
bi_pp_sm = 0
for sent_id, sent_tokens in test_corpus.items():
    padded = ["<s>", *sent_tokens, "</s>"]
    log_prob = 0
    for first, second in zip(padded, padded[1:]):
        pair = first + " " + second
        if pair in prob_bi_sm_words:
            prob = prob_bi_sm_words[pair]
        else:
            # Unseen bigram: numerator is 1; the context count is 0 when the
            # first word itself never occurred in training.
            prob = 1 / (unigram_counts.get(first, 0) + V)
        log_prob += log(prob)
    bi_perplexity_sm[sent_id] = exp(-log_prob / len(padded))
    bi_pp_sm += bi_perplexity_sm[sent_id]

In [280]:
# Arithmetic mean of the per-sentence smoothed-bigram perplexities
# (bi_pp_sm is their running sum).
Bigram_sm_pp = bi_pp_sm/len(test_corpus)
print("Perplexity of Bigram Model with smoothing is: ", Bigram_sm_pp)

Perplexity of Bigram Model with smoothing is:  1985.1815331914747


Trigram Model

In [281]:
# Count trigrams over the training corpus. Each sentence is padded with two
# "<s>" and two "</s>" markers; keys are the three tokens joined by spaces.
trigram_counts = {}
for sent_tokens in processed_corpus.values():
    padded = ["<s>", "<s>", *sent_tokens, "</s>", "</s>"]
    for w1, w2, w3 in zip(padded, padded[1:], padded[2:]):
        gram = " ".join((w1, w2, w3))
        trigram_counts[gram] = trigram_counts.get(gram, 0) + 1

In [282]:
# Number of distinct trigrams observed in training.
print(len(trigram_counts))

2136765


In [283]:
# Unsmoothed probability for every training trigram: its count divided by
# the count of its two-word context. When the context is not a key of
# bigram_counts (e.g. "<s> <s>", since bigrams are built with a single
# start marker), the unigram count of the middle word is used instead.
prob_tri_words = {}
for gram, gram_count in trigram_counts.items():
    w1, w2, _ = gram.split(" ")
    context = w1 + " " + w2
    if context in bigram_counts:
        context_count = bigram_counts[context]
    else:
        context_count = unigram_counts[w2]
    prob_tri_words[gram] = gram_count / context_count
        

In [284]:
# Per-sentence perplexity under the unsmoothed trigram model. Trigrams not
# seen in training contribute log(ep). The log-probability sum is normalised
# by the padded length (sentence length + 4). tri_pp accumulates the results.
tri_perplexity = {}
tri_pp = 0
ep = 1e-15
for sent_id, sent_tokens in test_corpus.items():
    padded = ["<s>", "<s>", *sent_tokens, "</s>", "</s>"]
    log_prob = 0
    for w1, w2, w3 in zip(padded, padded[1:], padded[2:]):
        log_prob += log(prob_tri_words.get(" ".join((w1, w2, w3)), ep))
    tri_perplexity[sent_id] = exp(-log_prob / len(padded))
    tri_pp += tri_perplexity[sent_id]

In [285]:
# Arithmetic mean of the per-sentence unsmoothed-trigram perplexities
# (tri_pp is their running sum).
Trigram_pp = tri_pp/len(test_corpus)
print("Perplexity of Trigram Model without smoothing is: ", Trigram_pp)

Perplexity of Trigram Model without smoothing is:  69139995101.5171


Trigram Model with Laplace Smoothing

In [286]:
# Add-one smoothed probability for every training trigram:
# (trigram count + 1) / (context count + V). The context count is the bigram
# count of the first two words when available, otherwise the unigram count
# of the middle word.
prob_tri_sm_words = {}
for gram, gram_count in trigram_counts.items():
    w1, w2, _ = gram.split(" ")
    context = w1 + " " + w2
    if context in bigram_counts:
        context_count = bigram_counts[context]
    else:
        context_count = unigram_counts[w2]
    prob_tri_sm_words[gram] = (gram_count + 1) / (context_count + V)

In [287]:
# Per-sentence perplexity under the add-one smoothed trigram model,
# normalised by the padded sentence length. tri_pp_sm accumulates the results.
tri_perplexity_sm = {}
tri_pp_sm = 0
for sent_id, sent_tokens in test_corpus.items():
    padded = ["<s>", "<s>", *sent_tokens, "</s>", "</s>"]
    log_prob = 0
    for w1, w2, w3 in zip(padded, padded[1:], padded[2:]):
        gram = " ".join((w1, w2, w3))
        if gram in prob_tri_sm_words:
            prob = prob_tri_sm_words[gram]
        else:
            # Unseen trigram: numerator 1 over the context count + V, where
            # the context count falls back bigram -> middle-word unigram -> 0.
            context = w1 + " " + w2
            if context in bigram_counts:
                context_count = bigram_counts[context]
            else:
                context_count = unigram_counts.get(w2, 0)
            prob = 1 / (context_count + V)
        log_prob += log(prob)
    tri_perplexity_sm[sent_id] = exp(-log_prob / len(padded))
    tri_pp_sm += tri_perplexity_sm[sent_id]

In [288]:
# Arithmetic mean of the per-sentence smoothed-trigram perplexities
# (tri_pp_sm is their running sum). The label now states that this is the
# smoothed model, matching the bigram and quadgram reports and distinguishing
# it from the "without smoothing" trigram figure.
Trigram_sm_pp = tri_pp_sm/len(test_corpus)
print("Perplexity of Trigram Model with smoothing is: ", Trigram_sm_pp)

Perplexity of Trigram Model is:  4955.669477563053


Quadgram Model

In [289]:
# Count 4-grams over the training corpus. Each sentence is padded with three
# "<s>" and three "</s>" markers; keys are the four tokens joined by spaces.
quadgram_counts = {}
for sent_tokens in processed_corpus.values():
    padded = ["<s>", "<s>", "<s>", *sent_tokens, "</s>", "</s>", "</s>"]
    for window in zip(padded, padded[1:], padded[2:], padded[3:]):
        gram = " ".join(window)
        quadgram_counts[gram] = quadgram_counts.get(gram, 0) + 1

In [290]:
# Number of distinct 4-grams observed in training.
print(len(quadgram_counts))

2910767


In [291]:
# Unsmoothed probability for every training 4-gram: its count divided by the
# count of its three-word context. When the context is not a key of
# trigram_counts (e.g. "<s> <s> <s>", since trigrams use only two start
# markers), the unigram count of the third word is used instead.
prob_quad_words = {}
for gram, gram_count in quadgram_counts.items():
    w1, w2, w3, _ = gram.split(" ")
    context = " ".join((w1, w2, w3))
    if context in trigram_counts:
        context_count = trigram_counts[context]
    else:
        context_count = unigram_counts[w3]
    prob_quad_words[gram] = gram_count / context_count

In [292]:
# Per-sentence perplexity under the unsmoothed 4-gram model. 4-grams not
# seen in training contribute log(ep). The log-probability sum is normalised
# by the padded sentence length. quad_pp accumulates the results.
quad_perplexity = {}
quad_pp = 0
ep = 1e-15
for sent_id, sent_tokens in test_corpus.items():
    padded = ["<s>", "<s>", "<s>", *sent_tokens, "</s>", "</s>", "</s>"]
    log_prob = 0
    for window in zip(padded, padded[1:], padded[2:], padded[3:]):
        log_prob += log(prob_quad_words.get(" ".join(window), ep))
    quad_perplexity[sent_id] = exp(-log_prob / len(padded))
    quad_pp += quad_perplexity[sent_id]

In [293]:
# Arithmetic mean of the per-sentence unsmoothed-quadgram perplexities
# (quad_pp is their running sum).
Quadgram_pp = quad_pp/len(test_corpus)
print("Perplexity of Quadgram Model without smoothing is: ", Quadgram_pp)

Perplexity of Quadgram Model without smoothing is:  292224555035.2341


Quadgram Model with Laplace Smoothing

In [294]:
# Add-one smoothed probability for every training 4-gram:
# (4-gram count + 1) / (context count + V). The context count is the trigram
# count of the first three words when available, otherwise the unigram count
# of the third word.
prob_quad_sm_words = {}
for gram, gram_count in quadgram_counts.items():
    w1, w2, w3, _ = gram.split(" ")
    context = " ".join((w1, w2, w3))
    if context in trigram_counts:
        context_count = trigram_counts[context]
    else:
        context_count = unigram_counts[w3]
    prob_quad_sm_words[gram] = (gram_count + 1) / (context_count + V)


In [295]:
# Per-sentence perplexity under the add-one smoothed 4-gram model,
# normalised by the padded sentence length. quad_pp_sm accumulates the results.
quad_perplexity_sm = {}
quad_pp_sm = 0

for sent_id, sent_tokens in test_corpus.items():
    padded = ["<s>", "<s>", "<s>", *sent_tokens, "</s>", "</s>", "</s>"]
    log_prob = 0
    for w1, w2, w3, w4 in zip(padded, padded[1:], padded[2:], padded[3:]):
        gram = " ".join((w1, w2, w3, w4))
        if gram in prob_quad_sm_words:
            prob = prob_quad_sm_words[gram]
        else:
            # Unseen 4-gram: numerator 1 over context count + V, where the
            # context count falls back trigram -> bigram -> unigram -> 0.
            tri_context = " ".join((w1, w2, w3))
            bi_context = w2 + " " + w3
            if tri_context in trigram_counts:
                context_count = trigram_counts[tri_context]
            elif bi_context in bigram_counts:
                context_count = bigram_counts[bi_context]
            else:
                context_count = unigram_counts.get(w3, 0)
            prob = 1 / (context_count + V)
        log_prob += log(prob)
    quad_perplexity_sm[sent_id] = exp(-log_prob / len(padded))
    quad_pp_sm += quad_perplexity_sm[sent_id]

In [296]:
# Arithmetic mean of the per-sentence smoothed-quadgram perplexities
# (quad_pp_sm is their running sum).
Quadgram_sm_pp = quad_pp_sm/len(test_corpus)
print("Perplexity of Quadgram Model with smoothing is: ", Quadgram_sm_pp)

Perplexity of Quadgram Model with smoothing is:  5563.028169800954


Unigram On Train Data

In [297]:
# Training-only vocabulary size used for smoothing on the training data.
# unigram_counts holds one key per distinct training token plus the "<s>" and
# "</s>" markers, so its length equals the former hard-coded 66110 + 2 while
# staying correct if the training data changes.
V_train = len(unigram_counts)

In [298]:
# Add-one smoothed unigram probability for every training key (tokens and
# sentence markers), using the training-only vocabulary size V_train.
train_prob_uni_sm_words = {
    word: (count + 1) / (N_train + V_train)
    for word, count in unigram_counts.items()
}

In [299]:
# exp of the negated mean log-probability taken over vocabulary entries
# (each distinct key counted once, not weighted by frequency).
log_prob_total = 0
for word in unigram_counts:
    log_prob_total += log(train_prob_uni_sm_words[word])
p = exp(-log_prob_total / len(unigram_counts))
p

855095.4860045433

In [300]:
# Per-sentence perplexity of the smoothed unigram model on the TRAINING
# sentences; the log-probability sum covers "<s>", every token and "</s>" and
# is normalised by sentence length + 2. train_uni_pp_sm accumulates the results.
train_uni_perplexity_sm = {}
train_uni_pp_sm = 0
for sent_id, sent_tokens in processed_corpus.items():
    log_prob = log(train_prob_uni_sm_words["<s>"])
    for word in sent_tokens:
        log_prob += log(train_prob_uni_sm_words[word])
    log_prob += log(train_prob_uni_sm_words["</s>"])
    train_uni_perplexity_sm[sent_id] = exp(-log_prob / (len(sent_tokens) + 2))
    train_uni_pp_sm += train_uni_perplexity_sm[sent_id]

In [301]:
# Arithmetic mean of the per-sentence smoothed-unigram perplexities on the
# training sentences (train_uni_pp_sm is their running sum).
train_Unigram_perplexity = train_uni_pp_sm/len(processed_corpus)
print("Perplexity of Unigram Model on train data: ", train_Unigram_perplexity)

Perplexity of Unigram Model on train data:  1165.658690125828


Bigram on Train Data

In [302]:
# Add-one smoothed probability for every training bigram using the
# training-only vocabulary size: (count + 1) / (count(first) + V_train).
train_prob_bi_sm_words = {}
for pair, pair_count in bigram_counts.items():
    context, _ = pair.split(" ")
    train_prob_bi_sm_words[pair] = (pair_count + 1) / (unigram_counts[context] + V_train)

In [303]:
# Per-sentence perplexity of the smoothed bigram model on the TRAINING
# sentences, normalised by the padded sentence length (sentence length + 2).
# train_bi_pp_sm accumulates the per-sentence values.
#
# The leftover debug prints in the fallback branches were removed, and the
# loop no longer rebinds the builtins `str` and `id` at notebook scope.
train_bi_perplexity_sm = {}
train_bi_pp_sm = 0
for sent_id, sent_tokens in processed_corpus.items():
    padded = ["<s>", *sent_tokens, "</s>"]
    log_prob = 0
    for first, second in zip(padded, padded[1:]):
        pair = first + " " + second
        if pair in train_prob_bi_sm_words:
            prob = train_prob_bi_sm_words[pair]
        else:
            # Fallback for a bigram missing from the table: numerator 1; the
            # context count is 0 when the first word has no unigram count.
            prob = 1 / (unigram_counts.get(first, 0) + V_train)
        log_prob += log(prob)
    train_bi_perplexity_sm[sent_id] = exp(-log_prob / len(padded))
    train_bi_pp_sm += train_bi_perplexity_sm[sent_id]

In [304]:
# Arithmetic mean of the per-sentence smoothed-bigram perplexities on the
# training sentences (train_bi_pp_sm is their running sum).
train_Bigram_perplexity = train_bi_pp_sm/len(processed_corpus)
print("Perplexity of Bigram Model on train data: ", train_Bigram_perplexity)

Perplexity of Bigram Model on train data:  1424.1571861376087


Trigram on Train Data

In [312]:
# Add-one smoothed probability for every training trigram using V_train:
# (trigram count + 1) / (context count + V_train). The context count is the
# bigram count of the first two words when available, otherwise the unigram
# count of the middle word.
train_prob_tri_sm_words = {}
for gram, gram_count in trigram_counts.items():
    w1, w2, _ = gram.split(" ")
    context = w1 + " " + w2
    if context in bigram_counts:
        context_count = bigram_counts[context]
    else:
        context_count = unigram_counts[w2]
    train_prob_tri_sm_words[gram] = (gram_count + 1) / (context_count + V_train)

In [313]:
# Per-sentence perplexity of the smoothed trigram model on the TRAINING
# sentences, normalised by the padded sentence length (sentence length + 4).
# train_tri_pp_sm accumulates the per-sentence values.
#
# The leftover debug prints ("A"/"B"/"C") in the fallback branches were
# removed, and the loop no longer rebinds the builtins `str` and `id`.
train_tri_perplexity_sm = {}
train_tri_pp_sm = 0
for sent_id, sent_tokens in processed_corpus.items():
    padded = ["<s>", "<s>", *sent_tokens, "</s>", "</s>"]
    log_prob = 0
    for w1, w2, w3 in zip(padded, padded[1:], padded[2:]):
        gram = " ".join((w1, w2, w3))
        if gram in train_prob_tri_sm_words:
            prob = train_prob_tri_sm_words[gram]
        else:
            # Fallback for a trigram missing from the table: numerator 1 over
            # context count + V_train, where the context count falls back
            # bigram -> middle-word unigram -> 0.
            context = w1 + " " + w2
            if context in bigram_counts:
                context_count = bigram_counts[context]
            else:
                context_count = unigram_counts.get(w2, 0)
            prob = 1 / (context_count + V_train)
        log_prob += log(prob)
    train_tri_perplexity_sm[sent_id] = exp(-log_prob / len(padded))
    train_tri_pp_sm += train_tri_perplexity_sm[sent_id]

In [314]:
# Arithmetic mean of the per-sentence smoothed-trigram perplexities on the
# training sentences, shown as the cell output.
train_tri_pp_sm/len(processed_corpus)

3017.846540326872