Imports made

In [None]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank as ptb

import numpy as np
import math
from collections import Counter
from itertools import islice

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


### Necessary functions to implement for the class-based model

In [None]:
#Count adjacent (word, word) bigrams in a sequence of (Word,POS Tag) tuples
def find_bigrams(text):
    """Count consecutive word pairs in a tagged token sequence.

    Args:
        text: indexable sequence whose items hold the word at index 0
            (e.g. ``(word, tag)`` tuples).

    Returns:
        dict mapping ``(word_i, word_i_plus_1)`` to the number of times that
        adjacent pair occurs. Empty when ``text`` has fewer than two items.
    """
    pair_counts = {}
    for idx in range(len(text) - 1):
        pair = (text[idx][0], text[idx + 1][0])
        pair_counts[pair] = pair_counts.get(pair, 0) + 1
    return pair_counts

In [None]:
#Get word related parameters
def get_params_words(d, unigrams, vocab):
    """Compute the absolute-discounting unigram parameters for words.

    Args:
        d: discount subtracted from every unigram count.
        unigrams: mapping word -> count (a Counter or a plain dict).
        vocab: sized iterable of the words to score.

    Returns:
        Tuple ``(P_unif, lambdadot_w, P_abs_wi)``:
        P_unif is ``1 / len(vocab)``, lambdadot_w is the back-off weight
        ``(d / total count) * number of unigram entries``, and P_abs_wi maps
        every word in ``vocab`` to its discounted unigram probability.

    Raises:
        ZeroDivisionError: if ``vocab`` is empty or the counts sum to zero.
    """
    P_unif = 1/len(vocab)
    N_unis = sum(unigrams.values())
    N1_plus = len(unigrams) #eqn 11

    lambdadot_w = (d/N_unis) * N1_plus #eqn 10

    # The uniform back-off mass is the same for every word, so compute it once.
    backoff_mass = lambdadot_w * P_unif

    # .get() treats a vocab word missing from a plain dict as count 0,
    # which is what a Counter already does.
    P_abs_wi = {
        word: (max((unigrams.get(word, 0) - d), 0)/N_unis) + backoff_mass #eqn 7
        for word in vocab
    }

    return P_unif, lambdadot_w, P_abs_wi

#Get POS Tag related parameters
def get_params_pos(d, pos_uni, pos_bi, pos_classes):
    """Compute the absolute-discounting parameters for POS tags.

    Args:
        d: discount subtracted from every count.
        pos_uni: mapping tag -> count (a Counter or a plain dict).
        pos_bi: mapping (tag, next_tag) -> count.
        pos_classes: sized iterable of the tags to compute parameters for.

    Returns:
        Tuple ``(Pos_unif, lambdadot_p, N_pos_bi, lambda_pos_1, P_abs_pos)``:
        Pos_unif is ``1 / len(pos_classes)``; lambdadot_p is the unigram
        back-off weight; N_pos_bi maps each tag to the total count of tag
        bigrams starting with it; lambda_pos_1 maps each tag to its bigram
        back-off weight (0 when the tag starts no bigram); P_abs_pos maps
        each tag to its discounted unigram probability.

    Raises:
        ZeroDivisionError: if ``pos_classes`` is empty or the unigram counts
            sum to zero.
    """
    Pos_unif = 1/len(pos_classes)
    N_pos_unis = sum(pos_uni.values())
    N1_plus_pos = len(pos_uni) #eqn 17

    lambdadot_p = (d/N_pos_unis) * N1_plus_pos #eqn 16
    backoff_mass = lambdadot_p * Pos_unif

    # Single pass over the tag bigrams instead of rescanning them for every tag.
    distinct_followers = {} #first tag -> number of distinct bigrams with positive count
    total_followers = {} #first tag -> summed count of those bigrams
    for key, val in pos_bi.items():
        if val > 0:
            first = key[0]
            distinct_followers[first] = distinct_followers.get(first, 0) + 1
            total_followers[first] = total_followers.get(first, 0) + val

    N_pos_bi = dict()
    lambda_pos_1 = dict()
    P_abs_pos = dict()

    for pos in pos_classes:
        N = total_followers.get(pos, 0) #no. of (pos1, pos2) beginning with pos1
        N_pos_bi[pos] = N

        if N > 0:
            lambda_pos_1[pos] = (d/N) * distinct_followers.get(pos, 0) #eqn 14, 15
        else:
            lambda_pos_1[pos] = 0

        P_abs_pos[pos] = (max((pos_uni.get(pos, 0) - d), 0)/N_pos_unis) + backoff_mass #eqn 13

    return Pos_unif, lambdadot_p, N_pos_bi, lambda_pos_1, P_abs_pos

def get_P_abs_wi1_class(wi,wi_1,d,words,words_pos,pos_uni,pos_bi,lambda_pos,P_abs_pos,lambdadot_w, lambdadot_p,
                  word_pos_dict, Pos_unif,P_unif,pos_classes,P_abs_wi):
    """Class-based estimate of the probability of ``wi`` following ``wi_1``.

    For every tag in ``pos_classes`` a word term (``wi`` with that tag) is
    multiplied by a tag-transition term (that tag following the tag of
    ``wi_1``), and the products are summed.

    Args:
        wi: the predicted word.
        wi_1: the preceding word; its tag is looked up in ``word_pos_dict``.
        d: absolute discount.
        words: mapping word -> count, used only for membership checks.
        words_pos: mapping (word, tag) -> count.
        pos_uni: mapping tag -> count.
        pos_bi: mapping (tag, next_tag) -> count.
        lambda_pos: mapping tag -> bigram back-off weight.
        P_abs_pos: mapping tag -> discounted unigram tag probability.
        lambdadot_w: unigram back-off weight for words.
        lambdadot_p: unigram back-off weight for tags.
        word_pos_dict: mapping word -> tag.
        Pos_unif: uniform probability over tags.
        P_unif: uniform probability over words.
        pos_classes: iterable of tags to sum over.
        P_abs_wi: mapping word -> discounted unigram word probability.

    Returns:
        The summed probability as a number (0 when ``pos_classes`` is empty).

    Raises:
        KeyError: if ``wi_1`` is missing from ``word_pos_dict``.
    """
    pos_1 = word_pos_dict[wi_1]

    # Floors used whenever a lookup needed by the discounted estimate is missing.
    word_floor = lambdadot_w * P_unif
    # The original referenced an undefined `lambdadot_pos` here, so the tag
    # back-off path raised NameError; the parameter is `lambdadot_p`.
    pos_floor = lambdadot_p * Pos_unif

    P_abs_w_wi = 0

    for pos in pos_classes:
        # Word term: discounted (word, tag) estimate, backing off step by step.
        try:
            if (wi, pos) in words_pos:
                prob_w = (max((words_pos[(wi,pos)] - d), 0)/pos_uni[pos]) + (lambda_pos[pos] * P_abs_wi[wi])
            elif pos in lambda_pos:
                prob_w = lambda_pos[pos] * P_abs_wi[wi]
            elif wi in words:
                prob_w = P_abs_wi[wi]
            else:
                prob_w = word_floor
        except (KeyError, ZeroDivisionError):
            # Unseen word/tag or an empty denominator: fall back to the uniform floor.
            prob_w = word_floor

        # Tag term: discounted tag-bigram estimate given the previous word's tag.
        try:
            if lambda_pos[pos_1] == 0:
                prob_p = pos_floor
            else:
                prob_p = (max((pos_bi[(pos_1,pos)] - d), 0)/pos_uni[pos_1]) + (lambda_pos[pos_1] * P_abs_pos[pos])
        except (KeyError, ZeroDivisionError):
            prob_p = pos_floor

        P_abs_w_wi += prob_w * prob_p

    return P_abs_w_wi

### Necessary functions to implement for the absolute discounting model

In [None]:
def get_params_absdisc(d, bigrams, unigrams, vocab):
    """Compute the parameters of the absolute-discounting bigram model.

    Args:
        d: discount subtracted from every count.
        bigrams: mapping (word, next_word) -> count.
        unigrams: mapping word -> count (a Counter or a plain dict).
        vocab: sized iterable of the words to compute parameters for.

    Returns:
        Tuple ``(P_unif, lambdadot, N1_plus_wi1, N_bigrams, lambda_wi, P_abs_wi)``:
        P_unif is ``1 / len(vocab)``; lambdadot is the unigram back-off
        weight; N1_plus_wi1 maps each word to the number of distinct bigrams
        (with positive count) starting with it (eqn 10); N_bigrams maps each
        word to the summed count of those bigrams (denominator of 7, 9);
        lambda_wi maps each word to its bigram back-off weight (eqn 9, 0 when
        the word starts no bigram); P_abs_wi maps each word to its discounted
        unigram probability (eqn 8).

    Raises:
        ZeroDivisionError: if ``vocab`` is empty or the unigram counts sum
            to zero.
    """
    P_unif = 1/len(vocab)
    N_unis = sum(unigrams.values())
    N1_plus = len(unigrams)
    lambdadot = (d/N_unis) * N1_plus
    backoff_mass = lambdadot * P_unif

    # One pass over the bigram table; the original rescanned every bigram for
    # every vocabulary word, which is quadratic in practice.
    distinct_followers = {} #first word -> number of distinct bigrams with positive count
    total_followers = {} #first word -> summed count of those bigrams
    for key, val in bigrams.items():
        if val > 0:
            first = key[0]
            distinct_followers[first] = distinct_followers.get(first, 0) + 1
            total_followers[first] = total_followers.get(first, 0) + val

    #params to return
    N1_plus_wi1 = dict() #eqn 10
    N_bigrams = dict() #denominator of 7, 9
    lambda_wi = dict() #eqn 9
    P_abs_wi = dict() #eqn 8

    for word in vocab:
        count = distinct_followers.get(word, 0)
        N = total_followers.get(word, 0)

        N1_plus_wi1[word] = count
        N_bigrams[word] = N
        lambda_wi[word] = (d/N) * count if N > 0 else 0
        P_abs_wi[word] = (max((unigrams.get(word, 0) - d), 0)/N_unis) + backoff_mass

    return P_unif, lambdadot, N1_plus_wi1, N_bigrams, lambda_wi, P_abs_wi


def get_P_abs_wi1_absdisc(wi, wi_1, d, unigrams, bigrams, N_bigrams, lambda_wi, P_abs_wi, lambdadot, P_unif):
    """Absolute-discounting estimate of the probability of ``wi`` following ``wi_1``.

    Args:
        wi: the predicted word.
        wi_1: the preceding word.
        d: absolute discount.
        unigrams: unused; kept so existing callers keep working.
        bigrams: mapping (previous_word, word) -> count.
        N_bigrams: mapping word -> total count of bigrams starting with it.
        lambda_wi: mapping word -> bigram back-off weight.
        P_abs_wi: mapping word -> discounted unigram probability.
        lambdadot: unigram back-off weight.
        P_unif: uniform probability over words.

    Returns:
        The discounted bigram estimate when the bigram was seen; otherwise
        the back-off estimate; ``lambdadot * P_unif`` when a required lookup
        is missing or a denominator is zero.
    """
    floor = lambdadot * P_unif
    # Only missing keys and empty denominators are expected here; any other
    # error (wrong types, interrupts) is allowed to propagate instead of
    # being silently turned into a probability.
    try:
        if (wi_1, wi) in bigrams:
            discounted = max((bigrams[(wi_1, wi)] - d), 0)/N_bigrams[wi_1]
            return discounted + (lambda_wi[wi_1] * P_abs_wi[wi])
        if wi_1 in lambda_wi:
            # Unseen bigram with a known history: back off to the unigram estimate.
            return lambda_wi[wi_1] * P_abs_wi[wi]
        if wi in P_abs_wi:
            # Unknown history: use the unigram estimate directly.
            return P_abs_wi[wi]
        return floor
    except (KeyError, ZeroDivisionError):
        return floor

### Find conditional probabilities for the class model and the absolute discounting model

In [None]:
#Conditional probabilities of the test-set bigrams under the class-based model
def find_cond_prob_class(d, words, words_pos, vocab, pos_uni, pos_bi, pos_classes, word_pos_dict, bi_test):
    """Score every bigram in ``bi_test`` with the class-based model.

    The word and tag parameters are computed once from the supplied counts
    and then reused for every bigram.

    Args:
        d: absolute discount.
        words: mapping word -> count.
        words_pos: mapping (word, tag) -> count.
        vocab: collection of words.
        pos_uni: mapping tag -> count.
        pos_bi: mapping (tag, next_tag) -> count.
        pos_classes: collection of tags.
        word_pos_dict: mapping word -> tag.
        bi_test: mapping whose keys are (previous_word, word) pairs.

    Returns:
        dict mapping each key of ``bi_test`` to its estimated probability.
    """
    P_unif, lambdadot_w, P_abs_wi = get_params_words(d, words, vocab)
    pos_params = get_params_pos(d, pos_uni, pos_bi, pos_classes)
    Pos_unif, lambdadot_p, _, lambda_pos, P_abs_pos = pos_params

    # pair[0] is the history word and pair[1] the predicted word.
    return {
        pair: get_P_abs_wi1_class(pair[1], pair[0], d, words, words_pos, pos_uni, pos_bi,
                                  lambda_pos, P_abs_pos, lambdadot_w, lambdadot_p,
                                  word_pos_dict, Pos_unif, P_unif, pos_classes, P_abs_wi)
        for pair in bi_test.keys()
    }

#Conditional probabilities of the test-set (Word,Word) bigrams under absolute discounting
def find_cond_prob_absdisc(d, bi_dict, uni_dict, vocab, bi_test):
    """Score every bigram in ``bi_test`` with the absolute-discounting model.

    Args:
        d: absolute discount.
        bi_dict: mapping (previous_word, word) -> count.
        uni_dict: mapping word -> count.
        vocab: collection of words.
        bi_test: mapping whose keys are (previous_word, word) pairs.

    Returns:
        dict mapping each key of ``bi_test`` to its estimated probability.
    """
    params = get_params_absdisc(d, bi_dict, uni_dict, vocab)
    P_unif, lambdadot, _, N_bigrams, lambda_wi, P_abs_wi = params

    # pair[0] is the history word and pair[1] the predicted word.
    return {
        pair: get_P_abs_wi1_absdisc(pair[1], pair[0], d, uni_dict, bi_dict, N_bigrams,
                                    lambda_wi, P_abs_wi, lambdadot, P_unif)
        for pair in bi_test.keys()
    }

### Find perplexity for the test corpus

In [None]:
#Perplexity of a set of bigrams under a conditional probability table
def find_perplexity(bgrams, cond_prob):
    """Compute perplexity as exp of the frequency-weighted negative log probability.

    Args:
        bgrams: mapping bigram -> count.
        cond_prob: mapping bigram -> probability; must contain every key of
            ``bgrams``.

    Returns:
        ``exp(-sum(count/total * log(cond_prob[bigram])))``; 1.0 when
        ``bgrams`` is empty.
    """
    total_count = sum(bgrams.values())
    neg_log_likelihood = 0
    for pair, count in bgrams.items():
        weight = count / total_count
        neg_log_likelihood = neg_log_likelihood - weight * math.log(cond_prob[pair])
    return math.exp(neg_log_likelihood)

### Class-based model processing

In [None]:
# Tags whose tokens are left out of the corpus
POS_classes = ["CD","LS", "SENT", "SYM", "#", "$", "“","\"", "''", "``", "“", "-LRB-", "-RRB-", ",", ":", ".","-NONE-", "”"]

# Lower-cased (word, tag) tuples for every remaining token, in corpus order
pp_word_tags = []
for word, tag in ptb.tagged_words():
    if tag in POS_classes:
        continue
    word = word.lower()
    pp_word_tags.append((word, tag))

# A word that occurs with several tags keeps only the last tag encountered
word_pos_dict = dict(pp_word_tags) #word:pos dict
word_pos_c = Counter(pp_word_tags) #(word,pos):count dict

In [None]:
#Note: the shuffle is seeded, so the split and the perplexities below are reproducible
# NOTE(review): shuffling individual (word, tag) tokens breaks up the original
# word order, so the bigrams counted from train_set/test_set are pairs of
# randomly adjacent tokens -- confirm this is intended.
SPLIT_SEED = 42

# Shuffle a copy so that re-running this cell always starts from the same order.
shuffled_word_tags = list(pp_word_tags)
np.random.RandomState(SPLIT_SEED).shuffle(shuffled_word_tags)
l = int(4*len(shuffled_word_tags)/5) #80/20 train/test split point

train_set = shuffled_word_tags[:l]
test_set = shuffled_word_tags[l:]

In [None]:
# Parallel word / tag sequences of the training tuples
words = [token for token, _ in train_set] #only words
pos = [tag for _, tag in train_set] #only tags

# Count tables derived from the training sequences
uni_dict = Counter(words) #word:count dict
pos_uni = Counter(pos) #pos:count dict
pos_bi = Counter(zip(pos, pos[1:])) #(pos,pos):count dict

vocab = list(uni_dict) #all unique words
pos_classes = list(pos_uni) #all unique pos tags

bi_test = find_bigrams(test_set) #(word,word) bigrams in test_set

In [None]:
#Conditional probability of every test bigram under the class-based model (discount d = 0.9)
cp = find_cond_prob_class(
    d=0.9,
    words=uni_dict,
    words_pos=word_pos_c,
    vocab=vocab,
    pos_uni=pos_uni,
    pos_bi=pos_bi,
    pos_classes=pos_classes,
    word_pos_dict=word_pos_dict,
    bi_test=bi_test,
)

In [None]:
# Report the class-based model's perplexity on the test bigrams (label and value on separate lines)
print("Perplexity of Class based model is", find_perplexity(bi_test, cp), sep="\n")

Perplexity of Class based model is
1010.6576210497838


### Absolute discounting model processing

In [None]:
# Word-level count tables for the absolute-discounting bigram model
train_words = words
uni_words = Counter(train_words) #word:count dict
vocab_words = list(uni_words) #all unique training words
bi_words = find_bigrams(train_set) #(word,word):count dict from the training tuples

In [None]:
#Conditional probability of every test bigram under absolute discounting (discount d = 0.9)
cp2 = find_cond_prob_absdisc(
    d=0.9,
    bi_dict=bi_words,
    uni_dict=uni_words,
    vocab=vocab_words,
    bi_test=bi_test,
)

In [None]:
# Report the absolute-discounting model's perplexity on the test bigrams (label and value on separate lines)
print("Perplexity of Absolute Discounting based model is", find_perplexity(bi_test, cp2), sep="\n")

Perplexity of Absolute Discounting based model is
1333.759847490673
