In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
# Fetch the NLTK resources used by this notebook.
# NOTE(review): the cells visible here only use the Brown corpus with the
# universal tagset, so the large 'popular' collection looks unnecessary —
# confirm before removing it.
nltk.download('popular')
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/tushar/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/tushar/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/tushar/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/tushar/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/tushar/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/tushar/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to

True

In [3]:
from nltk.corpus import brown

In [4]:
# Tagset name handed to the Brown corpus readers via their `tagset=` argument.
tset = 'universal'

In [5]:
# Flat sequence of (word, tag) pairs for the whole Brown corpus, in the chosen tagset.
brown_tagged = brown.tagged_words(tagset=tset)

In [6]:
# Sorted array of the distinct tags that occur anywhere in the corpus.
unique_tags = np.unique([pair[1] for pair in brown_tagged])

In [7]:
# Same corpus grouped by sentence: each sentence is a sequence of (word, tag) pairs.
brown_tagged_sents = brown.tagged_sents(tagset=tset)

In [8]:
# Fraction of sentences (taken from the start of the corpus) used for training;
# the remainder is used for testing.
train_ratio = 0.8

In [9]:
# Total number of tagged sentences; used to compute the train/test split index.
sents = len(brown_tagged_sents)

In [10]:
# Training split: the leading `train_ratio` share of the sentences.
training_sents = brown_tagged_sents[:int(train_ratio * sents)]

In [11]:
# Test split: every sentence after the training cut-off index.
testing_sents = brown_tagged_sents[int(train_ratio * sents):]

#### Estimation of transition probabilities

In [12]:
# Transition counts, indexed as tpm_dict[tag][successor_tag].
tpm_dict = {}

In [13]:
# Count of sentences that begin with each tag, indexed as start_dict[tag].
start_dict = {}

In [14]:
# Seed both tables with every tag: an empty successor row in the transition
# table and a zero sentence-start count.
tpm_dict.update({tag: {} for tag in unique_tags})
start_dict.update(dict.fromkeys(unique_tags, 0))

In [15]:
# Give every (tag, successor) pair an explicit zero count so that later
# increments and lookups never hit a missing key.
for tag in unique_tags:
    tpm_dict[tag].update(dict.fromkeys(unique_tags, 0))

In [16]:
# Count tag bigrams inside each training sentence; nothing is counted across
# sentence boundaries.
for sent in training_sents:
    sent_tags = [tag for (_, tag) in sent]
    for cur_tag, next_tag in zip(sent_tags, sent_tags[1:]):
        tpm_dict[cur_tag][next_tag] += 1

In [17]:
# Count how often each tag opens a training sentence.
for sent in training_sents:
    _, first_tag = sent[0]
    start_dict[first_tag] += 1

In [18]:
# Display the sentence-initial tag counts gathered from the training split.
start_dict

{'.': 2931,
 'ADJ': 1800,
 'ADP': 6261,
 'ADV': 4216,
 'CONJ': 2231,
 'DET': 10665,
 'NOUN': 6800,
 'NUM': 869,
 'PRON': 6385,
 'PRT': 1555,
 'VERB': 2139,
 'X': 20}

In [19]:
def log_start_prob(S1, counts=None):
    """Return the log-probability that a sentence starts with tag ``S1``.

    The estimate is the maximum-likelihood ratio
    ``counts[S1] / sum(counts.values())``.

    Parameters
    ----------
    S1 : hashable
        Tag whose start probability is requested.
    counts : dict, optional
        Mapping ``tag -> number of sentences starting with that tag``.
        Defaults to the notebook-level ``start_dict``.

    Returns
    -------
    float
        ``log(P(S1 | sentence start))``; ``-inf`` when the tag never starts a
        sentence (or when the table is empty).

    Raises
    ------
    KeyError
        If ``S1`` is not a key of the count table.
    """
    if counts is None:
        counts = start_dict
    tag_count = counts[S1]
    total = sum(counts.values())
    # Return -inf explicitly instead of evaluating log(0), which yields the
    # same value but emits a NumPy RuntimeWarning.
    if tag_count == 0 or total == 0:
        return -np.inf
    return np.log(tag_count / total)

In [20]:
def get_tpm_prob(S1, S2, counts=None):
    """Return the transition probability ``P(S2 | S1)`` between two tags.

    The estimate is the count of ``S1 -> S2`` divided by the total count of
    all transitions leaving ``S1``.

    Parameters
    ----------
    S1 : hashable
        Preceding tag.
    S2 : hashable
        Succeeding tag.
    counts : dict of dict, optional
        Nested mapping ``counts[prev_tag][next_tag] -> count``.
        Defaults to the notebook-level ``tpm_dict``.

    Returns
    -------
    float
        The relative frequency; ``0.0`` when ``S1`` has no outgoing
        transitions at all (previously a 0/0 division producing ``nan``).

    Raises
    ------
    KeyError
        If ``S1`` or ``S2`` is missing from the table.
    """
    if counts is None:
        counts = tpm_dict
    row = counts[S1]
    pair_count = row[S2]
    row_total = np.sum(list(row.values()))
    # A tag that was never followed by anything has an all-zero row; report
    # probability 0 instead of dividing by zero.
    if row_total == 0:
        return np.float64(0.0)
    return pair_count / row_total

In [21]:
def log_tpm_prob(S1, S2, counts=None):
    """Return the log transition probability ``log P(S2 | S1)``.

    Parameters
    ----------
    S1 : hashable
        Preceding tag.
    S2 : hashable
        Succeeding tag.
    counts : dict of dict, optional
        Nested mapping ``counts[prev_tag][next_tag] -> count``.
        Defaults to the notebook-level ``tpm_dict``.

    Returns
    -------
    float
        Log of the relative frequency of ``S1 -> S2`` among all transitions
        leaving ``S1``; ``-inf`` when the transition was never observed or
        when ``S1`` has no outgoing transitions.

    Raises
    ------
    KeyError
        If ``S1`` or ``S2`` is missing from the table.
    """
    if counts is None:
        counts = tpm_dict
    row = counts[S1]
    pair_count = row[S2]
    row_total = np.sum(list(row.values()))
    # Unseen transitions have probability 0. Return -inf directly rather than
    # calling log(0) (RuntimeWarning) or log(0/0) (nan).
    if pair_count == 0 or row_total == 0:
        return -np.inf
    return np.log(pair_count / row_total)

#### Estimation of lexical probabilities

In [22]:
# Lexical (emission) counts, indexed as lex_dict[tag][word].
lex_dict = {}

In [23]:
# Start every tag with an empty word-count table.
lex_dict.update({tag: {} for tag in unique_tags})

In [24]:
# Count how often each word is emitted by each tag in the training split.
# `dict.get` supplies the implicit zero for first occurrences. This replaces a
# bare `except:` that also swallowed unrelated errors (including
# KeyboardInterrupt) before retrying the assignment.
for sent in training_sents:
    for token, tag in sent:
        tag_words = lex_dict[tag]
        tag_words[token] = tag_words.get(token, 0) + 1

In [25]:
def get_lex_prob(S1, S2, counts=None):
    """Return the lexical (emission) probability ``P(word S2 | tag S1)``.

    Parameters
    ----------
    S1 : hashable
        Tag.
    S2 : hashable
        Word.
    counts : dict of dict, optional
        Nested mapping ``counts[tag][word] -> count``.
        Defaults to the notebook-level ``lex_dict``.

    Returns
    -------
    float or int
        Count of ``S2`` under ``S1`` divided by the total word count of
        ``S1``. When the tag or the word is absent from the table the
        function returns ``1`` (the existing fallback, kept for compatibility).
    """
    if counts is None:
        counts = lex_dict
    try:
        tag_words = counts[S1]
        word_count = tag_words[S2]
    except KeyError:
        # Only a missing tag/word triggers the fallback; any other error is a
        # real bug and is allowed to propagate.
        return 1
    return word_count / np.sum(list(tag_words.values()))

In [26]:
def log_lex_prob(S1, S2, counts=None, unseen_log_prob=-500):
    """Return the log lexical (emission) probability ``log P(word S2 | tag S1)``.

    Parameters
    ----------
    S1 : hashable
        Tag.
    S2 : hashable
        Word.
    counts : dict of dict, optional
        Nested mapping ``counts[tag][word] -> count``.
        Defaults to the notebook-level ``lex_dict``.
    unseen_log_prob : float, optional
        Penalty returned when ``S2`` was never observed with tag ``S1``.
        Defaults to ``-500``, the value previously hard-coded.

    Returns
    -------
    float
        Log of the word's relative frequency under the tag, or
        ``unseen_log_prob`` for an unseen (tag, word) pair.

    Raises
    ------
    KeyError
        If the tag ``S1`` itself is missing from the table.
    """
    if counts is None:
        counts = lex_dict
    tag_words = counts[S1]
    try:
        word_count = tag_words[S2]
    except KeyError:
        # Unseen word: skip the vocabulary-wide sum altogether.
        return unseen_log_prob
    total_words = np.sum(list(tag_words.values()))
    return np.log(word_count / total_words)

#### Viterbi algorithm

In [27]:
# Results table for the tagger evaluation: predicted vs. gold tag per token.
accuracy = pd.DataFrame(columns=['pred','actual'])

In [29]:
def _viterbi_tags(words, tags, log_start, log_trans):
    """Return the most probable tag sequence for ``words`` under the bigram HMM.

    Parameters
    ----------
    words : list
        Tokens of one sentence.
    tags : sequence
        All candidate tags; ties are broken in favour of the earliest tag.
    log_start : dict
        ``log_start[tag]`` = log-probability that a sentence starts with ``tag``.
    log_trans : dict of dict
        ``log_trans[prev][tag]`` = log transition probability ``prev -> tag``.

    Returns
    -------
    list
        One tag per word (empty for an empty sentence).
    """
    if not words:
        return []

    # best_score[t]: log-probability of the best path over the words consumed
    # so far that ends in tag t; best_path[t] is that path.
    best_score = {t: log_start[t] + log_lex_prob(t, words[0]) for t in tags}
    best_path = {t: [t] for t in tags}

    for word in words[1:]:
        prev_score, prev_path = best_score, best_path
        best_score, best_path = {}, {}
        for t in tags:
            # Strict '>' keeps the first maximiser in `tags` order; the
            # fallback is the first tag rather than a hard-coded 'X'.
            top_prev, top = tags[0], -np.inf
            for p in tags:
                cand = prev_score[p] + log_trans[p][t]
                if cand > top:
                    top_prev, top = p, cand
            # The emission term depends only on (t, word), so it is added once
            # per tag instead of once per (prev, tag) pair.
            best_score[t] = top + log_lex_prob(t, word)
            best_path[t] = prev_path[top_prev] + [t]

    # Pick the final tag directly from the scores of complete paths. The
    # earlier implementation applied one extra transition and then subtracted
    # it again, which could discard the true best path.
    final_tag = max(tags, key=lambda t: best_score[t])
    return best_path[final_tag]


# Start and transition log-probabilities do not depend on the sentence:
# compute them once instead of inside the innermost decoding loop.
log_start = {t: log_start_prob(t) for t in unique_tags}
log_trans = {p: {t: log_tpm_prob(p, t) for t in unique_tags} for p in unique_tags}

# One (pred, actual, word) row per test token. Rows are gathered in a list and
# turned into a DataFrame in one go; DataFrame.append was removed in pandas 2.0
# and was quadratic when called once per sentence.
rows = []
s = 0
for s, sent in enumerate(testing_sents, start=1):
    words = [w for (w, _) in sent]
    gold = [g for (_, g) in sent]
    rows.extend(zip(_viterbi_tags(words, unique_tags, log_start, log_trans), gold, words))

    # Periodic checkpoint so that a long run can be inspected or resumed.
    if s % 20 == 0:
        accuracy = pd.DataFrame(rows, columns=['pred', 'actual', 'word'])
        accuracy.to_pickle('accuracy_final_new.pkl')

# Final table; also persist it so the pickle includes the last partial batch.
accuracy = pd.DataFrame(rows, columns=['pred', 'actual', 'word'])
accuracy.to_pickle('accuracy_final_new.pkl')

  


In [None]:
# Debug dump: for every (previous tag, tag) pair print the pair, then its log
# transition probability on the next line.
tag_pairs = [(prev_tag, tag) for tag in unique_tags for prev_tag in unique_tags]
for prev_tag, tag in tag_pairs:
    print(prev_tag, tag)
    print(log_tpm_prob(prev_tag, tag))