In [None]:
import nltk
import numpy as np
import pandas as pd

# Fetch the NLTK resources used below, in the same order as before.
for resource in ('popular', 'brown', 'universal_tagset'):
    nltk.download(resource)

from nltk.corpus import brown

# Tagset name handed to the Brown corpus readers.
tset = 'universal'

# Flat (word, tag) stream, used only to enumerate the distinct tags.
brown_tagged = brown.tagged_words(tagset=tset)
unique_tags = np.unique([pos for (_, pos) in brown_tagged])

# Sentence-level view: each sentence is a sequence of (word, tag) pairs.
brown_tagged_sents = brown.tagged_sents(tagset=tset)
sents = len(brown_tagged_sents)

train_ratio = 0.8

# Selects which held-out slice is used, counted in units of test_count
# back from the end of the corpus.
division = 4

train_count = int(sents * train_ratio)
test_count = int(sents * (1 - train_ratio))

# Negative offsets delimiting the held-out slice. The stop offset carries an
# extra -1, so when the slice lies fully inside the corpus it holds
# test_count - 1 sentences and the sentence at the boundary stays in training.
fold_start = -division * test_count
fold_stop = -(division - 1) * test_count - 1

testing_sents = brown_tagged_sents[fold_start:fold_stop]

# Training data is everything before the held-out slice plus everything after it.
training_sents = list(brown_tagged_sents[:fold_start]) + list(brown_tagged_sents[fold_stop:])

len(training_sents), len(testing_sents)

#### Estimation of transition probabilities

# tpm_dict[a][b]: number of times tag b immediately follows tag a inside a
# training sentence. Every ordered tag pair starts at zero.
tpm_dict = {
    tag: {suc_tag: 0 for suc_tag in unique_tags}
    for tag in unique_tags
}

# start_dict[a]: number of training sentences whose first token is tagged a.
start_dict = {tag: 0 for tag in unique_tags}

# Count tag bigrams by pairing each token with its successor.
for sent in training_sents:
    for current, following in zip(sent[:-1], sent[1:]):
        tpm_dict[current[1]][following[1]] += 1

# Count sentence-initial tags.
for sent in training_sents:
    first_tag = sent[0][1]
    start_dict[first_tag] += 1

def log_start_prob(S1):
    """Return the natural log of the share of start counts held by tag S1.

    The share is start_dict[S1] divided by the sum of all values in
    start_dict. A zero count is passed straight to np.log.
    """
    starts_with_tag = start_dict[S1]
    all_starts = np.sum(list(start_dict.values()))
    return np.log(starts_with_tag / all_starts)

def get_tpm_prob(S1,S2):
    """Return the relative frequency of tag S2 among the successors of tag S1.

    Computed as tpm_dict[S1][S2] divided by the sum of tpm_dict[S1]'s values.
    """
    outgoing = tpm_dict[S1]
    pair_count = outgoing[S2]
    return pair_count / np.sum(list(outgoing.values()))

def log_tpm_prob(S1,S2):
    """Return the natural log of the S1 -> S2 transition relative frequency.

    The frequency is tpm_dict[S1][S2] divided by the sum of tpm_dict[S1]'s
    values. A zero count is passed straight to np.log.
    """
    outgoing = tpm_dict[S1]
    pair_count = outgoing[S2]
    transition_share = pair_count / np.sum(list(outgoing.values()))
    return np.log(transition_share)

#### Estimation of lexical probabilities

# lex_dict[tag][word]: number of times `word` carries `tag` in the training
# sentences. Every tag gets an (initially empty) inner dict.
lex_dict = {tag: {} for tag in unique_tags}

for sent in training_sents:
    for (token, pos) in sent:
        # dict.get supplies the zero for a first occurrence. The previous
        # bare `except:` did the same job but also swallowed unrelated
        # exceptions (e.g. KeyboardInterrupt) and reset the count to 1.
        tag_counts = lex_dict[pos]
        tag_counts[token] = tag_counts.get(token, 0) + 1

def get_lex_prob(S1,S2):
    """Return the relative frequency of word S2 among the words tagged S1.

    Computed as lex_dict[S1][S2] divided by the sum of lex_dict[S1]'s values.

    Returns 1 when the tag or the word has no entry in lex_dict. That
    fallback value is kept unchanged from the earlier implementation.
    """
    try:
        tag_counts = lex_dict[S1]
        word_count = tag_counts[S2]
    except KeyError:
        # Only a missing tag/word is treated as "unseen"; any other error
        # now surfaces instead of being silently converted to 1.
        return 1
    return word_count/np.sum(list(tag_counts.values()))

def log_lex_prob(S1,S2,unseen_log_prob=-500):
    """Return the natural log of the relative frequency of word S2 under tag S1.

    The frequency is lex_dict[S1][S2] divided by the sum of lex_dict[S1]'s
    values.

    Parameters:
        S1: tag whose word counts are consulted; a tag missing from lex_dict
            raises KeyError.
        S2: word to look up.
        unseen_log_prob: value returned when S2 has no count under S1.
            Defaults to -500, the penalty that was previously hard-coded.
    """
    tag_counts = lex_dict[S1]
    w_s1 = tag_counts.get(S2)
    if w_s1 is None:
        # Unseen word: return the penalty without summing the tag's counts.
        return unseen_log_prob
    total_words = np.sum(list(tag_counts.values()))
    return np.log(w_s1/total_words)

#### Viterbi algorithm

def _viterbi_decode(words):
    """Return the highest-scoring tag sequence for `words`.

    A path's score is the sum of log_start_prob for its first tag,
    log_tpm_prob for each consecutive tag pair and log_lex_prob for each
    (tag, word) pair. Returns a list with one tag per word, or an empty
    list for an empty sentence.
    """
    if len(words) == 0:
        return []

    # best_score[tag]: best score of any path that ends in `tag` and has
    # emitted every word consumed so far; best_path[tag] is that path.
    best_score = {}
    best_path = {}
    for tag in unique_tags:
        best_score[tag] = log_start_prob(tag) + log_lex_prob(tag, words[0])
        best_path[tag] = [tag]

    for word in words[1:]:
        prev_score, prev_path = best_score, best_path
        best_score, best_path = {}, {}
        for tag in unique_tags:
            p_max = -np.inf
            # Fallback predecessor when every candidate scores -inf.
            best_prev = unique_tags[0]
            for prev_tag in unique_tags:
                p_temp = prev_score[prev_tag] + log_tpm_prob(prev_tag, tag)
                if p_temp > p_max:
                    p_max = p_temp
                    best_prev = prev_tag
            # The emission term depends only on (tag, word), so it is added
            # once per tag rather than once per (tag, prev_tag) pair.
            best_score[tag] = p_max + log_lex_prob(tag, word)
            best_path[tag] = prev_path[best_prev] + [tag]

    # Choose the final tag directly from the scores of complete paths.
    final_tag = max(best_score, key=best_score.get)
    return best_path[final_tag]


accuracy = pd.DataFrame(columns=['pred','actual'])

# Checkpoint file; the name is reused by the download cell.
results_path = 'accuracy_final_new_'+str(division)+'.pkl'

# One small frame per sentence, concatenated on demand. DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call.
sentence_frames = []

s = 0
for sent in testing_sents:
    s += 1
    words = [a for (a,b) in sent]
    actual_tags = [b for (a,b) in sent]

    sentence_frames.append(pd.DataFrame({
        'pred': _viterbi_decode(words),
        'actual': actual_tags,
        'word': words,
    }))

    # Periodic checkpoint so a long run can be inspected or resumed.
    if(s%200==0):
        accuracy = pd.concat(sentence_frames, ignore_index=True)
        accuracy.to_pickle(results_path)

# Final save: without it, sentences after the last multiple of 200 would be
# missing from the pickle.
if sentence_frames:
    accuracy = pd.concat(sentence_frames, ignore_index=True)
accuracy.to_pickle(results_path)



[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [None]:
from google.colab import files

# Send the results pickle for the current `division` to the browser.
results_pickle = 'accuracy_final_new_' + str(division) + '.pkl'
files.download(results_pickle)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
1

1

In [None]:
1

1