## POS tagging using modified Viterbi

#### Flowchart of the solution -

<ul>
    <li>EDA to understand training corpus.</li>
    <li>Plain vanilla model building.</li>
    <li>Test plain vanilla model on test set and understand the problem.</li>
    <li>Refining viterbi model using other pos tagging technique</li>
</ul>

### Data Preparation

In [324]:
#Importing libraries
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [325]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [326]:
# Splitting into train and test
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))


3718
196


In [327]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95891

In [328]:
#no of tagged words available in dataset - 95790

In [329]:
#Let's see how many unique tags we have in our dataset
tags = [tup[1]  for sen in nltk_data for tup in sen]
print(len(set(tags)))
print(set(tags))


12
{'PRT', '.', 'ADV', 'NOUN', 'X', 'VERB', 'CONJ', 'ADP', 'NUM', 'ADJ', 'PRON', 'DET'}


In [330]:
#we can see that universal dataset has only 12 tags
#Let's see how many unique word dataset has
voc = [tup[0]  for sen in nltk_data for tup in sen]
print(len(voc))
#total no. words present in dataset (including duplicate) 

100676


In [331]:
tags = set(tags)
voc = set(voc)

In [332]:
#print all the available tags
print(tags)

{'PRT', '.', 'ADV', 'NOUN', 'X', 'VERB', 'CONJ', 'ADP', 'NUM', 'ADJ', 'PRON', 'DET'}


In [333]:
#this method we'll use to understand incorrect tags
def plot_cnt_words(word):
    l_words = []
    for tag in tags:
        c = 0
        for w,t in train_tagged_words:
            if (w==word)&(t==tag):
                    c += 1
        if c > 0:
            l_words.append((tag,c))
    print(l_words)
        

In [334]:
#demo
plot_cnt_words("He")

[('PRON', 70)]


### Build the vanilla Viterbi based POS tagger
Let's build HMM viterbi model.

In [335]:
#first step is to find emission and transition probablities
t = len(tags)
v = len(voc)
w_given_t = np.zeros((t, v))

In [336]:
w_given_t.shape

(12, 12408)

In [337]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [338]:
# let's check w
print(word_given_tag('do', 'VERB'))
print(word_given_tag('does', 'VERB'))
print(word_given_tag('flight', 'NOUN'), "\n")
#flight word is not present in the dictionary

(80, 12908)
(53, 12908)
(0, 27559) 



In [339]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [340]:
# examples
print(t2_given_t1(t2='NOUN', t1='ADV'))

(92, 3014)


In [341]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [342]:
tags_matrix

array([[1.96206663e-03, 4.12034020e-02, 9.48332250e-03, 2.50490516e-01,
        1.30804451e-02, 4.01569664e-01, 1.96206663e-03, 2.02746894e-02,
        5.72269447e-02, 8.60039219e-02, 1.83126219e-02, 9.84303430e-02],
       [2.24658521e-03, 9.25593078e-02, 5.23005016e-02, 2.23490298e-01,
        2.72286125e-02, 8.88749138e-02, 5.73328547e-02, 9.20201316e-02,
        8.14162493e-02, 4.35837545e-02, 6.50611073e-02, 1.73795834e-01],
       [1.26078306e-02, 1.35368288e-01, 8.02919716e-02, 3.05242203e-02,
        2.38885209e-02, 3.45719963e-01, 7.29927002e-03, 1.19442604e-01,
        3.21831442e-02, 1.27737224e-01, 1.49303256e-02, 7.00066388e-02],
       [4.41235155e-02, 2.39631340e-01, 1.71268918e-02, 2.64704823e-01,
        2.92826295e-02, 1.46376863e-01, 4.27083708e-02, 1.76530346e-01,
        9.28916130e-03, 1.20468810e-02, 4.78972401e-03, 1.33894552e-02],
       [1.84235513e-01, 1.63342834e-01, 2.56410260e-02, 6.17283955e-02,
        7.45489076e-02, 2.04653367e-01, 1.06046218e-02, 1.44

In [343]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))

In [344]:
#this dataframe is usefull to calculate tag probablities
tags_df

Unnamed: 0,PRT,.,ADV,NOUN,X,VERB,CONJ,ADP,NUM,ADJ,PRON,DET
PRT,0.001962,0.041203,0.009483,0.250491,0.01308,0.40157,0.001962,0.020275,0.057227,0.086004,0.018313,0.09843
.,0.002247,0.092559,0.052301,0.22349,0.027229,0.088875,0.057333,0.09202,0.081416,0.043584,0.065061,0.173796
ADV,0.012608,0.135368,0.080292,0.030524,0.023889,0.34572,0.007299,0.119443,0.032183,0.127737,0.01493,0.070007
NOUN,0.044124,0.239631,0.017127,0.264705,0.029283,0.146377,0.042708,0.17653,0.009289,0.012047,0.00479,0.013389
X,0.184236,0.163343,0.025641,0.061728,0.074549,0.204653,0.010605,0.144666,0.002849,0.016619,0.056505,0.054606
VERB,0.030756,0.034785,0.081965,0.111559,0.217385,0.16912,0.005113,0.090641,0.023706,0.064921,0.035637,0.134413
CONJ,0.005126,0.034483,0.055918,0.349953,0.008854,0.157036,0.000466,0.051258,0.042404,0.117894,0.056384,0.120224
ADP,0.001385,0.040264,0.012889,0.321581,0.035364,0.007989,0.000746,0.01683,0.062633,0.106199,0.069344,0.324776
NUM,0.027409,0.115237,0.002947,0.355438,0.211023,0.018568,0.013263,0.034188,0.185087,0.03242,0.001474,0.002947
ADJ,0.010394,0.063521,0.004455,0.701369,0.021284,0.012044,0.016664,0.077545,0.020459,0.066491,0.00066,0.005115


## Viterbi Algorithm - Simple

In [345]:
len(train_tagged_words)

95891

In [346]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

## 4. Evaluating on Test Set

In [347]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
test_run

[[('Mr.', 'NOUN'),
  ('Yamamoto', 'NOUN'),
  ('insisted', 'VERB'),
  ('that', 'ADP'),
  ('headquarters', 'NOUN'),
  ('had', 'VERB'),
  ("n't", 'ADV'),
  ('approved', 'VERB'),
  ('the', 'DET'),
  ('bids', 'NOUN'),
  (',', '.'),
  ('and', 'CONJ'),
  ('that', 'ADP'),
  ('he', 'PRON'),
  ('did', 'VERB'),
  ("n't", 'ADV'),
  ('know', 'VERB'),
  ('about', 'ADP'),
  ('most', 'ADJ'),
  ('of', 'ADP'),
  ('the', 'DET'),
  ('cases', 'NOUN'),
  ('until', 'ADP'),
  ('Wednesday', 'NOUN'),
  ('.', '.')],
 [('Mr.', 'NOUN'),
  ('Bernstein', 'NOUN'),
  (',', '.'),
  ('a', 'DET'),
  ('tall', 'ADJ'),
  (',', '.'),
  ('energetic', 'ADJ'),
  ('man', 'NOUN'),
  ('who', 'PRON'),
  ('*T*-39', 'X'),
  ('is', 'VERB'),
  ('widely', 'ADV'),
  ('respected', 'VERB'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('publishing', 'NOUN'),
  ('executive', 'NOUN'),
  (',', '.'),
  ('has', 'VERB'),
  ('spent', 'VERB'),
  ('much', 'ADJ'),
  ('of', 'ADP'),
  ('his', 'PRON'),
  ('time', 'NOUN'),
  ('in', 'ADP'),
  ('recent', 'ADJ'),
  

In [348]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [349]:
print("Time taken in seconds: ", difference)
print(tagged_seq)
#print(test_run_base)

Time taken in seconds:  28.722251176834106
[('Mr.', 'NOUN'), ('Yamamoto', 'NOUN'), ('insisted', 'VERB'), ('that', 'ADP'), ('headquarters', 'NOUN'), ('had', 'VERB'), ("n't", 'ADV'), ('approved', 'VERB'), ('the', 'DET'), ('bids', 'NOUN'), (',', '.'), ('and', 'CONJ'), ('that', 'DET'), ('he', 'PRON'), ('did', 'VERB'), ("n't", 'ADV'), ('know', 'VERB'), ('about', 'ADP'), ('most', 'ADJ'), ('of', 'ADP'), ('the', 'DET'), ('cases', 'NOUN'), ('until', 'ADP'), ('Wednesday', 'NOUN'), ('.', '.'), ('Mr.', 'NOUN'), ('Bernstein', 'NOUN'), (',', '.'), ('a', 'DET'), ('tall', 'PRT'), (',', '.'), ('energetic', 'ADJ'), ('man', 'NOUN'), ('who', 'PRON'), ('*T*-39', 'X'), ('is', 'VERB'), ('widely', 'ADV'), ('respected', 'VERB'), ('as', 'ADP'), ('a', 'DET'), ('publishing', 'NOUN'), ('executive', 'NOUN'), (',', '.'), ('has', 'VERB'), ('spent', 'VERB'), ('much', 'ADV'), ('of', 'ADP'), ('his', 'PRON'), ('time', 'NOUN'), ('in', 'ADP'), ('recent', 'ADJ'), ('years', 'NOUN'), ('on', 'ADP'), ('human', 'ADJ'), ('rights'

In [350]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 

In [351]:
accuracy = len(check)/len(tagged_seq)

In [352]:
accuracy
#92#94.62

0.9496402877697842

In [353]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]

In [354]:
incorrect_tagged_cases

[[('and', 'CONJ'), (('that', 'DET'), ('that', 'ADP'))],
 [('a', 'DET'), (('tall', 'PRT'), ('tall', 'ADJ'))],
 [('spent', 'VERB'), (('much', 'ADV'), ('much', 'ADJ'))],
 [('form', 'NOUN'), (('forces', 'NOUN'), ('forces', 'VERB'))],
 [('a', 'DET'), (('witness', 'PRT'), ('witness', 'NOUN'))],
 [('of', 'ADP'), (('Criminal', 'ADJ'), ('Criminal', 'NOUN'))],
 [('``', '.'), (('Strategic', 'PRT'), ('Strategic', 'ADJ'))]]

In [355]:
#Now let's test this model on test sentences which contains words which are not present in the training dataset.

In [356]:
## Testing

def test_vertibi_simple(sentence):
    words = word_tokenize(sentence)
    tagged_seq = Viterbi(words)
    return tagged_seq

In [357]:
test_vertibi_simple("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'PRT'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'PRT'),
 ('worldwide', 'PRT'),
 ('on', 'ADP'),
 ('smartphones', 'PRT'),
 ('since', 'ADP'),
 ('2011', 'PRT'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'PRT'),
 ('.', '.')]

In [358]:
#Android is noun but incorrectly tagged as PRT, as emission prob for Android will be 0 and hence it will assign tag which has 
#o value. Let's see two more exaples.

In [359]:
test_vertibi_simple("Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.")

[('Google', 'PRT'),
 ('and', 'CONJ'),
 ('Twitter', 'PRT'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'PRT'),
 ('that', 'DET'),
 ('gave', 'VERB'),
 ('Google', 'PRT'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'PRT'),
 ("'s", 'VERB'),
 ('firehose', 'PRT'),
 ('.', '.')]

In [360]:
#Same problem for google and twitter

In [361]:
test_vertibi_simple("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'PRT'),
 ('invited', 'PRT'),
 ('social', 'ADJ'),
 ('media', 'NOUN'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'PRT'),
 ('Satellite', 'PRT'),
 ('.', '.')]

In [362]:
#same

In [363]:
#So we need to use other techni#sameques to rectify this issue

### Solve the problem of unknown words

In [364]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        if max(p) == 0 :
            state_max = T[3]
        if word.isnumeric() & (word != '0'):
            state_max = T[10]
        state.append(state_max)
    return list(zip(words, state))

#### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications