## POS tagging using modified Viterbi

#### Flowchart of the solution -

<ul>
    <li>EDA to understand training corpus.</li>
    <li>Plain vanilla model building.</li>
    <li>Test plain vanilla model on test set and understand the problem.</li>
    <li>Refining viterbi model using other pos tagging technique</li>
</ul>

### Data Preparation

In [1]:
import nltk
nltk.download('treebank')
import nltk
nltk.download('universal_tagset')
import nltk
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
#Importing libraries
#Importing libraries
import re
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [3]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
# Splitting into train and test
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))


3718
196


In [5]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95504

In [6]:
#no of tagged words available in dataset - 95790

In [7]:
#Let's see how many unique tags we have in our dataset
tags = [tup[1]  for sen in nltk_data for tup in sen]
print(len(set(tags)))
print(set(tags))


12
{'ADJ', '.', 'NUM', 'VERB', 'X', 'ADV', 'PRON', 'DET', 'PRT', 'ADP', 'CONJ', 'NOUN'}


In [8]:
#we can see that universal dataset has only 12 tags
#Let's see how many unique word dataset has
voc = [tup[0]  for sen in nltk_data for tup in sen]
print(len(voc))
#total no. words present in dataset (including duplicate) 

100676


In [9]:
tags = set(tags)
voc = set(voc)

In [10]:
#print all the available tags
print(tags)

{'ADJ', '.', 'NUM', 'VERB', 'X', 'ADV', 'PRON', 'DET', 'PRT', 'ADP', 'CONJ', 'NOUN'}


In [11]:
#this method we'll use to understand incorrect tags
def plot_cnt_words(word):
    l_words = []
    for tag in tags:
        c = 0
        for w,t in train_tagged_words:
            if (w==word)&(t==tag):
                    c += 1
        if c > 0:
            l_words.append((tag,c))
    print(l_words)
        

In [12]:
#demo
plot_cnt_words("He")

[('PRON', 63)]


### Build the vanilla Viterbi based POS tagger
Let's build HMM viterbi model.

In [13]:
#first step is to find emission and transition probablities
t = len(tags)
v = len(voc)
w_given_t = np.zeros((t, v))

In [14]:
w_given_t.shape

(12, 12408)

In [15]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [16]:
# let's check w
print(word_given_tag('do', 'VERB'))
print(word_given_tag('does', 'VERB'))
print(word_given_tag('flight', 'NOUN'), "\n")
#flight word is not present in the dictionary

(84, 12888)
(53, 12888)
(0, 27335) 



In [17]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [18]:
# examples
print(t2_given_t1(t2='NOUN', t1='ADV'))

(95, 3033)


In [19]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [20]:
tags_matrix

array([[6.55818507e-02, 6.54174909e-02, 2.10387912e-02, 1.23274159e-02,
        2.08744239e-02, 4.43786988e-03, 4.93096653e-04, 4.60223528e-03,
        1.10124918e-02, 7.74161741e-02, 1.75871141e-02, 6.99211061e-01],
       [4.50701676e-02, 9.25692692e-02, 8.13242197e-02, 8.94206539e-02,
        2.68082041e-02, 5.22670038e-02, 6.53112605e-02, 1.74253330e-01,
        2.51889159e-03, 9.12198648e-02, 5.77545874e-02, 2.21392587e-01],
       [3.42182890e-02, 1.16224192e-01, 1.85840711e-01, 1.74041297e-02,
        2.09439531e-01, 3.24483775e-03, 1.17994100e-03, 2.94985250e-03,
        2.74336282e-02, 3.39233056e-02, 1.35693215e-02, 3.54572266e-01],
       [6.49441332e-02, 3.51489745e-02, 2.33550593e-02, 1.69149593e-01,
        2.17256367e-01, 8.31005573e-02, 3.54593433e-02, 1.33224711e-01,
        3.12693976e-02, 9.15580392e-02, 5.50900074e-03, 1.10024832e-01],
       [1.65869221e-02, 1.64114833e-01, 2.71132379e-03, 2.04784691e-01,
        7.57575780e-02, 2.61563007e-02, 5.53429015e-02, 5.58

In [21]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))

In [22]:
#this dataframe is usefull to calculate tag probablities
tags_df

Unnamed: 0,ADJ,.,NUM,VERB,X,ADV,PRON,DET,PRT,ADP,CONJ,NOUN
ADJ,0.065582,0.065417,0.021039,0.012327,0.020874,0.004438,0.000493,0.004602,0.011012,0.077416,0.017587,0.699211
.,0.04507,0.092569,0.081324,0.089421,0.026808,0.052267,0.065311,0.174253,0.002519,0.09122,0.057755,0.221393
NUM,0.034218,0.116224,0.185841,0.017404,0.20944,0.003245,0.00118,0.00295,0.027434,0.033923,0.013569,0.354572
VERB,0.064944,0.035149,0.023355,0.16915,0.217256,0.083101,0.035459,0.133225,0.031269,0.091558,0.005509,0.110025
X,0.016587,0.164115,0.002711,0.204785,0.075758,0.026156,0.055343,0.055821,0.182935,0.144179,0.010207,0.061404
ADV,0.130234,0.136499,0.031322,0.345862,0.02275,0.07913,0.014507,0.069568,0.014507,0.117046,0.007254,0.031322
PRON,0.074546,0.041715,0.007725,0.481653,0.0927,0.03399,0.007725,0.010042,0.012746,0.023175,0.005021,0.208961
DET,0.203973,0.017805,0.021439,0.039002,0.045906,0.013203,0.003634,0.005572,0.000242,0.009327,0.000484,0.639414
PRT,0.085211,0.043748,0.05746,0.398629,0.013386,0.009794,0.018609,0.100555,0.001959,0.020895,0.002285,0.24747
ADP,0.107656,0.040532,0.063693,0.008256,0.035063,0.013832,0.06884,0.32211,0.001394,0.016835,0.000858,0.320931


## Viterbi Algorithm - Simple

In [23]:
len(train_tagged_words)

95504

In [24]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [25]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
#test_run = [test_set[i] for i in rndom]
#we'll consider all the dataset
test_run = test_set

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
#test_run

In [26]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [27]:
print("Time taken in seconds: ", difference)
#print(tagged_seq)
#print(test_run_base)

Time taken in seconds:  1004.2994711399078


In [28]:
# check accuracy of the model
def accuracy(tagged_seq,test_run_base = test_run_base):
    check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
    accuracy = len(check)/len(tagged_seq)
    return accuracy
plain_viterbi = accuracy(tagged_seq)

In [29]:
print("Accuracy of the plain veribii model - ", plain_viterbi )

Accuracy of the plain veribii model -  0.9155065738592421


In [30]:
#Above is the accuracy that we got for plain vanilla viterbi algorithm

In [31]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]

In [32]:
#Let's find out incorrect tags
incorrect_tagged_cases

[[('guests', 'NOUN'), (('more', 'ADV'), ('more', 'ADJ'))],
 [('like', 'ADP'), (('royalty', 'ADJ'), ('royalty', 'NOUN'))],
 [('or', 'CONJ'), (('rock', 'ADJ'), ('rock', 'NOUN'))],
 [('to', 'PRT'), (('scrape', 'ADJ'), ('scrape', 'VERB'))],
 [('retired', 'VERB'),
  (('water-authority', 'ADJ'), ('water-authority', 'NOUN'))],
 [('its', 'PRON'), (('finance', 'VERB'), ('finance', 'NOUN'))],
 [('corn-buying', 'ADJ'), (('binge', 'ADJ'), ('binge', 'NOUN'))],
 [('the', 'DET'), (('Soviet', 'ADJ'), ('Soviet', 'NOUN'))],
 [('serious', 'ADJ'), (('bottlenecks', 'ADJ'), ('bottlenecks', 'NOUN'))],
 [('grain', 'NOUN'), (('pipeline', 'ADJ'), ('pipeline', 'NOUN'))],
 [('share', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [('that', 'DET'), (('*T*-119', 'ADJ'), ('*T*-119', 'X'))],
 [('*T*-119', 'X'), (('expires', 'ADJ'), ('expires', 'VERB'))],
 [('.', '.'), (('Use', 'ADJ'), ('Use', 'NOUN'))],
 [("n't", 'ADV'), (('as', 'ADP'), ('as', 'ADV'))],
 [('D.', 'NOUN'), (('Phipps', 'ADJ'), ('Phipps', 'NOUN'))],
 [('

In [33]:
#Now let's test this model on test sentences which contains words which are not present in the training dataset.

## 4. Evaluating on Test Set

We will going to test this model on test sentences.

In [34]:
## Testing of unknown word

def test_vertibi_simple(sentence):
    words = word_tokenize(sentence)
    tagged_seq = Viterbi(words)
    return tagged_seq

In [35]:
test_vertibi_simple("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'ADJ'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'ADJ'),
 ('worldwide', 'ADJ'),
 ('on', 'ADP'),
 ('smartphones', 'ADJ'),
 ('since', 'ADP'),
 ('2011', 'ADJ'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'ADJ'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> Android,OS,smartphones are nouns but incorrectly tagged, as emission probs will be 0 and hence it will assign tag which has o value. Let's see two more exaples.

In [36]:
test_vertibi_simple("Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.")

[('Google', 'ADJ'),
 ('and', 'CONJ'),
 ('Twitter', 'ADJ'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'ADJ'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('Google', 'ADJ'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'ADJ'),
 ("'s", 'PRT'),
 ('firehose', 'ADJ'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> Google and Twitter are nouns but incorrectly tagged, as emission probs for Android will be 0 and hence it will assign tag which has o value. Let's see two more exaples.

In [37]:
test_vertibi_simple("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'ADJ'),
 ('invited', 'ADJ'),
 ('social', 'ADJ'),
 ('media', 'NOUN'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'ADJ'),
 ('Satellite', 'ADJ'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> Same problem.

### Problems with Plain Viterbii algorithm - 

1) It's unable to tag unknown words correctly.
2) It's unable to tag numbers properly.

Why it's happening?

When we encounter unknow word then we will get emission probablity for that word is 0 as word_given_tag will have 0 for word count.
Hence all state probabls will be 0, then we'll get default tag as op for unknow words. <br>

We've to find ways by which we can solve this problem. We can see that most of the words which are unknown are nouns, and there are some ed words as well. So we can create a model which will assign noun tag to unknown (non-ed) words. <br>

Let's explore this possibility.

### Now we will find ways by which we can correct these incorrect tags.

In [38]:
#Let's do some analysis for gerunds
gerund = [(w, t) for w,t in train_tagged_words if re.search('.*ing$',w)]
print("Total gerunds = ",len(gerund))
noun = [(w,t) for w,t in gerund if t =='NOUN'  ]
print("Total noun gerunds = ",len(noun))
verb = [(w,t) for w,t in gerund if t =='VERB'  ]
print("Total verb gerunds = ",len(verb))
#So we have gerunds which are coming as verb and noun pos tag

Total gerunds =  2404
Total noun gerunds =  780
Total verb gerunds =  1406


In [39]:
#Let's see some of them
display(noun[0:5])
display(verb[0:5])

[('filing', 'NOUN'),
 ('nothing', 'NOUN'),
 ('trading', 'NOUN'),
 ('trading', 'NOUN'),
 ('meeting', 'NOUN')]

[('sacrificing', 'VERB'),
 ('manufacturing', 'VERB'),
 ('attracting', 'VERB'),
 ('keeping', 'VERB'),
 ('operating', 'VERB')]

<font color = 'red'>[Observation]:</font>  Above analysis suggests that we should tag unknow gerund as VERB as probablity of it being VERB will be higher than noun

In [40]:
#Let's do same analysis for ed verbs
verb_ed = [(w, t) for w,t in train_tagged_words if re.search('.*ed$',w)]
print("Total verb_ed  =",len(verb_ed))
noun = [(w,t) for w,t in verb_ed if t =='NOUN'  ]
print("Total noun verb_ed = ",len(noun))
verb = [(w,t) for w,t in verb_ed if t =='VERB'  ]
print("Total verb verb_ed = ",len(verb))
#so unknown ed should be attach to VERB


Total verb_ed  = 3021
Total noun verb_ed =  68
Total verb verb_ed =  2654


<font color = 'red'>[Observation]:</font>  Above analysis suggests that we should tag unknow ed words as VERB as probablity of it being VERB will be higher than noun

In [41]:
#So we need to use other techniques to rectify this issue

### Solve the problem of unknown words
1) First we will use Rule based POS to solve the problem of unknown words. <br>
2) Then we'll modify viterbii algorithm to consider only state probablities when emission probablity will be 0

#### Technique 1:

In [42]:
#Let's define Rule base tagger
patterns = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'([A-z]|\*)*\*+', 'X'),                # plural nouns
    (r'^-?[1-9]+(.[1-9]+)?$', 'NUM'), # cardinal numbers
    (r'.*', 'NOUN')                    # nouns
]

regexp_tagger = nltk.RegexpTagger(patterns)

In [43]:
# Viterbi Heuristic algorithm after making changes 
def Viterbi_adv(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        if max(p) == 0 :
            word = word.replace(',','').replace('.','')
            state_max = regexp_tagger.tag([word])[0][1]
        if word.isnumeric() & (word != '0'):
            state_max = 'NUM'
        state.append(state_max)
    return list(zip(words, state))

In [44]:
def test_vertibi_simple_adv(sentence):
    words = word_tokenize(sentence)
    tagged_seq = Viterbi_adv(words)
    return tagged_seq

In [45]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_adv(test_tagged_words)
end = time.time()
difference = end-start
adv_viterbi = accuracy(tagged_seq)

#### Evaluating tagging accuracy

In [46]:
#we can see that tagging accuracy increase dramatically 
print("Accuracy of the veribii model (Tech1)- ",adv_viterbi )

Accuracy of the veribii model (Tech1)-  0.9555297757153906


In [47]:
#words which were tagged incorrectly
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('guests', 'NOUN'), (('more', 'ADV'), ('more', 'ADJ'))],
 [('to', 'PRT'), (('scrape', 'NOUN'), ('scrape', 'VERB'))],
 [('its', 'PRON'), (('finance', 'VERB'), ('finance', 'NOUN'))],
 [('record', 'NOUN'), (('corn-buying', 'VERB'), ('corn-buying', 'ADJ'))],
 [('the', 'DET'), (('Soviet', 'ADJ'), ('Soviet', 'NOUN'))],
 [('share', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [('*T*-119', 'X'), (('expires', 'NOUN'), ('expires', 'VERB'))],
 [('expires', 'VERB'), (('next', 'ADP'), ('next', 'ADJ'))],
 [("n't", 'ADV'), (('as', 'ADP'), ('as', 'ADV'))],
 [('as', 'ADV'), (('troublesome', 'NOUN'), ('troublesome', 'ADJ'))],
 [('M.', 'NOUN'), (('Zayed', 'VERB'), ('Zayed', 'NOUN'))],
 [('of', 'ADP'), (('American-made', 'NOUN'), ('American-made', 'ADJ'))],
 [('``', '.'), (('Designated', 'VERB'), ('Designated', 'NOUN'))],
 [('to', 'PRT'), (('offer', 'NOUN'), ('offer', 'VERB'))],
 [(',', '.'), (('high-speed', 'VERB'), ('high-speed', 'ADJ'))],
 [('no', 'DET'), (('one', 'NUM'), ('one', 'NOUN'))],
 [('is'

In [48]:
#we'll test our model on random statements from  testing data set that we've
test_vertibi_simple_adv("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'NOUN'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

In [49]:
test_vertibi_simple_adv("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'NOUN'),
 ('invited', 'VERB'),
 ('social', 'ADJ'),
 ('media', 'NOUN'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'NOUN'),
 ('Satellite', 'NOUN'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font>  Words like NASA, Android which are unknown words are tagged as NOUN. Besides yers are correctly tagged as numbers

#### Technique 2:

Let's build algorithm using second technique (cosidering only state probablity for unknown words)

In [50]:
# Viterbi Heuristic
def Viterbi_adv_tech2(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        p1 = []
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            p1.append(transition_p)
            
                
        pmax = max(p)
        
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        if (pmax == 0):
                pmax = max(p1)
                state_max = T[p1.index(pmax)] 
        if word.isnumeric() & (word != '0'):
            state_max = 'NUM'
        if 'ed' in word:
            state_max = 'VERB'
        state.append(state_max)
    return list(zip(words, state))

In [51]:
def test_vertibi_simple_adv_tech2(sentence):
    words = word_tokenize(sentence)
    tagged_seq = Viterbi_adv_tech2(words)
    return tagged_seq

In [52]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_adv_tech2(test_tagged_words)
end = time.time()
difference = end-start
adv_viterbi_tech2 = accuracy(tagged_seq)

#### Evaluating tagging accuracy

In [53]:
print("Accuracy of the veribii model (Tech2)- - ",adv_viterbi_tech2 )


Accuracy of the veribii model (Tech2)- -  0.9385150812064965


In [54]:
#words which were tagged incorrectly
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('guests', 'NOUN'), (('more', 'ADV'), ('more', 'ADJ'))],
 [('like', 'ADP'), (('royalty', 'DET'), ('royalty', 'NOUN'))],
 [('retired', 'VERB'),
  (('water-authority', 'X'), ('water-authority', 'NOUN'))],
 [('its', 'PRON'), (('finance', 'VERB'), ('finance', 'NOUN'))],
 [('record', 'NOUN'), (('corn-buying', 'NOUN'), ('corn-buying', 'ADJ'))],
 [('the', 'DET'), (('Soviet', 'ADJ'), ('Soviet', 'NOUN'))],
 [('share', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [('that', 'DET'), (('*T*-119', 'DET'), ('*T*-119', 'X'))],
 [('*T*-119', 'X'), (('expires', 'NOUN'), ('expires', 'VERB'))],
 [('expires', 'VERB'), (('next', 'ADP'), ('next', 'ADJ'))],
 [("n't", 'ADV'), (('as', 'ADP'), ('as', 'ADV'))],
 [('as', 'ADV'), (('troublesome', 'DET'), ('troublesome', 'ADJ'))],
 [('$', '.'), (('12,252', 'NOUN'), ('12,252', 'NUM'))],
 [('M.', 'NOUN'), (('Zayed', 'VERB'), ('Zayed', 'NOUN'))],
 [('of', 'ADP'), (('American-made', 'DET'), ('American-made', 'ADJ'))],
 [('``', '.'), (('Designated', 'VERB'), ('Design

In [55]:
test_vertibi_simple_adv_tech2('Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.')

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'DET'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

In [56]:
test_vertibi_simple_adv_tech2("Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.")

[('Google', 'NOUN'),
 ('and', 'CONJ'),
 ('Twitter', 'NOUN'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'NUM'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('Google', 'X'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'VERB'),
 ("'s", 'PRT'),
 ('firehose', 'VERB'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> Our second technique is working well as well.

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [57]:
print("Tagging accuracy of plain Viterbi algorith - ",plain_viterbi)
print("Tech1 - Tagging accuracy of Viterbi algorith after modification - ",adv_viterbi)
print("Tech2 - Tagging accuracy of Viterbi algorith after modification - ",adv_viterbi_tech2)

Tagging accuracy of plain Viterbi algorith -  0.9155065738592421
Tech1 - Tagging accuracy of Viterbi algorith after modification -  0.9555297757153906
Tech2 - Tagging accuracy of Viterbi algorith after modification -  0.9385150812064965


### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [58]:
test_vertibi_simple_adv("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'NOUN'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font>  After modification Android and OS have correctly tagged as nouns.

In [59]:
test_vertibi_simple_adv("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'NOUN'),
 ('invited', 'VERB'),
 ('social', 'ADJ'),
 ('media', 'NOUN'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'NOUN'),
 ('Satellite', 'NOUN'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font>  Same for above statement

In [60]:
test_vertibi_simple_adv_tech2("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'DET'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

In [61]:
test_vertibi_simple_adv_tech2("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'NOUN'),
 ('invited', 'VERB'),
 ('social', 'ADJ'),
 ('media', 'VERB'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'DET'),
 ('Satellite', 'NOUN'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> So we modified algorithms and achieved desired ouput.