## POS tagging using modified Viterbi

#### Flowchart of the solution -

<ul>
    <li>EDA to understand training corpus.</li>
    <li>Plain vanilla model building.</li>
    <li>Test plain vanilla model on test set and understand the problem.</li>
    <li>Refining viterbi model using other pos tagging technique</li>
</ul>

### Data Preparation

In [315]:
import nltk
nltk.download('treebank')
import nltk
nltk.download('universal_tagset')
import nltk
nltk.download('punkt')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\abc\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\abc\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [316]:
#Importing libraries
#Importing libraries
import re
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [317]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [318]:
# Splitting into train and test
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))


3718
196


In [319]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95730

In [320]:
#no of tagged words available in dataset - 95790

In [321]:
#Let's see how many unique tags we have in our dataset
tags = [tup[1]  for sen in nltk_data for tup in sen]
print(len(set(tags)))
print(set(tags))


12
{'ADJ', 'ADP', 'CONJ', 'X', 'NOUN', '.', 'NUM', 'DET', 'PRON', 'PRT', 'VERB', 'ADV'}


In [322]:
#we can see that universal dataset has only 12 tags
#Let's see how many unique word dataset has
voc = [tup[0]  for sen in nltk_data for tup in sen]
print(len(voc))
#total no. words present in dataset (including duplicate) 

100676


In [323]:
tags = set(tags)
voc = set(voc)

In [324]:
#print all the available tags
print(tags)

{'ADJ', 'ADP', 'CONJ', 'X', 'NOUN', '.', 'NUM', 'DET', 'PRON', 'PRT', 'VERB', 'ADV'}


In [325]:
#this method we'll use to understand incorrect tags
def plot_cnt_words(word):
    l_words = []
    for tag in tags:
        c = 0
        for w,t in train_tagged_words:
            if (w==word)&(t==tag):
                    c += 1
        if c > 0:
            l_words.append((tag,c))
    print(l_words)
        

In [326]:
#demo
plot_cnt_words("He")

[('PRON', 72)]


### Build the vanilla Viterbi based POS tagger
Let's build HMM viterbi model.

In [327]:
#first step is to find emission and transition probablities
t = len(tags)
v = len(voc)
w_given_t = np.zeros((t, v))

In [328]:
w_given_t.shape

(12, 12408)

In [329]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [330]:
# let's check w
print(word_given_tag('do', 'VERB'))
print(word_given_tag('does', 'VERB'))
print(word_given_tag('flight', 'NOUN'), "\n")
#flight word is not present in the dictionary

(81, 12895)
(52, 12895)
(0, 27433) 



In [331]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [332]:
# examples
print(t2_given_t1(t2='NOUN', t1='ADV'))

(96, 3011)


In [333]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [334]:
tags_matrix

array([[6.66776150e-02, 7.76810646e-02, 1.74084418e-02, 2.13499758e-02,
        6.97487295e-01, 6.53637722e-02, 2.08572838e-02, 4.92691761e-03,
        4.92691761e-04, 1.06749879e-02, 1.21530630e-02, 4.92691761e-03],
       [1.06167309e-01, 1.71788298e-02, 9.60307312e-04, 3.51045653e-02,
        3.21916342e-01, 3.96926999e-02, 6.35936856e-02, 3.23730260e-01,
        6.83952197e-02, 1.38711056e-03, 8.42936430e-03, 1.34443026e-02],
       [1.19796485e-01, 5.27289547e-02, 4.62534692e-04, 8.78815912e-03,
        3.44125807e-01, 3.46901007e-02, 4.20906581e-02, 1.20721556e-01,
        5.96669763e-02, 5.08788181e-03, 1.56799257e-01, 5.50416298e-02],
       [1.73511617e-02, 1.45813435e-01, 1.03470236e-02, 7.33842701e-02,
        6.28780648e-02, 1.62687048e-01, 2.70614447e-03, 5.44412620e-02,
        5.53963706e-02, 1.84972942e-01, 2.04393506e-01, 2.56287809e-02],
       [1.22115696e-02, 1.76612109e-01, 4.23212908e-02, 2.87245288e-02,
        2.65045762e-01, 2.40841329e-01, 9.25892219e-03, 1.31

In [335]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))

In [336]:
#this dataframe is usefull to calculate tag probablities
tags_df

Unnamed: 0,ADJ,ADP,CONJ,X,NOUN,.,NUM,DET,PRON,PRT,VERB,ADV
ADJ,0.066678,0.077681,0.017408,0.02135,0.697487,0.065364,0.020857,0.004927,0.000493,0.010675,0.012153,0.004927
ADP,0.106167,0.017179,0.00096,0.035105,0.321916,0.039693,0.063594,0.32373,0.068395,0.001387,0.008429,0.013444
CONJ,0.119796,0.052729,0.000463,0.008788,0.344126,0.03469,0.042091,0.120722,0.059667,0.005088,0.156799,0.055042
X,0.017351,0.145813,0.010347,0.073384,0.062878,0.162687,0.002706,0.054441,0.055396,0.184973,0.204394,0.025629
NOUN,0.012212,0.176612,0.042321,0.028725,0.265046,0.240841,0.009259,0.013159,0.004484,0.043743,0.146429,0.017169
.,0.044273,0.090518,0.058971,0.026976,0.221635,0.09231,0.081197,0.172522,0.065962,0.00233,0.09007,0.053146
NUM,0.034168,0.035346,0.013255,0.210604,0.352283,0.116642,0.18704,0.003535,0.001473,0.025331,0.017968,0.002356
DET,0.205141,0.009533,0.000362,0.045734,0.637384,0.018101,0.022565,0.005792,0.003741,0.000241,0.038977,0.012429
PRON,0.07366,0.021982,0.005399,0.092557,0.207482,0.041265,0.007327,0.009641,0.007713,0.013112,0.485538,0.034323
PRT,0.086145,0.021618,0.002293,0.013102,0.244022,0.044219,0.055355,0.102195,0.017032,0.001638,0.40321,0.009171


## Viterbi Algorithm - Simple

In [337]:
len(train_tagged_words)

95730

In [338]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [339]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
test_run = [test_set[i] for i in rndom]
#test_run = test_set

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
#test_run

In [340]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [341]:
print("Time taken in seconds: ", difference)
#print(tagged_seq)
#print(test_run_base)

Time taken in seconds:  26.41718316078186


In [342]:
# check accuracy of the model
def accuracy(tagged_seq,test_run_base = test_run_base):
    check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
    accuracy = len(check)/len(tagged_seq)
    return accuracy
plain_viterbi = accuracy(tagged_seq)

In [343]:
print("Accuracy of the plain veribii model - ", plain_viterbi )

Accuracy of the plain veribii model -  0.8782608695652174


In [344]:
#Above is the accuracy that we got for plain vanilla viterbi algorithm

In [345]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]

In [346]:
#Let's find out incorrect tags
incorrect_tagged_cases

[[('.', '.'), (('Packaging', 'ADJ'), ('Packaging', 'NOUN'))],
 [('some', 'DET'), (('drawbacks', 'ADJ'), ('drawbacks', 'NOUN'))],
 [('.', '.'), (('Periods', 'ADJ'), ('Periods', 'NOUN'))],
 [('the', 'DET'), (('advent', 'ADJ'), ('advent', 'NOUN'))],
 [('often', 'ADV'), (('more', 'ADJ'), ('more', 'ADV'))],
 [('were', 'VERB'), (('undergoing', 'ADJ'), ('undergoing', 'VERB'))],
 [('-LRB-', '.'), (('1973-75', 'ADJ'), ('1973-75', 'NUM'))],
 [(',', '.'), (('1937-40', 'ADJ'), ('1937-40', 'NUM'))],
 [('and', 'CONJ'), (('1928-33', 'ADJ'), ('1928-33', 'NUM'))],
 [('company', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [('that', 'DET'), (('*T*-146', 'ADJ'), ('*T*-146', 'X'))],
 [('Michael', 'NOUN'), (('Stark', 'ADJ'), ('Stark', 'NOUN'))],
 [('at', 'ADP'), (('Robertson', 'ADJ'), ('Robertson', 'NOUN'))],
 [(',', '.'), (('Stephens', 'ADJ'), ('Stephens', 'NOUN'))]]

In [347]:
#Now let's test this model on test sentences which contains words which are not present in the training dataset.

## 4. Evaluating on Test Set

We will going to test this model on test sentences.

In [348]:
## Testing of unknown word

def test_vertibi_simple(sentence):
    words = word_tokenize(sentence)
    tagged_seq = Viterbi(words)
    return tagged_seq

In [349]:
test_vertibi_simple("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'ADJ'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'ADJ'),
 ('worldwide', 'ADJ'),
 ('on', 'ADP'),
 ('smartphones', 'ADJ'),
 ('since', 'ADP'),
 ('2011', 'ADJ'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'ADJ'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> Android,OS,smartphones are nouns but incorrectly tagged, as emission probs will be 0 and hence it will assign tag which has o value. Let's see two more exaples.

In [350]:
test_vertibi_simple("Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.")

[('Google', 'ADJ'),
 ('and', 'CONJ'),
 ('Twitter', 'ADJ'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'ADJ'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('Google', 'ADJ'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'ADJ'),
 ("'s", 'PRT'),
 ('firehose', 'ADJ'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> Google and Twitter are nouns but incorrectly tagged, as emission probs for Android will be 0 and hence it will assign tag which has o value. Let's see two more exaples.

In [351]:
test_vertibi_simple("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'ADJ'),
 ('invited', 'ADJ'),
 ('social', 'ADJ'),
 ('media', 'NOUN'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'ADJ'),
 ('Satellite', 'ADJ'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font> Same problem.

### Now we will find ways by which we can correct these incorrect tags.

In [352]:
#Let's do some analysis for gerunds
gerund = [(w, t) for w,t in train_tagged_words if re.search('.*ing$',w)]
print("Total gerunds = ",len(gerund))
noun = [(w,t) for w,t in gerund if t =='NOUN'  ]
print("Total noun gerunds = ",len(noun))
verb = [(w,t) for w,t in gerund if t =='VERB'  ]
print("Total verb gerunds = ",len(verb))
#So we have gerunds which are coming as verb and noun pos tag

Total gerunds =  2402
Total noun gerunds =  779
Total verb gerunds =  1402


In [353]:
#Let's see some of them
display(noun[0:5])
display(verb[0:5])

[('consulting', 'NOUN'),
 ('something', 'NOUN'),
 ('trading', 'NOUN'),
 ('manufacturing', 'NOUN'),
 ('spring', 'NOUN')]

[('trying', 'VERB'),
 ('bundling', 'VERB'),
 ('targeting', 'VERB'),
 ('regulating', 'VERB'),
 ('having', 'VERB')]

<font color = 'red'>[Observation]:</font>  Above analysis suggests that we should tag unknow gerund as VERB as probablity of it being VERB will be higher than noun

In [354]:
#Let's do same analysis for ed verbs
verb_ed = [(w, t) for w,t in train_tagged_words if re.search('.*ed$',w)]
print("Total verb_ed  =",len(verb_ed))
noun = [(w,t) for w,t in verb_ed if t =='NOUN'  ]
print("Total noun verb_ed = ",len(noun))
verb = [(w,t) for w,t in verb_ed if t =='VERB'  ]
print("Total verb verb_ed = ",len(verb))
#so unknown ed should be attach to VERB


Total verb_ed  = 3040
Total noun verb_ed =  72
Total verb verb_ed =  2659


<font color = 'red'>[Observation]:</font>  Above analysis suggests that we should tag unknow ed words as VERB as probablity of it being VERB will be higher than noun

<b> Here are some of the observation that we've seen. </b>
<ul>
  <li>Unkown words are getting default tag.</li>
  <li></li>
</ul>

In [355]:
#So we need to use other techni#sameques to rectify this issue

### Solve the problem of unknown words
We will use Rule based POS to solve the problem of unknown words

#### Technique 1:

In [356]:
#Let's define Rule base tagger
patterns = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    #(r'.*es$', 'VBZ'),               # 3rd singular present
    #(r'.*ould$', 'MD'),              # modals
    #(r'.*\'s$', 'NN$'),              # possessive nouns
    (r'([A-z]|\*)*\*+', 'X'),                # plural nouns
    (r'^-?[1-9]+(.[1-9]+)?$', 'NUM'), # cardinal numbers
    (r'.*', 'NOUN')                    # nouns
]

regexp_tagger = nltk.RegexpTagger(patterns)

In [357]:
# Viterbi Heuristic algorithm after making changes 
def Viterbi_adv(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        if max(p) == 0 :
            word = word.replace(',','').replace('.','')
            state_max = regexp_tagger.tag([word])[0][1]
        if word.isnumeric() & (word != '0'):
            state_max = 'NUM'
        state.append(state_max)
    return list(zip(words, state))

In [358]:
def test_vertibi_simple_adv(sentence):
    words = word_tokenize(sentence)
    tagged_seq = Viterbi_adv(words)
    return tagged_seq

In [359]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_adv(test_tagged_words)
end = time.time()
difference = end-start
adv_viterbi = accuracy(tagged_seq)

#### Evaluating tagging accuracy

In [360]:
print("Accuracy of the veribii model (Tech1)- ",adv_viterbi )

Accuracy of the veribii model (Tech1)-  0.9391304347826087


In [361]:
#words which were tagged incorrectly
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('of', 'ADP'), (('major-league', 'NOUN'), ('major-league', 'ADJ'))],
 [('.', '.'), (('Packaging', 'VERB'), ('Packaging', 'NOUN'))],
 [('.', '.'), (('Pro-forma', 'NOUN'), ('Pro-forma', 'ADJ'))],
 [('often', 'ADV'), (('more', 'ADJ'), ('more', 'ADV'))],
 [('undergoing', 'VERB'), (('change', 'VERB'), ('change', 'NOUN'))],
 [(',', '.'), (('1937-40', 'NOUN'), ('1937-40', 'NUM'))],
 [('company', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))]]

In [362]:
test_vertibi_simple_adv("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'NOUN'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

In [363]:
test_vertibi_simple_adv("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'NOUN'),
 ('invited', 'VERB'),
 ('social', 'ADJ'),
 ('media', 'NOUN'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'NOUN'),
 ('Satellite', 'NOUN'),
 ('.', '.')]

#### Technique 2:

In [364]:
# Viterbi Heuristic
def Viterbi_adv_tech2(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        p1 = []
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            p1.append(transition_p)
            
                
        pmax = max(p)
        
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        if (pmax == 0):
                pmax = max(p1)
                state_max = T[p1.index(pmax)] 
        if word.isnumeric() & (word != '0'):
            state_max = 'NUM'
        if 'ed' in word:
            state_max = 'VERB'
        state.append(state_max)
    return list(zip(words, state))

In [365]:
def test_vertibi_simple_adv_tech2(sentence):
    words = word_tokenize(sentence)
    tagged_seq = Viterbi_adv_tech2(words)
    return tagged_seq

In [366]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_adv_tech2(test_tagged_words)
end = time.time()
difference = end-start
adv_viterbi_tech2 = accuracy(tagged_seq)

#### Evaluating tagging accuracy

In [367]:
print("Accuracy of the veribii model (Tech2)- - ",adv_viterbi_tech2 )


Accuracy of the veribii model (Tech2)- -  0.9043478260869565


In [368]:
#words which were tagged incorrectly
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('of', 'ADP'), (('major-league', 'DET'), ('major-league', 'ADJ'))],
 [('.', '.'), (('Pro-forma', 'NOUN'), ('Pro-forma', 'ADJ'))],
 [('often', 'ADV'), (('more', 'ADJ'), ('more', 'ADV'))],
 [('were', 'VERB'), (('undergoing', 'X'), ('undergoing', 'VERB'))],
 [('undergoing', 'VERB'), (('change', 'VERB'), ('change', 'NOUN'))],
 [('-LRB-', '.'), (('1973-75', 'NOUN'), ('1973-75', 'NUM'))],
 [(',', '.'), (('1937-40', 'NOUN'), ('1937-40', 'NUM'))],
 [('and', 'CONJ'), (('1928-33', 'NOUN'), ('1928-33', 'NUM'))],
 [('company', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [('that', 'DET'), (('*T*-146', 'DET'), ('*T*-146', 'X'))],
 [('at', 'ADP'), (('Robertson', 'DET'), ('Robertson', 'NOUN'))]]

In [369]:
test_vertibi_simple_adv_tech2('Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.')

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'DET'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

In [370]:
test_vertibi_simple_adv_tech2("Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.")

[('Google', 'NOUN'),
 ('and', 'CONJ'),
 ('Twitter', 'NOUN'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'NUM'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('Google', 'X'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'VERB'),
 ("'s", 'PRT'),
 ('firehose', 'VERB'),
 ('.', '.')]

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [371]:
print("Tagging accuracy of plain Viterbi algorith - ",plain_viterbi)
print("Tech1 - Tagging accuracy of Viterbi algorith after modification - ",adv_viterbi)
print("Tech2 - Tagging accuracy of Viterbi algorith after modification - ",adv_viterbi_tech2)

Tagging accuracy of plain Viterbi algorith -  0.8782608695652174
Tech1 - Tagging accuracy of Viterbi algorith after modification -  0.9391304347826087
Tech2 - Tagging accuracy of Viterbi algorith after modification -  0.9043478260869565


### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [372]:
test_vertibi_simple_adv("Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.")

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'NOUN'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font>  After modification Android and OS have correctly tagged as nouns.

In [373]:
test_vertibi_simple_adv("NASA invited social media users to experience the launch of ICESAT-2 Satellite.")

[('NASA', 'NOUN'),
 ('invited', 'VERB'),
 ('social', 'ADJ'),
 ('media', 'NOUN'),
 ('users', 'NOUN'),
 ('to', 'PRT'),
 ('experience', 'NOUN'),
 ('the', 'DET'),
 ('launch', 'NOUN'),
 ('of', 'ADP'),
 ('ICESAT-2', 'NOUN'),
 ('Satellite', 'NOUN'),
 ('.', '.')]

<font color = 'red'>[Observation]:</font>  Same for above statement

In [386]:
import threading 
def first(): 
    start = time.time()
    tagged_seq = Viterbi(test_tagged_words)
    end = time.time()
    difference = end-start
    plain = accuracy(tagged_seq)
    print(plain)
    print(difference)
  
def second():
    start = time.time()
    tagged_seq2 = Viterbi_adv(test_tagged_words)
    end = time.time()
    difference = end-start
    adv_viterbi = accuracy(tagged_seq)
    print(adv_viterbi)
    print(difference)
        
        
if __name__ == "__main__": 
    # creating thread 
    start1 = time.time()
    t1 = threading.Thread(target=first) 
    t2 = threading.Thread(target=second) 
  
    # starting thread 1 
    t1.start() 
    # starting thread 2 
    t2.start() 
  
    # wait until thread 1 is completely executed 
    t1.join() 
    # wait until thread 2 is completely executed 
    t2.join() 
    end1 = time.time()
    difference = end1-start1
    print(difference)
    # both threads completely executed 
    print("Done!") 

2.4279835224151612.427983522415161

5.00863790512085
Done!


In [383]:
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start
plain = accuracy(tagged_seq)
print(plain)
print(difference)

0.8782608695652174
26.043259859085083
