## POS tagging using modified Viterbi

### Data Preparation

In [643]:
#Importing libraries
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pprint, time
from sklearn.model_selection import train_test_split


In [644]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/natashapandya/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [645]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [646]:
len(nltk_data)

3914

### Reading sample file as well to validate later

In [647]:
file = open('Test_sentences.txt', 'r') 
validation_data =  file.read()

In [648]:
validation_tokens = nltk.word_tokenize(validation_data)

### Splitting data into train and test set

In [649]:
#Spliting in train and test set
random.seed(1234)
train_set, test_set = train_test_split(nltk_data, test_size=0.05)

In [650]:
print(len(train_set))
print(len(test_set))
print(test_set[:40])

3718
196
[[('-LCB-', '.'), ('The', 'DET'), ('court', 'NOUN'), ('has', 'VERB'), ('indicated', 'VERB'), ('0', 'X'), ('it', 'PRON'), ('will', 'VERB'), ('rule', 'VERB'), ('on', 'ADP'), ('the', 'DET'), ('case', 'NOUN'), ('by', 'ADP'), ('the', 'DET'), ('end', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('month', 'NOUN'), ('.', '.'), ('-RCB-', '.')], [('--', '.'), ('Pat', 'NOUN'), ("D'Amico", 'NOUN'), ('.', '.')], [('Many', 'ADJ'), ('banks', 'NOUN'), (',', '.'), ('particularly', 'ADV'), ('smaller', 'ADJ'), ('ones', 'NOUN'), (',', '.'), ('were', 'VERB'), ('slow', 'ADJ'), ('*-1', 'X'), ('to', 'PRT'), ('computerize', 'VERB'), ('and', 'CONJ'), ('could', 'VERB'), ("n't", 'ADV'), ('target', 'VERB'), ('market', 'NOUN'), ('niches', 'NOUN'), ('that', 'DET'), ('*T*-199', 'X'), ('would', 'VERB'), ('have', 'VERB'), ('made', 'VERB'), ('the', 'DET'), ('programs', 'NOUN'), ('more', 'ADV'), ('profitable', 'ADJ'), ('.', '.')], [('Friends', 'NOUN'), ('of', 'ADP'), ('Education', 'NOUN'), ('rates', 'VERB'), ('South'

#### Analyzing data

In [651]:
train_tagged_words = [tup for sent in train_set for tup in sent]
train_tagged_words

[('Modifications', 'NOUN'),
 ('*ICH*-3', 'X'),
 ('had', 'VERB'),
 ('been', 'VERB'),
 ('made', 'VERB'),
 ('*-80', 'X'),
 ('to', 'PRT'),
 ('the', 'DET'),
 ('Souper', 'NOUN'),
 ('Combo', 'NOUN'),
 ('product', 'NOUN'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('time', 'NOUN'),
 ('0', 'X'),
 ('the', 'DET'),
 ('issue', 'NOUN'),
 ('was', 'VERB'),
 ('printed', 'VERB'),
 ('*-81', 'X'),
 ('*T*-1', 'X'),
 (',', '.'),
 ('he', 'PRON'),
 ('says', 'VERB'),
 ('0', 'X'),
 ('*T*-2', 'X'),
 (',', '.'),
 ('*-80', 'X'),
 ('making', 'VERB'),
 ('it', 'PRON'),
 ('less', 'ADJ'),
 ('an', 'DET'),
 ('offender', 'NOUN'),
 ('than', 'ADP'),
 ('*', 'X'),
 ('was', 'VERB'),
 ('portrayed', 'VERB'),
 ('*-4', 'X'),
 ('.', '.'),
 ('Areas', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('factory', 'NOUN'),
 ('*ICH*-2', 'X'),
 ('were', 'VERB'),
 ('particularly', 'ADV'),
 ('dusty', 'ADJ'),
 ('where', 'ADV'),
 ('the', 'DET'),
 ('crocidolite', 'NOUN'),
 ('was', 'VERB'),
 ('used', 'VERB'),
 ('*-8', 'X'),
 ('*T*-1', 'X'),
 ('.', '.'),
 ('F

In [652]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

['Modifications',
 '*ICH*-3',
 'had',
 'been',
 'made',
 '*-80',
 'to',
 'the',
 'Souper',
 'Combo']

In [653]:
# vocabulary
V = set(tokens)
print(len(V))

11999


In [654]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
len(T)

12

In [655]:
# Check available tags
print(T)

{'ADP', 'PRON', 'ADV', 'DET', 'CONJ', '.', 'X', 'PRT', 'NOUN', 'VERB', 'ADJ', 'NUM'}


In [656]:
# Creating matrix for tags and words
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

### Build the vanilla Viterbi based POS tagger

In [657]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [658]:
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [659]:
tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [660]:
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [661]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [662]:
# list of tagged words
test_run_base = [tup for sent in test_set for tup in sent]
#print(test_run_base)
# list of untagged words
test_tagged_words = [tup[0] for tup in test_run_base]
#test_tagged_words

train_tagged_words = [tup for sent in train_set for tup in sent]

In [663]:
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
print('Time required for execution:',end - start)

Time required for execution: 1159.7331039905548


In [664]:
check = list([i for i, j in zip(tagged_seq, test_run_base) if i == j])
len(check)

4562

In [665]:
accuracy = len(check)/len(tagged_seq)
accuracy

0.8908416324936536

**Vanilla Viterbi is giving around 90 to 91% accuracy

### Validation set performance on vanilla Viterbi

In [666]:
#validation set

validation_tokens

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google',
 '.',
 'Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013',
 '.',
 'Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 'Twitter',
 "'s",
 'firehose',
 '.',
 'Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets',
 '.',
 'Before',
 'entering',
 'politics',
 ',',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality',
 '.',
 'The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup',
 ',',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years',
 '.'

In [667]:
#Accuracy check on validation set
start = time.time()
tagged_seq_validation = Viterbi(validation_tokens)
end = time.time()
print('Time required for execution:',end - start)

Time required for execution: 43.81310415267944


In [668]:
tagged_seq_validation

[('Android', 'ADP'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('Google', 'ADP'),
 ('.', '.'),
 ('Android', 'ADP'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'ADP'),
 ('worldwide', 'ADP'),
 ('on', 'ADP'),
 ('smartphones', 'ADP'),
 ('since', 'ADP'),
 ('2011', 'ADP'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'ADP'),
 ('.', '.'),
 ('Google', 'ADP'),
 ('and', 'CONJ'),
 ('Twitter', 'ADP'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'ADP'),
 ('that', 'DET'),
 ('gave', 'VERB'),
 ('Google', 'ADP'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'ADP'),
 ("'s", 'PRT'),
 ('firehose', 'ADP'),
 ('.', '.'),
 ('Twitter', 'ADP'),
 ('is', 'VERB'),
 ('an', 'DET'),
 ('online', 'ADP'),
 ('news', 'NOUN'),
 ('and', 'CONJ'),
 ('social', 'ADJ'),
 ('networking', 'NOUN'),
 ('service', 'NOU

### Observations on validation set

-- Google, Android, Twitter etc are getting identified as ADP instead of NOUN, this is because unknown words are getting tagged by default to ADP (first tag)

-- Numbers are not tagged correctly

### Check incorrect tags in test set to identify modification areas

In [669]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('will', 'VERB'), (('rule', 'NOUN'), ('rule', 'VERB'))],
 [('Pat', 'NOUN'), (("D'Amico", 'ADP'), ("D'Amico", 'NOUN'))],
 [('were', 'VERB'), (('slow', 'VERB'), ('slow', 'ADJ'))],
 [('to', 'PRT'), (('computerize', 'ADP'), ('computerize', 'VERB'))],
 [("n't", 'ADV'), (('target', 'NOUN'), ('target', 'VERB'))],
 [('market', 'NOUN'), (('niches', 'ADP'), ('niches', 'NOUN'))],
 [('that', 'DET'), (('*T*-199', 'ADP'), ('*T*-199', 'X'))],
 [('Education', 'NOUN'), (('rates', 'NOUN'), ('rates', 'VERB'))],
 [('its', 'PRON'), (('study', 'VERB'), ('study', 'NOUN'))],
 [('to', 'PRT'), (('waive', 'ADP'), ('waive', 'VERB'))],
 [('its', 'PRON'), (('protective', 'ADP'), ('protective', 'ADJ'))],
 [('``', '.'), (('golden', 'ADP'), ('golden', 'ADJ'))],
 [('Italian', 'ADJ'), (('biscuit', 'ADP'), ('biscuit', 'NOUN'))],
 [('D.', 'NOUN'), (('Lazzaroni', 'ADP'), ('Lazzaroni', 'NOUN'))],
 [('by', 'ADP'), (('overproduction', 'ADP'), ('overproduction', 'NOUN'))],
 [(',', '.'), (('so', 'ADV'), ('so', 'ADP'))],
 [('t

### Observations on incorrect tagging on test set

-- Unknown words (which are not preset in train set) are getting tagged by default to ADP (first tag).

-- Numbers are not tagged correctly

-- confused in Noun and verb tags, Nouns are getting tagged as verb and vise-versa

### Solving the problem of unknown words

In [670]:
incorrect_tag_word = [[i for i, j in zip(tagged_seq, test_run_base) if i != j]]
incorrect_tag_word

[[('rule', 'NOUN'),
  ("D'Amico", 'ADP'),
  ('slow', 'VERB'),
  ('computerize', 'ADP'),
  ('target', 'NOUN'),
  ('niches', 'ADP'),
  ('*T*-199', 'ADP'),
  ('rates', 'NOUN'),
  ('study', 'VERB'),
  ('waive', 'ADP'),
  ('protective', 'ADP'),
  ('golden', 'ADP'),
  ('biscuit', 'ADP'),
  ('Lazzaroni', 'ADP'),
  ('overproduction', 'ADP'),
  ('so', 'ADV'),
  ('lieutenant', 'ADP'),
  ('Fishman', 'ADP'),
  ('Longwood', 'ADP'),
  ('Floyd', 'ADP'),
  ('Amin', 'ADP'),
  ('Jalaalwalikraam', 'ADP'),
  ('Glenham', 'ADP'),
  ('Knapp', 'ADP'),
  ('Deborah', 'ADP'),
  ('Renee', 'ADP'),
  ('Francis', 'ADP'),
  ('Muscolina', 'ADP'),
  ('Palisades', 'ADP'),
  ('Najarian', 'ADP'),
  ('Minn.', 'ADP'),
  ('Norwick', 'ADP'),
  ('Nesconset', 'ADP'),
  ('views', 'VERB'),
  ('IRAs', 'ADP'),
  ('Output', 'ADP'),
  ('gradually', 'ADP'),
  ('reaches', 'ADP'),
  ('Pakistan', 'ADP'),
  ('Bhutto', 'ADP'),
  ('defeated', 'ADP'),
  ('no-confidence', 'ADP'),
  ('42-year', 'ADP'),
  ('that', 'ADP'),
  ('down', 'ADV'),
  (

# Modification Method -1

### Modifying viterbi to check unknown words and apply only transition probability for them (as in Vanilla Viterbi they are getting tagged by default as ADP)

In [671]:
# Viterbi Heuristic modified
def Viterbi_modified_1(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    tokens = list(set([pair[0] for pair in train_bag]))
    unknown_word = [word for word in words if word not in tokens]
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            if word in unknown_word:
                state_probability = transition_p
            else:
                state_probability = emission_p * transition_p
            
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

### Evaluating accuracy for method 1 on test set

In [672]:
start = time.time()
tagged_seq_1 = Viterbi_modified_1(test_tagged_words)
end = time.time()
print('Time required for execution:',end - start)

Time required for execution: 1127.5584042072296


In [673]:
check_1 = [i for i, j in zip(tagged_seq_1, test_run_base) if i == j]
print(len(check_1))
print(len(test_run_base))
print(len(tagged_seq_1))

4756
5121
5121


In [674]:
accuracy_1 = len(check_1)/len(tagged_seq_1)
accuracy_1

0.9287248584260887

### Accuracy  is enhanced to  93 - 94% 

### Validation set performance with method 1

In [675]:
validation_tokens

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google',
 '.',
 'Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013',
 '.',
 'Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 'Twitter',
 "'s",
 'firehose',
 '.',
 'Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets',
 '.',
 'Before',
 'entering',
 'politics',
 ',',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality',
 '.',
 'The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup',
 ',',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years',
 '.'

In [676]:
start = time.time()
tagged_seq_validation_1 = Viterbi_modified_1(validation_tokens)
end = time.time()
print('Time required for execution:',end - start)

Time required for execution: 45.58622884750366


In [677]:
tagged_seq_validation_1

[('Android', 'NOUN'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('Google', 'DET'),
 ('.', '.'),
 ('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'DET'),
 ('since', 'ADP'),
 ('2011', 'DET'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'DET'),
 ('.', '.'),
 ('Google', 'NOUN'),
 ('and', 'CONJ'),
 ('Twitter', 'NOUN'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'DET'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('Google', 'X'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'VERB'),
 ("'s", 'PRT'),
 ('firehose', 'VERB'),
 ('.', '.'),
 ('Twitter', 'NOUN'),
 ('is', 'VERB'),
 ('an', 'DET'),
 ('online', 'NOUN'),
 ('news', 'NOUN'),
 ('and', 'CONJ'),
 ('social', 'ADJ'),
 ('networking', 'NOUN'),
 ('servic

#### Checking differently tagged words in validation set from vanilla viterbi and modified viterbi with method -1

In [574]:
diff = [[i,j] for i,j in zip(tagged_seq_validation_1,tagged_seq_validation) if i != j]
diff

[[('Android', 'NOUN'), ('Android', 'ADP')],
 [('Google', 'DET'), ('Google', 'ADP')],
 [('Android', 'NOUN'), ('Android', 'ADP')],
 [('OS', 'NOUN'), ('OS', 'ADP')],
 [('worldwide', 'NOUN'), ('worldwide', 'ADP')],
 [('smartphones', 'DET'), ('smartphones', 'ADP')],
 [('2011', 'DET'), ('2011', 'ADP')],
 [('tablets', 'DET'), ('tablets', 'ADP')],
 [('2013', 'DET'), ('2013', 'ADP')],
 [('Google', 'NOUN'), ('Google', 'ADP')],
 [('Twitter', 'NOUN'), ('Twitter', 'ADP')],
 [('2015', 'DET'), ('2015', 'ADP')],
 [('that', 'ADP'), ('that', 'DET')],
 [('Google', 'X'), ('Google', 'ADP')],
 [('Twitter', 'VERB'), ('Twitter', 'ADP')],
 [('firehose', 'VERB'), ('firehose', 'ADP')],
 [('Twitter', 'NOUN'), ('Twitter', 'ADP')],
 [('online', 'NOUN'), ('online', 'ADP')],
 [('networking', 'NOUN'), ('networking', 'ADP')],
 [('interact', 'NOUN'), ('interact', 'ADP')],
 [('messages', 'DET'), ('messages', 'ADP')],
 [('known', 'VERB'), ('known', 'ADJ')],
 [('tweets', 'DET'), ('tweets', 'ADP')],
 [('domineering', 'NOUN'

### Comparing the tagging accuracies of the modifications with Method -1 with the vanilla Viterbi algorithm

**Vanilla Viterbi gave 91% accuracy and with modification of method 1 (for unknown words consider only transition probabilities) accuracy obtained is 94%

### Cases which were incorrectly tagged by original POS tagger and got corrected modifications with Method -1

1. Android is getting tagged as NOUN after modification which was coming as ADP by default

2. NASA is getting correctly tagged as Noun

3. FIFA is getting correctly tagged as Noun

4. Google, Twitter is partially getting tagged Noun so still needs more rectification in method-1 but better from vanilla viterbi

# Modification Method -2

### Using Unigram tagger and regex tagger to check performance of test set individually before integrating with Viterbi

-- Checking on incorrect tagged word we got from Vanilla Viterbi

In [678]:
incorrect_word= [tup[0] for tup in incorrect_tag_word[0]]
incorrect_word

['rule',
 "D'Amico",
 'slow',
 'computerize',
 'target',
 'niches',
 '*T*-199',
 'rates',
 'study',
 'waive',
 'protective',
 'golden',
 'biscuit',
 'Lazzaroni',
 'overproduction',
 'so',
 'lieutenant',
 'Fishman',
 'Longwood',
 'Floyd',
 'Amin',
 'Jalaalwalikraam',
 'Glenham',
 'Knapp',
 'Deborah',
 'Renee',
 'Francis',
 'Muscolina',
 'Palisades',
 'Najarian',
 'Minn.',
 'Norwick',
 'Nesconset',
 'views',
 'IRAs',
 'Output',
 'gradually',
 'reaches',
 'Pakistan',
 'Bhutto',
 'defeated',
 'no-confidence',
 '42-year',
 'that',
 'down',
 '11-month-old',
 'Absorbed',
 'doling',
 'out',
 'tidbits',
 'gloss',
 'root',
 'A',
 'auspices',
 'South',
 'African',
 'Items',
 'numbered',
 'about',
 'newcomer',
 'powder',
 'that',
 '*T*-119',
 'expires',
 'more',
 'realestate',
 'will',
 'Topix',
 '16.05',
 '1.46',
 '0.05',
 '2691.19',
 'imagine',
 'racing',
 'Chicago-style',
 'arbitrager',
 'that',
 '*T*-227',
 'stays',
 'ABORTION',
 'RULING',
 'UPHELD',
 'equipped',
 '*-90',
 'front-seat',
 'head

In [679]:
incorrect_tag_word

[[('rule', 'NOUN'),
  ("D'Amico", 'ADP'),
  ('slow', 'VERB'),
  ('computerize', 'ADP'),
  ('target', 'NOUN'),
  ('niches', 'ADP'),
  ('*T*-199', 'ADP'),
  ('rates', 'NOUN'),
  ('study', 'VERB'),
  ('waive', 'ADP'),
  ('protective', 'ADP'),
  ('golden', 'ADP'),
  ('biscuit', 'ADP'),
  ('Lazzaroni', 'ADP'),
  ('overproduction', 'ADP'),
  ('so', 'ADV'),
  ('lieutenant', 'ADP'),
  ('Fishman', 'ADP'),
  ('Longwood', 'ADP'),
  ('Floyd', 'ADP'),
  ('Amin', 'ADP'),
  ('Jalaalwalikraam', 'ADP'),
  ('Glenham', 'ADP'),
  ('Knapp', 'ADP'),
  ('Deborah', 'ADP'),
  ('Renee', 'ADP'),
  ('Francis', 'ADP'),
  ('Muscolina', 'ADP'),
  ('Palisades', 'ADP'),
  ('Najarian', 'ADP'),
  ('Minn.', 'ADP'),
  ('Norwick', 'ADP'),
  ('Nesconset', 'ADP'),
  ('views', 'VERB'),
  ('IRAs', 'ADP'),
  ('Output', 'ADP'),
  ('gradually', 'ADP'),
  ('reaches', 'ADP'),
  ('Pakistan', 'ADP'),
  ('Bhutto', 'ADP'),
  ('defeated', 'ADP'),
  ('no-confidence', 'ADP'),
  ('42-year', 'ADP'),
  ('that', 'ADP'),
  ('down', 'ADV'),
  (

##### Unigram tagger for incorrect tagged words

In [680]:
# Using unigram tagger to validate incorrect tags we got from Vanilla viterbi
unigram_tagger = nltk.UnigramTagger(train_set)
unigram_tagger.evaluate(incorrect_tag_word)

0.19856887298747763

In [681]:
unigram_tagger.tag(incorrect_word)

[('rule', 'NOUN'),
 ("D'Amico", None),
 ('slow', 'VERB'),
 ('computerize', None),
 ('target', 'NOUN'),
 ('niches', None),
 ('*T*-199', None),
 ('rates', 'NOUN'),
 ('study', 'NOUN'),
 ('waive', None),
 ('protective', None),
 ('golden', None),
 ('biscuit', None),
 ('Lazzaroni', None),
 ('overproduction', None),
 ('so', 'ADV'),
 ('lieutenant', None),
 ('Fishman', None),
 ('Longwood', None),
 ('Floyd', None),
 ('Amin', None),
 ('Jalaalwalikraam', None),
 ('Glenham', None),
 ('Knapp', None),
 ('Deborah', None),
 ('Renee', None),
 ('Francis', None),
 ('Muscolina', None),
 ('Palisades', None),
 ('Najarian', None),
 ('Minn.', None),
 ('Norwick', None),
 ('Nesconset', None),
 ('views', 'NOUN'),
 ('IRAs', None),
 ('Output', None),
 ('gradually', None),
 ('reaches', None),
 ('Pakistan', None),
 ('Bhutto', None),
 ('defeated', None),
 ('no-confidence', None),
 ('42-year', None),
 ('that', 'ADP'),
 ('down', 'ADV'),
 ('11-month-old', None),
 ('Absorbed', None),
 ('doling', None),
 ('out', 'PRT'),
 (

***Getting 16- 17% accuracy with only unigram

##### Regex tagger for incorrect tagged words

In [682]:
# Using Regex tagger to check incorrect tags we got from Vanilla viterbi
patterns = [
    (r'.*ing$', 'VERB'),
    (r'.*ed$', 'VERB'),  
    (r'.*es$', 'VERB'),
    (r'.*ould$', 'VERB'), 
    (r'.*\'s$', 'PRT'),               
    (r'.*s$', 'NOUN'), 
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'),
    (r'.*', 'NOUN')
]

regexp_tagger = nltk.RegexpTagger(patterns)

regexp_tagger.evaluate(incorrect_tag_word)

0.055456171735241505

***Getting very low accuracy with only regex

#### Combining both unigram and regex tagger and check on full test set and incorrect tags got from vanilla viterbi

##### Validation on test set

In [683]:
rule_based_tagger = nltk.RegexpTagger(patterns)

# lexicon backed up by the rule-based tagger
lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

lexicon_tagger.evaluate(test_run)

0.96875

**Giving 97% accuracy on full test set

##### Validation on incorrect tagged words we got from Vanilla Viterbi

In [684]:
rule_based_tagger = nltk.RegexpTagger(patterns)

# lexicon backed up by the rule-based tagger
lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

lexicon_tagger.evaluate(incorrect_tag_word)

0.19856887298747763

**Overall accuaracy is 16% on wrongly tagged data

### Adding Unigram and regex to Viterbi code

In [685]:
def Viterbi_modified_2(words, train_bag = train_tagged_words):
    # Viterbi Heuristic
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    patterns = [
    (r'.*ing$', 'VERB'),
    (r'.*ed$', 'VERB'),  
    (r'.*es$', 'VERB'),
    (r'.*ould$', 'VERB'), 
    (r'.*\'s$', 'PRT'),               
    (r'.*s$', 'NOUN'), 
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'),
    (r'.*', 'NOUN')]

    regexp_tagger = nltk.RegexpTagger(patterns)

# lexicon backed up by the rule-based tagger
    lexicon_tagger = nltk.UnigramTagger(train_set, backoff=regexp_tagger)

    #lexicon_tagger.evaluate(incorrect_tag_word
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        state_max = T[p.index(pmax)]
        #state_max = []
        if (pmax == 0):
            
            new_tag = lexicon_tagger.tag((word,state_max))
            
            state_max = new_tag[0][1]
            
        state.append(state_max)
    return list(zip(words, state))


#### Evaluating tagging accuracy

In [686]:
start = time.time()
tagged_seq_2 = Viterbi_modified_2(test_tagged_words)
end = time.time()
print('Time required for execution:',end - start)


Time required for execution: 1098.988477230072


In [687]:
check_2 = [i for i, j in zip(tagged_seq_2, test_run_base) if i == j]
print(len(check_2))
print(len(test_run_base))
print(len(tagged_seq_2))


4815
5121
5121


In [688]:
accuracy = len(check_2)/len(tagged_seq_2)
accuracy

0.9402460456942003

In [689]:
incorrect_tagged_cases_2 = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq_2, test_run_base)) if j[0]!=j[1]]
len(incorrect_tagged_cases_2)

306

### Accuracy is enhanced to 94 - 95% 

### Validation set performance with method 2

In [690]:
validation_tokens

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google',
 '.',
 'Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013',
 '.',
 'Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 'Twitter',
 "'s",
 'firehose',
 '.',
 'Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets',
 '.',
 'Before',
 'entering',
 'politics',
 ',',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality',
 '.',
 'The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup',
 ',',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years',
 '.'

In [691]:
start = time.time()
tagged_seq_validation_2 = Viterbi_modified_2(validation_tokens)
end = time.time()
print('Time required for execution:',end - start)

Time required for execution: 39.940598249435425


In [692]:
tagged_seq_validation_2

[('Android', 'NOUN'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('Google', 'NOUN'),
 ('.', '.'),
 ('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'VERB'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.'),
 ('Google', 'NOUN'),
 ('and', 'CONJ'),
 ('Twitter', 'NOUN'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'NUM'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('Google', 'NOUN'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'NOUN'),
 ("'s", 'PRT'),
 ('firehose', 'NOUN'),
 ('.', '.'),
 ('Twitter', 'NOUN'),
 ('is', 'VERB'),
 ('an', 'DET'),
 ('online', 'NOUN'),
 ('news', 'NOUN'),
 ('and', 'CONJ'),
 ('social', 'ADJ'),
 ('networking', 'NOUN'),
 ('s

#### Checking differently tagged words in validation set from vanilla viterbi and modified viterbi with method -2

In [694]:
diff_2 = [[i,j] for i,j in zip(tagged_seq_validation_2,tagged_seq_validation) if i != j]
diff_2

[[('Android', 'NOUN'), ('Android', 'ADP')],
 [('Google', 'NOUN'), ('Google', 'ADP')],
 [('Android', 'NOUN'), ('Android', 'ADP')],
 [('OS', 'NOUN'), ('OS', 'ADP')],
 [('worldwide', 'NOUN'), ('worldwide', 'ADP')],
 [('smartphones', 'VERB'), ('smartphones', 'ADP')],
 [('2011', 'NUM'), ('2011', 'ADP')],
 [('2013', 'NUM'), ('2013', 'ADP')],
 [('Google', 'NOUN'), ('Google', 'ADP')],
 [('Twitter', 'NOUN'), ('Twitter', 'ADP')],
 [('2015', 'NUM'), ('2015', 'ADP')],
 [('that', 'ADP'), ('that', 'DET')],
 [('Google', 'NOUN'), ('Google', 'ADP')],
 [('Twitter', 'NOUN'), ('Twitter', 'ADP')],
 [('firehose', 'NOUN'), ('firehose', 'ADP')],
 [('Twitter', 'NOUN'), ('Twitter', 'ADP')],
 [('online', 'NOUN'), ('online', 'ADP')],
 [('interact', 'NOUN'), ('interact', 'ADP')],
 [('messages', 'VERB'), ('messages', 'ADP')],
 [('known', 'VERB'), ('known', 'ADJ')],
 [('tweets', 'NOUN'), ('tweets', 'ADP')],
 [('domineering', 'VERB'), ('domineering', 'ADP')],
 [('personality', 'NOUN'), ('personality', 'ADP')],
 [('20

### Comparing the tagging accuracies of the modifications from Method -2 with the vanilla Viterbi algorithm and Method -1

-- Vanilla Viterbi gave 91% accuracy and with modification of method 2 (for unknown words running unigram and regex tagger) accuracy obtained is 95%

-- Method 1 (for unknown words using only transition probability) gave 94% accuracy and Method 2 (for unknown words running unigram and regex tagger) accuracy obtained is 95%

### Cases which were incorrectly tagged by original POS tagger and got corrected by Method -2 

1. Numbers like 2018,2013 etc are getting tagged correctly as NUM

2. Android, Google, twitter, NASA identified as Noun correctly

3. Tournament, cups got identified as Noun correctly