<a href="https://colab.research.google.com/github/HimalayanSaswataBose/POS_Tagging/blob/main/HMM_POS_Tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import numpy as np
import pandas as pd
import pprint, time

Read data

In [None]:
# Load the test split. Each line of test.txt is split on tabs into a list of
# fields; the whole file is read at once and cut on '\n'.
with open("test.txt", mode = "r") as f:
    raw_lines = f.read().split('\n')

# Keep only records whose first field is non-empty, which drops blank lines
# (and any line that begins with a tab).
test_tagged_words = [
    fields
    for fields in (line.split('\t') for line in raw_lines)
    if fields[0] != ''
]
print(test_tagged_words)
print(len(test_tagged_words))


In [None]:
# Load the training split. Each line of train.txt is split on tabs into a list
# of fields; the whole file is read at once and cut on '\n'.
with open("train.txt", mode = "r") as f:
    raw_lines = f.read().split('\n')

# Keep only records whose first field is non-empty, which drops blank lines
# (and any line that begins with a tab).
train_tagged_words = [
    fields
    for fields in (line.split('\t') for line in raw_lines)
    if fields[0] != ''
]
print(train_tagged_words)
print(len(train_tagged_words))


In [None]:
# Report the number of tagged records in each split: train first, then test.
print("Length of Train sent and test set:")
for tagged_words in (train_tagged_words, test_tagged_words):
    print(len(tagged_words))

In [None]:
# Peek at the first ten records of each split: train first, then test.
for tagged_words in (train_tagged_words, test_tagged_words):
    print(tagged_words[:10])

In [None]:
# Collect the distinct tags (second field of every training record) as a list,
# in order of first appearance. dict.fromkeys removes duplicates while keeping
# that order, so the result matches an "append if not seen yet" loop.
tags = list(dict.fromkeys(record[1] for record in train_tagged_words))
print("Total tags:")
print(len(tags))
print("All tags:")
print(tags)

In [None]:
# Build the vocabulary: every distinct word (first field of a training record),
# in order of first appearance.
# dict.fromkeys de-duplicates in a single O(n) pass and preserves insertion
# order, so the resulting list is identical to the one produced by repeatedly
# testing `word not in vocab` on a list -- without the quadratic scan, which
# becomes very slow once the vocabulary holds thousands of words.
vocab = list(dict.fromkeys(record[0] for record in train_tagged_words))
print("Total vocab:")
print(len(vocab))
print("All vocab:")
print(vocab)

Define Emission Probability and Transition Probability

In [None]:
# Emission counts: raw numerator and denominator of P(word | tag).
def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Count how often ``tag`` occurs and how often it labels ``word``.

    Args:
        word: the word to look for (compared with the first field of a record).
        tag: the tag to look for (compared with the second field of a record).
        train_bag: iterable of records indexable as ``record[0]`` (word) and
            ``record[1]`` (tag). Defaults to the training data.

    Returns:
        A tuple ``(count_w_given_tag, count_tag)``: the number of records
        carrying ``tag`` whose word equals ``word``, and the total number of
        records carrying ``tag``. The caller performs the division.
    """
    count_tag = 0
    count_w_given_tag = 0
    for record in train_bag:
        if record[1] == tag:
            count_tag += 1
            if record[0] == word:
                count_w_given_tag += 1
    return (count_w_given_tag, count_tag)
# Transition counts: raw numerator and denominator of P(t2 | t1).
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    """Count occurrences of ``t1`` and of ``t1`` immediately followed by ``t2``.

    The training records are treated as one continuous tag stream; adjacent
    records are paired regardless of where they came from in the input file.

    Args:
        t2: the following tag.
        t1: the preceding tag.
        train_bag: iterable of records whose second field is the tag.
            Defaults to the training data.

    Returns:
        A tuple ``(count_t2_t1, count_t1)``: the number of adjacent positions
        where ``t1`` is directly followed by ``t2``, and the total number of
        occurrences of ``t1``. The caller performs the division.
    """
    tag_sequence = [record[1] for record in train_bag]
    count_t1 = tag_sequence.count(t1)
    # Pair every tag with its successor; the last tag has no successor.
    count_t2_t1 = sum(
        1
        for previous, following in zip(tag_sequence, tag_sequence[1:])
        if previous == t1 and following == t2
    )
    return (count_t2_t1, count_t1)

In [None]:
# Build the t x t transition matrix, where t is the number of tags.
# tags_matrix[i, j] holds P(j-th tag follows i-th tag), i.e. the number of
# times tag i is immediately followed by tag j divided by the count of tag i.
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(tags):
    for j, t2 in enumerate(tags):
        # t2_given_t1 scans the whole training corpus, so call it once per
        # cell and unpack both counts instead of calling it twice.
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_t1 / count_t1

In [None]:
# Show the raw transition matrix as a NumPy array (no row/column labels).
print(tags_matrix)

In [None]:
# Wrap the transition matrix in a DataFrame labelled by tag on both axes so a
# probability can be looked up by name: tags_df.loc[previous_tag, next_tag].
# (Per the original author, this mirrors the transition table in section 3 of
# the accompanying article.)
tag_labels = list(tags)
tags_df = pd.DataFrame(tags_matrix, index=tag_labels, columns=list(tag_labels))
print(tags_df)

Define Viterbi Algorithm

In [None]:
def Viterbi(words, train_bag = train_tagged_words):
    """Tag ``words`` with a greedy, left-to-right HMM decoder.

    For every word the tag that maximises
    ``P(word | tag) * P(tag | previous tag)`` is selected, where the previous
    tag is the one already chosen for the preceding word. Despite its name the
    function keeps no back-pointer table: each decision is final, so this is a
    greedy approximation rather than a full Viterbi search.

    Args:
        words: sequence of words to tag.
        train_bag: iterable of records whose first two fields are the word and
            its tag. It supplies the tag inventory and the emission counts.
            Transition probabilities are read from the module-level
            ``tags_df``, so every tag in ``train_bag`` must be a label there.

    Returns:
        A list of ``(word, tag)`` tuples, one per element of ``words``.

    Raises:
        ValueError: if ``words`` is non-empty but ``train_bag`` has no records.
    """
    from collections import Counter

    # Count tags and (word, tag) pairs in a single pass so emission
    # probabilities become dictionary lookups instead of a full corpus scan
    # for every (word, tag) candidate. This also makes the emission model
    # honour the train_bag argument.
    tag_counts = Counter()
    emission_counts = Counter()
    for pair in train_bag:
        tag_counts[pair[1]] += 1
        emission_counts[(pair[0], pair[1])] += 1

    # Counter keeps first-seen order, so ties are broken the same way on every
    # run (a set would order the tags arbitrarily).
    T = list(tag_counts)

    state = []
    for word in words:
        # Score of every candidate tag for the current word.
        p = []
        for tag in T:
            if not state:
                # No previous tag exists for the first word. Use a uniform
                # prior so the emission probability decides; a transition of 0
                # would zero every score and always select T[0].
                transition_p = 1.0
            else:
                transition_p = tags_df.loc[state[-1], tag]

            emission_p = emission_counts[(word, tag)] / tag_counts[tag]
            p.append(emission_p * transition_p)

        # Keep the highest-scoring tag; on ties the earliest tag in T wins.
        state.append(T[p.index(max(p))])
    return list(zip(words, state))

Create Test set

In [None]:
# The evaluation run uses the complete test split.
test_run = test_tagged_words
print(len(test_run))
# Gold-standard records, converted from lists to tuples so they can be compared
# with the (word, tag) tuples produced by the tagger.
test_seq = list(map(tuple, test_tagged_words))
print("Test sentences:")
print(test_seq)
# Bare words (first field of every record): the tagger's input.
test_words = [record[0] for record in test_seq]

Predict the tags for test set using Viterbi algorithm

In [None]:
# Tag every word of the test set and measure how long decoding takes.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring durations; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
tagged_seq = Viterbi(test_words)
end = time.perf_counter()
difference = end-start
print("Predicted sequence:")
print(tagged_seq)
print("Predicted sequence lengh:")
print(len(tagged_seq) )
# Show the last ten predictions as a quick sanity check.
print(tagged_seq[-10:])
print("Time taken in seconds: ", difference)

Calculate Accuracy

In [None]:
# Accuracy: the fraction of positions where the predicted tuple is exactly
# equal to the gold tuple at the same index, reported as a percentage.
check = [predicted for predicted, gold in zip(tagged_seq, test_seq) if predicted == gold]
print(check)
correct_count = len(check)
accuracy = correct_count/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

In [None]:
# Pair each prediction with the gold record at the same position and keep the
# triple (word, predicted tag, gold tag) wherever the words agree.
res = [
    (predicted[0], predicted[1], gold[1])
    for predicted, gold in zip(tagged_seq, test_seq)
    if predicted[0] == gold[0]
]
# Write one tab-separated triple per line to res.txt.
with open("res.txt", "w") as f:
    f.writelines("\t".join(row) + "\n" for row in res)