

## Program to find the probability of POS tag of a word

## Aim: To understand the context of the given sentence by tagging each of its words with its part of speech

## Objective:
### To use the Hidden Markov Model to find the POS tags for the given sentence by calculating the transition and emission probabilities.

In [None]:
import numpy as np
import nltk
from nltk import pos_tag
from nltk import word_tokenize
import re
from collections import Counter
import pandas as pd

In [None]:
# Fetch the NLTK data packages used below: the 'punkt' tokenizer data, the
# averaged-perceptron tagger model and the universal tagset mapping.
# Each call returns a status flag; the notebook echoes the last one.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
# Toy corpus: four sentences with every token (including the '.' terminators)
# separated by single spaces. 'Book'/'book' and 'Park'/'park' occur in both cases.
txt='Book a car . Park the car . The book is in the car . The car is in the park .'

# Calculate Emission probability

In [None]:
# Split the corpus into tokens and count how often each distinct token occurs.
# Counting is case-sensitive, so 'The' and 'the' are tallied separately.
tkn=word_tokenize(txt)
wrd_cnt=Counter(tkn)
print('Word count',wrd_cnt)

Word count Counter({'car': 4, '.': 4, 'the': 3, 'The': 2, 'is': 2, 'in': 2, 'Book': 1, 'a': 1, 'Park': 1, 'book': 1, 'park': 1})


In [None]:
# Build consecutive token pairs over the whole token stream and count them.
# The stream is not split per sentence, so pairs such as ('.', 'The') that
# straddle a sentence boundary are included.
bgrm=list(nltk.bigrams(tkn))
b_count=Counter(bgrm)
print('Bi-gram count\n',b_count)

Bi-gram count
 Counter({('car', '.'): 3, ('the', 'car'): 2, ('.', 'The'): 2, ('is', 'in'): 2, ('in', 'the'): 2, ('Book', 'a'): 1, ('a', 'car'): 1, ('.', 'Park'): 1, ('Park', 'the'): 1, ('The', 'book'): 1, ('book', 'is'): 1, ('The', 'car'): 1, ('car', 'is'): 1, ('the', 'park'): 1, ('park', '.'): 1})


In [None]:
# Tag every token, asking for the coarse 'universal' tagset (VERB, NOUN, DET, ...).
tag=pos_tag(tkn,tagset='universal')
# Collapse to a word -> tag lookup. A dict keeps one value per key, so if the
# same spelling were tagged differently in two places only the last tag survives.
tag_d=dict(tag)
print(tag)
print('Word - Tags',tag_d)

[('Book', 'VERB'), ('a', 'DET'), ('car', 'NOUN'), ('.', '.'), ('Park', 'VERB'), ('the', 'DET'), ('car', 'NOUN'), ('.', '.'), ('The', 'DET'), ('book', 'NOUN'), ('is', 'VERB'), ('in', 'ADP'), ('the', 'DET'), ('car', 'NOUN'), ('.', '.'), ('The', 'DET'), ('car', 'NOUN'), ('is', 'VERB'), ('in', 'ADP'), ('the', 'DET'), ('park', 'NOUN'), ('.', '.')]
Word - Tags {'Book': 'VERB', 'a': 'DET', 'car': 'NOUN', '.': '.', 'Park': 'VERB', 'the': 'DET', 'The': 'DET', 'book': 'NOUN', 'is': 'VERB', 'in': 'ADP', 'park': 'NOUN'}


In [None]:
# Count how often each (word, tag) pair occurs, leaving out the sentence
# terminator ('.', '.'). A single Counter pass replaces calling tag.count()
# once per token, which rescanned the whole list every time. Wrapping the
# result in dict() keeps the printed form and the first-seen key order.
t_count=dict(Counter(pair for pair in tag if pair!=('.','.')))
print("Count of word, tags\n",t_count)

Count of word, tags
 {('Book', 'VERB'): 1, ('a', 'DET'): 1, ('car', 'NOUN'): 4, ('Park', 'VERB'): 1, ('the', 'DET'): 3, ('The', 'DET'): 2, ('book', 'NOUN'): 1, ('is', 'VERB'): 2, ('in', 'ADP'): 2, ('park', 'NOUN'): 1}


In [None]:
# One summary line per distinct token: its frequency and its looked-up tag.
for word, total in wrd_cnt.items():
  print(f'Word : {word}\tTotal : {total}\tTag : {tag_d[word]}')

Word : Book	Total : 1	Tag : VERB
Word : a	Total : 1	Tag : DET
Word : car	Total : 4	Tag : NOUN
Word : .	Total : 4	Tag : .
Word : Park	Total : 1	Tag : VERB
Word : the	Total : 3	Tag : DET
Word : The	Total : 2	Tag : DET
Word : book	Total : 1	Tag : NOUN
Word : is	Total : 2	Tag : VERB
Word : in	Total : 2	Tag : ADP
Word : park	Total : 1	Tag : NOUN


In [None]:
# Number of tokens carrying each POS tag (the '.' tag is still counted here)
pos_count = dict(Counter(pos for _, pos in tag))
print(pos_count)

{'VERB': 4, 'DET': 6, 'NOUN': 6, '.': 4, 'ADP': 2}


In [None]:
# Empty emission table: one row per distinct word, one column per tag of
# interest. Cells start out as NaN and are filled in by a later cell.
tags = ['VERB', 'DET', 'NOUN', 'ADP']
em = pd.DataFrame(index=list(tag_d), columns=tags)
em

Unnamed: 0,VERB,DET,NOUN,ADP
Book,,,,
a,,,,
car,,,,
.,,,,
Park,,,,
the,,,,
The,,,,
book,,,,
is,,,,
in,,,,


In [None]:
# Sanity check: for every (word, tag) pair, print the corpus-wide count of its tag
for i, cnt in t_count.items():
  p_count = pos_count[i[1]]
  print(p_count)

4
6
6
4
6
6
6
4
2
6


In [None]:
# Emission probability: P(word | tag) = count(word, tag) / count(tag)
print('Emission Probability Calculation')
for pair, pair_total in t_count.items():
  # each key of t_count is a (word, tag) tuple, its value the pair's frequency
  word, pos = pair
  # share of this tag's occurrences that are this word, kept to 2 decimals
  e_prob = round(pair_total / pos_count[pos], 2)
  print(pair, '/', pos, '=', e_prob)
  # record it in the emission table (row = word, column = tag)
  em.loc[word, pos] = e_prob

Emission Probability Calculation
('Book', 'VERB') / VERB = 0.25
('a', 'DET') / DET = 0.17
('car', 'NOUN') / NOUN = 0.67
('Park', 'VERB') / VERB = 0.25
('the', 'DET') / DET = 0.5
('The', 'DET') / DET = 0.33
('book', 'NOUN') / NOUN = 0.17
('is', 'VERB') / VERB = 0.5
('in', 'ADP') / ADP = 1.0
('park', 'NOUN') / NOUN = 0.17


In [None]:
# Finalise the emission matrix: word/tag combinations that never occurred get
# probability 0, and the '.' row is removed because punctuation is not scored.
em = em.fillna(0)
em = em.drop(index='.')
display(em)

Unnamed: 0,VERB,DET,NOUN,ADP
Book,0.25,0.0,0.0,0.0
a,0.0,0.17,0.0,0.0
car,0.0,0.0,0.67,0.0
Park,0.25,0.0,0.0,0.0
the,0.0,0.5,0.0,0.0
The,0.0,0.33,0.0,0.0
book,0.0,0.0,0.17,0.0
is,0.5,0.0,0.0,0.0
in,0.0,0.0,0.0,1.0
park,0.0,0.0,0.17,0.0


# Calculate Transition Probability

In [None]:
# Echo the word -> tag lookup that the next cell uses to relabel the text
tag_d

{'Book': 'VERB',
 'a': 'DET',
 'car': 'NOUN',
 '.': '.',
 'Park': 'VERB',
 'the': 'DET',
 'The': 'DET',
 'book': 'NOUN',
 'is': 'VERB',
 'in': 'ADP',
 'park': 'NOUN'}

In [None]:
# Same corpus as before, but with explicit sentence markers: <s> opens and
# </s> closes every sentence, so sentence starts and ends can be modelled.
txt1='''<s> Book a car </s> <s> Park the car </s> <s> The book is in the car </s> <s> The car is in the park </s>'''
print("\nText\n",txt1)

# Token list of the marked-up text (plain split on single spaces)
l=txt1.split(' ')

# Swap every word for its tag; the two markers have no tag of their own and
# are carried over unchanged.
n_txt=[tok if tok in ('<s>','</s>') else tag_d[tok] for tok in l]

# The tag sequence as one readable string
n_t=' '.join(n_txt)
print("\nText replaced with tags\n",n_t)

# Unigram and bigram counts over the tag sequence
w_c=Counter(n_txt)
b1=list(nltk.bigrams(n_txt))
bi_d=Counter(b1)
print("\nBi-gram of tagged text\n",bi_d)

# Empty transition table: rows are the current state, columns the next state.
# <s> can only be left and </s> can only be entered, hence the asymmetry.
ind=['<s>','NOUN','VERB','DET','ADP']
col=['NOUN','VERB','DET','ADP','</s>']
t=pd.DataFrame(index=ind,columns=col)


Text
 <s> Book a car </s> <s> Park the car </s> <s> The book is in the car </s> <s> The car is in the park </s>

Text replaced with tags
 <s> VERB DET NOUN </s> <s> VERB DET NOUN </s> <s> DET NOUN VERB ADP DET NOUN </s> <s> DET NOUN VERB ADP DET NOUN </s>

Bi-gram of tagged text
 Counter({('DET', 'NOUN'): 6, ('NOUN', '</s>'): 4, ('</s>', '<s>'): 3, ('<s>', 'VERB'): 2, ('VERB', 'DET'): 2, ('<s>', 'DET'): 2, ('NOUN', 'VERB'): 2, ('VERB', 'ADP'): 2, ('ADP', 'DET'): 2})


In [None]:
# Transition probability:
#   P(next tag | current tag) = count(current, next) / count(current)
print('\nCalculating the Transition Probability\n')
for i in b1:
  # b1 holds every tag bigram of the marked-up text, repeats included, so the
  # same transition is printed (and stored) once per occurrence
  prev_tag,next_tag=i
  # bi_d[i]: how often this bigram occurs; w_c[prev_tag]: how often its first
  # tag occurs; the ratio is kept to 2 decimals
  t_prob=round(bi_d[i]/w_c[prev_tag],2)
  print(i,'/',prev_tag,'=',t_prob,'\n')
  # store in the table (row = current tag, column = next tag)
  t.loc[prev_tag,next_tag]=t_prob

# Transition matrix
# The ('</s>', '<s>') bigram that straddles two sentences makes the loop above
# grow the table by a '</s>' row and a '<s>' column; take them out again.
# errors='ignore' matters for single-sentence text: no such bigram exists, the
# extra row/column never appear, and a plain drop() would raise KeyError.
t=t.drop(index='</s>',columns='<s>',errors='ignore')
# transitions that were never observed get probability 0
t.fillna(0,inplace=True)
print('\nTransition matrix\n')
display(t)


Calculating the Transition Probability

('<s>', 'VERB') / <s> = 0.5 

('VERB', 'DET') / VERB = 0.5 

('DET', 'NOUN') / DET = 1.0 

('NOUN', '</s>') / NOUN = 0.67 

('</s>', '<s>') / </s> = 0.75 

('<s>', 'VERB') / <s> = 0.5 

('VERB', 'DET') / VERB = 0.5 

('DET', 'NOUN') / DET = 1.0 

('NOUN', '</s>') / NOUN = 0.67 

('</s>', '<s>') / </s> = 0.75 

('<s>', 'DET') / <s> = 0.5 

('DET', 'NOUN') / DET = 1.0 

('NOUN', 'VERB') / NOUN = 0.33 

('VERB', 'ADP') / VERB = 0.5 

('ADP', 'DET') / ADP = 1.0 

('DET', 'NOUN') / DET = 1.0 

('NOUN', '</s>') / NOUN = 0.67 

('</s>', '<s>') / </s> = 0.75 

('<s>', 'DET') / <s> = 0.5 

('DET', 'NOUN') / DET = 1.0 

('NOUN', 'VERB') / NOUN = 0.33 

('VERB', 'ADP') / VERB = 0.5 

('ADP', 'DET') / ADP = 1.0 

('DET', 'NOUN') / DET = 1.0 

('NOUN', '</s>') / NOUN = 0.67 


Transition matrix



Unnamed: 0,NOUN,VERB,DET,ADP,</s>
<s>,0.0,0.5,0.5,0.0,0.0
NOUN,0.0,0.33,0.0,0.0,0.67
VERB,0.0,0.0,0.5,0.5,0.0
DET,1.0,0.0,0.0,0.0,0.0
ADP,0.0,0.0,1.0,0.0,0.0


In [None]:
# Show both finished tables together for reference
for heading, frame in (('\nEmission Matrix\n', em), ('\nTransition Matrix\n', t)):
  print(heading)
  display(frame)


Emission Matrix



Unnamed: 0,VERB,DET,NOUN,ADP
Book,0.25,0.0,0.0,0.0
a,0.0,0.17,0.0,0.0
car,0.0,0.0,0.67,0.0
Park,0.25,0.0,0.0,0.0
the,0.0,0.5,0.0,0.0
The,0.0,0.33,0.0,0.0
book,0.0,0.0,0.17,0.0
is,0.5,0.0,0.0,0.0
in,0.0,0.0,0.0,1.0
park,0.0,0.0,0.17,0.0



Transition Matrix



Unnamed: 0,NOUN,VERB,DET,ADP,</s>
<s>,0.0,0.5,0.5,0.0,0.0
NOUN,0.0,0.33,0.0,0.0,0.67
VERB,0.0,0.0,0.5,0.5,0.0
DET,1.0,0.0,0.0,0.0,0.0
ADP,0.0,0.0,1.0,0.0,0.0


In [None]:
# Find the POS tag
# Formula
# p(verb/book,noun)=p(book/verb)*p(verb/noun)
# p(noun/book,verb)=p(book/noun)*p(noun/verb)

# prob() distinguishes three cases because the words were not converted to a
# single case: some words occur both capitalised and in lower case (sometimes
# with different tags), and that distinction is deliberately kept as it is.

def prob(word):
  """Score the candidate POS tag(s) of ``word`` using the global tables.

  Reads three module-level objects: ``tag_d`` (word -> tag), ``em`` (emission
  matrix, indexed ``em.loc[word, tag]``) and ``t`` (transition matrix, indexed
  ``t.loc[previous_tag, next_tag]``).

  Cases, decided by which case variants of ``word`` are present in ``tag_d``:

  1. Only one variant is known (the capitalised or the lower-case form is
     missing): the score is simply the word's emission probability.
  2. Both variants are known and share a tag: the score is the product of the
     two emission probabilities, rounded to 5 decimals.
  3. Both variants are known with different tags: each tag is scored as
     emission * transition, rounded to 5 decimals, and both are returned.

  Args:
    word: a token spelled exactly as in ``tag_d`` (lookups are case-sensitive).

  Returns:
    dict mapping tag -> score; one entry in cases 1 and 2, up to two in case 3.

  Raises:
    KeyError: if ``word`` is not a key of ``tag_d`` (or is missing from ``em``).
  """
  if word not in tag_d:
    # every path below needs tag_d[word]; fail early with a readable message
    raise KeyError(f'{word!r} has no entry in tag_d')

  cap=word.capitalize()
  low=word.lower()
  w={}

  # Case 1: only one spelling is known, e.g. 'a', 'is', 'in'. This also covers
  # a word seen only capitalised (say 'Book' without 'book'), which used to
  # fall through and crash on tag_d[word.lower()].
  if cap not in tag_d or low not in tag_d:
    # em.loc[row,column] ---> em.loc[word,tag]
    w[tag_d[word]]=em.loc[word,tag_d[word]]
    return w

  # Case 2: both spellings carry the same tag, e.g. 'the' / 'The'
  if tag_d[cap]==tag_d[low]:
    # combine the emission probabilities of the two spellings
    tot_prob=round(em.loc[low,tag_d[low]]*em.loc[cap,tag_d[cap]],5)
    w[tag_d[word]]=tot_prob
    return w

  # Case 3: the spellings disagree on the tag, e.g. 'Book' (VERB) / 'book' (NOUN).
  # Both candidates are scored; the caller picks the larger one.

  # the tag of the word as given, entered from the lower-case form's tag
  prob1=em.loc[word,tag_d[word]]
  prob2=t.loc[tag_d[low],tag_d[word]]
  w[tag_d[word]]=round(prob1*prob2,5)

  # the tag of the lower-case form, entered from the given word's tag
  # (for a lower-case input this is the same key and overwrites the entry above)
  prob_1=em.loc[low,tag_d[low]]
  prob_2=t.loc[tag_d[word],tag_d[low]]
  w[tag_d[low]]=round(prob_1*prob_2,5)
  return w

In [None]:
# Score every distinct word of the marked-up text once, in order of first
# appearance; `empty` collects the words that have already been handled.
empty=[]
for i in l:
  # sentence markers carry no tag, and repeated words were already reported
  if i in ('<s>','</s>') or i in empty:
    continue
  d1=prob(i)
  empty.append(i)
  # the highest score wins; on equal scores the (score, tag) tuples compare by
  # tag name, so the alphabetically later tag is chosen
  Keymax=max((score,label) for label,score in d1.items())[1]
  print(f'The probability of tag "{i}" is "{Keymax}" ---> {d1}\n')

The probability of tag "Book" is "VERB" ---> {'VERB': 0.0825, 'NOUN': 0.0}

The probability of tag "a" is "DET" ---> {'DET': 0.17}

The probability of tag "car" is "NOUN" ---> {'NOUN': 0.67}

The probability of tag "Park" is "VERB" ---> {'VERB': 0.0825, 'NOUN': 0.0}

The probability of tag "the" is "DET" ---> {'DET': 0.165}

The probability of tag "The" is "DET" ---> {'DET': 0.165}

The probability of tag "book" is "NOUN" ---> {'NOUN': 0.0}

The probability of tag "is" is "VERB" ---> {'VERB': 0.5}

The probability of tag "in" is "ADP" ---> {'ADP': 1.0}

The probability of tag "park" is "NOUN" ---> {'NOUN': 0.0}



In [None]:
# For the words "book" and "park" the tag NOUN is reported even though its
# probability is 0: p(noun/verb) is 0, so multiplying it by the emission
# probability also gives 0. Unlike "Book" and "Park", which have both VERB and
# NOUN as candidate tags, these words have no other tag, so only NOUN is considered.

# Conclusion:
### The emission probabilities give the probability of a word belonging to a particular tag. In the emission matrix above, the probability of the word "Book" being a verb is 25%, the probability of "a" being a determiner is 17%, and so on. Next we calculate the probability of moving between states, that is, the transition probability, with the help of the bigram model. Finally, we find the POS tag for a given word by multiplying its emission and transition probabilities.