<a href="https://colab.research.google.com/github/ManasviAtGitHub/Natural-Language-Processing/blob/main/Part_of_speech_tagging_with_HMM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Refrences : https://github.com/soheil-mp/Natural-Language-Processing-Tutorials/blob/master/Webinar%2003%20-%20Parts%20of%20Speech%20Tagging%20with%20HMM/Webinar%203%20-%20Parts%20of%20Speech%20Tagging%20with%20HMM.ipynb



POS Tagging using nltk

In [1]:
!pip install pomegranate

Collecting pomegranate
  Downloading pomegranate-0.14.8.tar.gz (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 4.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: pomegranate
  Building wheel for pomegranate (PEP 517) ... [?25l[?25hdone
  Created wheel for pomegranate: filename=pomegranate-0.14.8-cp37-cp37m-linux_x86_64.whl size=15006487 sha256=8054f9efbd72ee38183db1880af7767b31dcd907fbccaf8c491eca3dac701233
  Stored in directory: /root/.cache/pip/wheels/24/68/69/0eaab474ef1d65abedcd47de8a38ab21d221d329954d7edd24
Successfully built pomegranate
Installing collected packages: pomegranate
Successfully installed pomegranate-0.14.8


In [2]:
# Importing the libraries
import nltk
import re
import numpy as np
import matplotlib.pyplot as plt
import pomegranate


In [3]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
# Sample text
text = "As I was waiting, a man came out of a side room, and at a glance I was sure he must be Long John."

In [5]:
# Lower case
text = text.lower()

# remove punctuation
text = re.sub(r"[^a-zA-Z0-9]", " ", text)

# Tokenizing
text = text.split()
print("Preprocessed text: \n", text)

Preprocessed text: 
 ['as', 'i', 'was', 'waiting', 'a', 'man', 'came', 'out', 'of', 'a', 'side', 'room', 'and', 'at', 'a', 'glance', 'i', 'was', 'sure', 'he', 'must', 'be', 'long', 'john']


In [6]:
# Parts of speech tagging using NLTK
nltk.pos_tag(text)

[('as', 'IN'),
 ('i', 'NN'),
 ('was', 'VBD'),
 ('waiting', 'VBG'),
 ('a', 'DT'),
 ('man', 'NN'),
 ('came', 'VBD'),
 ('out', 'IN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('side', 'NN'),
 ('room', 'NN'),
 ('and', 'CC'),
 ('at', 'IN'),
 ('a', 'DT'),
 ('glance', 'NN'),
 ('i', 'NN'),
 ('was', 'VBD'),
 ('sure', 'JJ'),
 ('he', 'PRP'),
 ('must', 'MD'),
 ('be', 'VB'),
 ('long', 'JJ'),
 ('john', 'NN')]

Hidden Markov Model

In [7]:
# Initiaize the Hidden Markov Model (HMM)
model = pomegranate.HiddenMarkovModel()

In [8]:
### Emission probabilities

# "Noun" emission and state
emission_noun = pomegranate.DiscreteDistribution({"mary": 4/9, 
                                                  "jane": 2/9, 
                                                  "will": 1/9, 
                                                  "spot": 2/9, 
                                                  "can": 0, 
                                                  "see": 0, 
                                                  "pat": 0})
state_noun = pomegranate.State(emission_noun, name = "noun")




# "Modal" emission and state
emission_modal = pomegranate.DiscreteDistribution({"mary": 0, 
                                                  "jane": 0, 
                                                  "will": 3/4, 
                                                  "spot": 0, 
                                                  "can": 1/4, 
                                                  "see": 0, 
                                                  "pat": 0})
state_modal = pomegranate.State(emission_noun, name = "modal")

# "Verb" emission and state
emission_verb = pomegranate.DiscreteDistribution({"mary": 0, 
                                                  "jane": 0, 
                                                  "will": 0, 
                                                  "spot": 1/4, 
                                                  "can": 0, 
                                                  "see": 2/4, 
                                                  "pat": 1/4})
state_verb = pomegranate.State(emission_noun, name = "verb")

In [9]:
# Add the states into our HMM model
model.add_states(state_noun, state_modal, state_verb)

In [10]:
### Transition probabilities

# Transition probabilities from "<S>" to "noun", "modal", "verb"
model.add_transition(model.start, state_noun, 3/4)
model.add_transition(model.start, state_modal, 1/4)
model.add_transition(model.start, state_verb, 0)

# Transition probabilities from "noun" to "noun", modal", "verb"
model.add_transition(state_noun, state_noun, 1/9)
model.add_transition(state_noun, state_modal, 1/3)
model.add_transition(state_noun, state_verb, 1/9)

# Transition probabilities from "modal" to "noun", modal", "verb"
model.add_transition(state_modal, state_noun, 1/4)
model.add_transition(state_modal, state_modal, 0)
model.add_transition(state_modal, state_verb, 3/4)

# Transition probabilities from "verb" to "noun", modal", "verb"
model.add_transition(state_verb, state_noun, 1)
model.add_transition(state_verb, state_modal, 0)
model.add_transition(state_verb, state_verb, 0)

# Transition probabilities from "noun", modal", "verb" to "<E>" 
model.add_transition(state_noun, model.end, 4/9)
model.add_transition(state_modal, model.end, 0)
model.add_transition(state_verb, model.end, 0)

In [11]:
# Finalize the model
model.bake()

In [12]:
# Changing "None-end" to "end"
model.states[-1].name = "<E>"

# Changing "None-end" to "end"
model.states[-2].name = "<S>"

In [None]:
model.states

[{
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "str",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "mary" : 0.4444444444444444,
                 "jane" : 0.2222222222222222,
                 "will" : 0.1111111111111111,
                 "spot" : 0.2222222222222222,
                 "can" : 0,
                 "see" : 0,
                 "pat" : 0
             }
         ],
         "frozen" : false
     },
     "name" : "modal",
     "weight" : 1.0
 }, {
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "str",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "mary" : 0.4444444444444444,
                 "jane" : 0.2222222222222222,
                 "will" : 0.1111111111111111,
                 "spot" : 0.2222222222222222,
                 "can" : 0,
                 "see" :

In [13]:
# Sample text
text = "Jane will spot Will."
print(text)

Jane will spot Will.


In [14]:
### Preprocessing the text

# Lower case
text = text.lower()

# remove punctuation
text = re.sub(r"[^a-zA-Z0-9]", " ", text)

# Tokenizing
text = text.split()
print("Preprocessed text: \n", text)

Preprocessed text: 
 ['jane', 'will', 'spot', 'will']


In [16]:
### Prediction
viterbi_likelihood, viterbi_path = model.viterbi(text)

In [17]:
viterbi_path

[(3, {
      "class" : "State",
      "distribution" : null,
      "name" : "<S>",
      "weight" : 1.0
  }), (1, {
      "class" : "State",
      "distribution" : {
          "class" : "Distribution",
          "dtype" : "str",
          "name" : "DiscreteDistribution",
          "parameters" : [
              {
                  "mary" : 0.4444444444444444,
                  "jane" : 0.2222222222222222,
                  "will" : 0.1111111111111111,
                  "spot" : 0.2222222222222222,
                  "can" : 0,
                  "see" : 0,
                  "pat" : 0
              }
          ],
          "frozen" : false
      },
      "name" : "noun",
      "weight" : 1.0
  }), (0, {
      "class" : "State",
      "distribution" : {
          "class" : "Distribution",
          "dtype" : "str",
          "name" : "DiscreteDistribution",
          "parameters" : [
              {
                  "mary" : 0.4444444444444444,
                  "jane" : 0.222222222222222

In [18]:

# Get the final viterbi path
viterbi_path = [i_state[1].name for i_state in viterbi_path]

print("The most likely sequence: \n", viterbi_path, "\n")

The most likely sequence: 
 ['<S>', 'noun', 'modal', 'verb', 'noun', '<E>'] 



In [19]:
print("With likelihood of (approximate):\n {:.3f}%".format(np.exp(viterbi_likelihood)*100))

With likelihood of (approximate):
 0.005%
