# A toy example

In [3]:
import jieba
import re
import copy

from sklearn.preprocessing import LabelEncoder


import numpy as np

from collections import OrderedDict

In [4]:
sentence = ['我', '是', '中国人']
emission_prob = {
    ('N', '我'): 0.01,
    ('N', '中国人'): 0.002,
    ('V', '是'): 0.05
}
transition_prob = {
    ('*', 'N'): 0.02,
    ('*', 'V'): 0.0009,
    ('N', 'V'): 0.05,
    ('V', 'N'): 0.04
}

In [5]:
n_taggers = 2
tagger_encoder = LabelEncoder()
encoded_taggers = tagger_encoder.fit_transform(['N', 'V'])

## Algorithm: PDF (P 32-38)

In [6]:
dynamic_table = np.ones([n_taggers, len(sentence)], dtype=np.float32)*(-1)
for j in range(n_taggers):
    tagger_to_tagger = ('*', tagger_encoder.inverse_transform(j))
    word_on_tagger = (tagger_encoder.inverse_transform(j), sentence[0])
    dynamic_table[j, 0] = transition_prob.get(tagger_to_tagger,0.0)*emission_prob.get(word_on_tagger,0.0)
for i in range(1, len(sentence)):
    for j in range(n_taggers):
        word_on_tagger = (tagger_encoder.inverse_transform(j),sentence[i])
        max_at_i = 0.0
        # loop all states in previous step
        for k in range(n_taggers):
            tagger_to_tagger = (tagger_encoder.inverse_transform(k),tagger_encoder.inverse_transform(j))
            prob_k_j = dynamic_table[k, i-1]*transition_prob.get(tagger_to_tagger,0.0)
            if max_at_i < prob_k_j:
                max_at_i = prob_k_j
        dynamic_table[j, i] = max_at_i
        tmp = dynamic_table[j, i]
        dynamic_table[j, i] = dynamic_table[j, i]*emission_prob.get(word_on_tagger, 0.0)

back_pointer = np.ones((len(sentence, )), dtype=np.int32)*(-1)
for i in range(len(sentence) - 1, -1, -1):
    max_at_i = 0.0
    max_j = 0.0
    for j in range(n_taggers):
        if dynamic_table[j, i] > max_at_i:
            max_at_i = dynamic_table[j, i]
            max_j = j
            back_pointer[i] = j

In [7]:
print ('Decoding result: ', tagger_encoder.inverse_transform(back_pointer))

Decoding result:  ['N' 'V' 'N']


## Use nltk to build Hidden Markov Model (optional, NLTK may have bug)

In [8]:
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm

In [9]:
train_data = treebank.tagged_sents()[:3000]

print (train_data[0])

LookupError: 
**********************************************************************
  Resource 'corpora/treebank/combined' not found.  Please use the
  NLTK Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - 'C:\\Users\\haiha/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Users\\haiha\\Anaconda3\\nltk_data'
    - 'C:\\Users\\haiha\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\haiha\\AppData\\Roaming\\nltk_data'
**********************************************************************

In [None]:
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)

In [None]:
print tagger

print tagger.tag("Today is a good day .".split())

print tagger.tag("Joe met Joanne in Delhi .".split())

print tagger.tag("Chicago is the birthplace of Ginny".split())