# HMM Part-of-Speech Tagging

Accuracy:

* v1.0: ignore sentences containing unknown words
    * 21492/22470 = **0.9564753004005341**
* v1.0: include sentences containing unknown words (no special handling for them)
    * 21975/101136 = **0.21728168011390603**
* v1.1: for unknown words, drop the emission term and score each tag by the transition p(t|t_prev) only
    * 93948/101136 = **0.9289273849074513**
* v1.2: also model p(t|t_prev) at sentence boundaries, using a start symbol (`^`) and an end symbol (`$`)
    * 94016/101136 = **0.9295997468754944**

In [16]:
# People's Daily corpus.
# Download the corpus from the link below and unzip it first:
# http://icl.pku.edu.cn/icl_groups/corpus/dwldform1.asp

f = '199801/199801.txt'

# The file is read as bytes and decoded explicitly as GBK, then cut into lines.
with open(f, 'rb') as fin:
    all = fin.read().decode('gbk').split('\n')

# Line count, a blank separator line, then a preview of the first ten lines.
print(len(all), end='\n\n')
preview = all[:10]
for line in preview:
    print(line)

22723

19980101-01-001-001/m  迈向/v  充满/v  希望/n  的/u  新/a  世纪/n  ——/w  一九九八年/t  新年/t  讲话/n  （/w  附/v  图片/n  １/m  张/q  ）/w  
19980101-01-001-002/m  中共中央/nt  总书记/n  、/w  国家/n  主席/n  江/nr  泽民/nr  
19980101-01-001-003/m  （/w  一九九七年/t  十二月/t  三十一日/t  ）/w  
19980101-01-001-004/m  １２月/t  ３１日/t  ，/w  中共中央/nt  总书记/n  、/w  国家/n  主席/n  江/nr  泽民/nr  发表/v  １９９８年/t  新年/t  讲话/n  《/w  迈向/v  充满/v  希望/n  的/u  新/a  世纪/n  》/w  。/w  （/w  新华社/nt  记者/n  兰/nr  红光/nr  摄/Vg  ）/w  
19980101-01-001-005/m  同胞/n  们/k  、/w  朋友/n  们/k  、/w  女士/n  们/k  、/w  先生/n  们/k  ：/w  
19980101-01-001-006/m  在/p  １９９８年/t  来临/v  之际/f  ，/w  我/r  十分/m  高兴/a  地/u  通过/p  [中央/n  人民/n  广播/vn  电台/n]nt  、/w  [中国/ns  国际/n  广播/vn  电台/n]nt  和/c  [中央/n  电视台/n]nt  ，/w  向/p  全国/n  各族/r  人民/n  ，/w  向/p  [香港/ns  特别/a  行政区/n]ns  同胞/n  、/w  澳门/ns  和/c  台湾/ns  同胞/n  、/w  海外/s  侨胞/n  ，/w  向/p  世界/n  各国/r  的/u  朋友/n  们/k  ，/w  致以/v  诚挚/a  的/u  问候/vn  和/c  良好/a  的/u  祝愿/vn  ！/w  
19980101-01-001-007/m  １９９７年/t  ，/w  是/v  中国/ns  发展/vn  历史/n  上/f  非

In [8]:
'''
First-order HMM

p(t1..tn, w1..wn) = p(w1..wn|t1..tn) * p(t1..tn)
                  = p(w1|t1)*p(w2|t2)..p(wn|tn) * p(t1)*p(t2|t1)..p(tn|tn-1)

This cell only accumulates raw counts; the normalize step turns them into
log-probabilities.
'''

f = '199801/199801.txt'
with open(f, 'rb') as fin:
    all = fin.read().decode('gbk').split('\n')

# Positional split: the leading `fold` share of the lines is used for
# training, the remainder is held out for evaluation.
fold = 0.9
train_len = int(len(all) * fold)
train_set, test_set = all[:train_len], all[train_len:]

from collections import defaultdict # matrix as nested dict
prob_word_by_tag = dict() # emit matrix: tag -> {word: count}
prob_tag_by_tag = dict()  # tran matrix: prev_tag -> {tag: count}

for line in train_set:
    tokens = line.split()[1:] # drop the leading timestamp token
    prev_tag = None
    for token in tokens:
        if not token.strip():
            continue

        # NOTE(review): tokens are only split on '/', so a word that opens a
        # bracketed compound keeps its leading '[' as part of the word.
        word, tag = token.split('/')
        # A tag such as 'n]nt' closes a bracketed compound: keep the part
        # before ']' and ignore the compound's own tag.
        tag = tag.split(']')[0]

        # Emission count for (tag, word).
        if tag not in prob_word_by_tag:
            prob_word_by_tag[tag] = defaultdict(lambda: -float('inf'))
        word_counts = prob_word_by_tag[tag]
        word_counts[word] = word_counts.get(word, 0) + 1

        # Transition count for (prev_tag, tag); the first token of a line has
        # no predecessor and contributes no transition.
        if prev_tag:
            if prev_tag not in prob_tag_by_tag:
                prob_tag_by_tag[prev_tag] = defaultdict(lambda: -float('inf'))
            next_counts = prob_tag_by_tag[prev_tag]
            next_counts[tag] = next_counts.get(tag, 0) + 1
        prev_tag = tag

In [9]:
# normalize

import math
import sys

# Replace every raw count with log(count / row_total), in place. Working with
# logs keeps a long chain product of probabilities from collapsing to zero.
# NOTE: this rewrites the tables themselves, so it is meant to run once per
# counting pass.
for words_cnt in prob_word_by_tag.values():
    ntotal = sum(words_cnt.values())
    for w in list(words_cnt):
        words_cnt[w] = math.log(float(words_cnt[w]) / ntotal)
for tags_cnt in prob_tag_by_tag.values():
    ntotal = sum(tags_cnt.values())
    for t in list(tags_cnt):
        tags_cnt[t] = math.log(float(tags_cnt[t]) / ntotal)

In [10]:
# Spot checks on the normalized tables.
verb_emissions = prob_word_by_tag['v']
print(verb_emissions['改革'])
# An unseen word falls back to the defaultdict value (-inf); note that the
# lookup itself stores that value under the key.
print(verb_emissions['xxxx'])
print(prob_tag_by_tag['v']['n'])
print(prob_tag_by_tag.keys())
print(prob_word_by_tag.keys())

# Every transition log-probability out of tag 'z'.
z_row = prob_tag_by_tag['z']
for t in z_row:
    print('z=>{0}: {1}'.format(t, z_row[t]))

-5.794260869027279
-inf
-1.678308670888908
dict_keys(['Mg', 'r', 'Ng', 'p', 'd', 'l', 'c', 'vvn', 'ad', 'y', 'nz', 'vn', 'Bg', 'nx', 'h', 'm', 'i', 'f', 'nr', 'an', 'z', 'Rg', 'nt', 'Ag', 'k', 'vd', 'o', 'a', 'n', 'q', 'Dg', 'u', 'e', 'w', 'na', 'b', 'Yg', 'j', 't', 'Vg', 'ns', 's', 'Tg', 'v'])
dict_keys(['Mg', 'r', 'Ng', 'p', 'd', 'l', 'c', 'vvn', 'ad', 'y', 'nz', 'vn', 'Bg', 'nx', 'h', 'm', 'i', 'f', 'nr', 'an', 'z', 'Rg', 'nt', 'Ag', 'k', 'vd', 'o', 'a', 'n', 'q', 'Dg', 'u', 'e', 'w', 'na', 'b', 'Yg', 'j', 't', 'Vg', 'ns', 's', 'Tg', 'v'])
z=>i: -5.153291594497779
z=>a: -3.4356400974234456
z=>Ng: -6.406054562993147
z=>p: -4.534252386091556
z=>d: -4.208829985656927
z=>y: -7.099201743553092
z=>c: -4.208829985656927
z=>u: -1.2214659617734531
z=>l: -6.000589454884983
z=>w: -1.661122434629897
z=>vn: -5.307442274325037
z=>n: -1.4970829226733913
z=>b: -6.406054562993147
z=>m: -3.4356400974234456
z=>t: -7.099201743553092
z=>Vg: -6.000589454884983
z=>ns: -7.099201743553092
z=>s: -5.712907382

In [11]:
# Sanity check: tags that appear as a transition target (set1) versus tags
# that appear as a transition source (set2).
set2 = set(prob_tag_by_tag)
set1 = set()
for t1, d in prob_tag_by_tag.items():
    print(t1, d.keys())
    set1.update(d.keys())

# Only the sizes are compared here, not the members.
assert len(set1) == len(set2)

Mg dict_keys(['a', 'n', 'v'])
r dict_keys(['Ng', 'vd', 'o', 'r', 'a', 'ad', 'p', 'd', 'y', 'q', 'c', 'Dg', 'u', 'l', 'nx', 'nz', 'w', 'vn', 'n', 'b', 'h', 'm', 'j', 't', 'Vg', 'ns', 's', 'i', 'f', 'nr', 'v', 'an', 'z', 'nt', 'Ag', 'Tg'])
Ng dict_keys(['u', 'Ng', 'vd', 'r', 'a', 'f', 'p', 'd', 'l', 'q', 'c', 'Dg', 'ad', 'y', 'nz', 'w', 'vn', 'k', 'n', 'b', 'm', 'j', 't', 'Vg', 'ns', 's', 'i', 'nr', 'v', 'z', 'nt', 'Ag', 'Tg'])
p dict_keys(['u', 'Ng', 'Mg', 'vd', 'o', 'r', 'a', 'ad', 'p', 'd', 'q', 'c', 'f', 'l', 'nx', 'nz', 'w', 'vn', 'n', 'b', 'h', 'm', 'j', 't', 'Vg', 'ns', 's', 'i', 'nr', 'v', 'an', 'z', 'Rg', 'nt', 'Ag', 'Tg'])
d dict_keys(['Ng', 'Mg', 'vd', 'o', 'i', 'a', 'ad', 'p', 'd', 'y', 'q', 'c', 'Dg', 'u', 'l', 'nx', 'nz', 'w', 'vn', 'k', 'n', 'b', 'm', 'j', 't', 'Bg', 'Vg', 'ns', 's', 'r', 'f', 'nr', 'v', 'an', 'z', 'nt', 'Ag', 'Tg'])
l dict_keys(['ad', 'vd', 'r', 'a', 'Ng', 'p', 'd', 'y', 'q', 'c', 'u', 'l', 'nz', 'w', 'vn', 'n', 'b', 'm', 'j', 't', 'Vg', 'ns', 's', 'i', '

In [12]:
# POS Tagging v1.0: ignore sentences containing unknown words 

def tagging(words, all_tags):
    """Viterbi-decode the most likely tag sequence for *words*.

    Reads the module-level log-probability tables ``prob_word_by_tag``
    (emission, tag -> word) and ``prob_tag_by_tag`` (transition,
    prev_tag -> tag). Unknown words get no special treatment in this version:
    their emission is -inf under every tag.

    Args:
        words: sequence of word strings.
        all_tags: iterable of candidate tags; each must be a key of both
            tables.

    Returns:
        ``(max_score, tags)``: the best path's log-score and its tag list,
        one tag per word. Empty input yields ``(0.0, [])``.
    """
    neg_inf = -float('inf')
    all_tags = list(all_tags)
    if not words:
        return 0.0, []

    # Read-only lookups: dict.get() never triggers the defaultdict factory, so
    # decoding does not insert -inf placeholders into the model tables.
    def emit(tag, word):
        return prob_word_by_tag[tag].get(word, neg_inf)

    def trans(prev, tag):
        return prob_tag_by_tag[prev].get(tag, neg_inf)

    # scores[t]: best log-score of any path ending in t; paths[t]: that path.
    scores = {t: emit(t, words[0]) for t in all_tags}
    paths = {t: [t] for t in all_tags}

    for w in words[1:]:
        new_scores, new_paths = {}, {}
        for t in all_tags:
            e = emit(t, w)
            # Candidates are (score, prev_tag) tuples, so equal scores
            # (notably all -inf) are resolved by the larger tag string.
            score, pt = max((s + e + trans(pt, t), pt) for pt, s in scores.items())
            new_scores[t] = score
            new_paths[t] = paths[pt] + [t]
        scores, paths = new_scores, new_paths

    max_score, final_tag = max((s, t) for t, s in scores.items())
    return max_score, paths[final_tag]
    

# Vocabulary = words with a finite emission value under at least one tag.
# Membership is judged by value, not by key: looking up an unseen word on the
# defaultdict tables stores a -inf placeholder under that key, and such
# placeholders must not make the word count as known.
neg_inf = -float('inf')
all_words = set()
for t, words_cnt in prob_word_by_tag.items():
    all_words.update(w for w, p in words_cnt.items() if p != neg_inf)
print('all_words: {0}'.format(len(all_words)))

num_matched_total = 0
num_total = 0

cnt = 10  # how many gold/predicted tag lists are still to be echoed
for line in test_set:
    tokens = line.split()[1:] # ignore 0th timestamp
    if not tokens:
        continue
    words, tags = zip(*[tk.split('/') for tk in tokens])
    # A tag such as 'n]nt' closes a bracketed compound: keep the part before
    # ']' and ignore the compound's own tag.
    tags = [t.split(']')[0] for t in tags]
    assert len(words) == len(tags)

    # ignore sentences containing unknown words
    has_unknown_word = any(w not in all_words for w in words)
    if has_unknown_word:
        continue

    max_score, tags_predict = tagging(words, prob_tag_by_tag.keys())
    if cnt > 0:
        print(tags)
        cnt -= 1
        print(tags_predict)
    assert len(tags) == len(tags_predict)

    # Token-level accuracy: positions where gold and predicted tags agree.
    num_matched_total += sum(1 for t1, t2 in zip(tags, tags_predict) if t1 == t2)
    num_total += len(tags)

print('{0}/{1}={2}'.format(num_matched_total, num_total, float(num_matched_total)/num_total))

all_words: 53635
['w', 'nx', 'w', 'nx', 'w']
['w', 'nx', 'w', 'nx', 'w']
['v', 'n', 'u', 'v', 'w', 'n', 'j', 'j', 'b', 'n', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'n', 'j', 'b', 'n', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w']
['v', 'n', 'u', 'v', 'w', 'n', 'j', 'j', 'b', 'n', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'n', 'j', 'b', 'n', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w', 'nr', 'nr', 'w']
['nt', 'n', 'v', 't', 'n']
['nt', 'n', 'v', 't', 'n']
['nr', 'nr', 'p', 'nt', 'n', 'vn', 'n', 'v', 'n', 'nr', 'nr', 'u', 'v']
['nr', 'nr', 'p', 'nt', 'n', 'vn', 'n', 'vn', 'n', 'nr', 'nr', 'u', 'v']
['nt', 'ns', 't', 't', 'n', 'w', 'n', 'n', 'vn', 'n', 'n', 'nr', 'nr', 'w', 'nt', 'n', 'nr', 'nr', 'w', 'p', 'ns', 'n', 'u', 'n', 'n', 't', 'n', 'd', 'l', 'w', 'nt', 'n', 't', 't', 'p', 'ns', 'nt', 'a', 'n', 'v', 't', 'n', 'w']
['nt', 'ns', 't', 't', 'n', 'w', 'n', 'n', 'vn', 'n', 'n', 'nr', 'nr', 'w', 'nt', 'n',

In [18]:
# POS Tagging v1.0: add sentences containing unknown words 

def tagging(words, all_tags):
    """Viterbi-decode the most likely tag sequence for *words*.

    Reads the module-level log-probability tables ``prob_word_by_tag``
    (emission, tag -> word) and ``prob_tag_by_tag`` (transition,
    prev_tag -> tag). Unknown words get no special treatment in this version:
    their emission is -inf under every tag, so every path through them scores
    -inf.

    Args:
        words: sequence of word strings.
        all_tags: iterable of candidate tags; each must be a key of both
            tables.

    Returns:
        ``(max_score, tags)``: the best path's log-score and its tag list,
        one tag per word. Empty input yields ``(0.0, [])``.
    """
    neg_inf = -float('inf')
    all_tags = list(all_tags)
    if not words:
        return 0.0, []

    # Read-only lookups: dict.get() never triggers the defaultdict factory, so
    # decoding does not insert -inf placeholders into the model tables.
    def emit(tag, word):
        return prob_word_by_tag[tag].get(word, neg_inf)

    def trans(prev, tag):
        return prob_tag_by_tag[prev].get(tag, neg_inf)

    # scores[t]: best log-score of any path ending in t; paths[t]: that path.
    scores = {t: emit(t, words[0]) for t in all_tags}
    paths = {t: [t] for t in all_tags}

    for w in words[1:]:
        new_scores, new_paths = {}, {}
        for t in all_tags:
            e = emit(t, w)
            # Candidates are (score, prev_tag) tuples, so equal scores
            # (notably all -inf) are resolved by the larger tag string.
            score, pt = max((s + e + trans(pt, t), pt) for pt, s in scores.items())
            new_scores[t] = score
            new_paths[t] = paths[pt] + [t]
        scores, paths = new_scores, new_paths

    max_score, final_tag = max((s, t) for t, s in scores.items())
    return max_score, paths[final_tag]
    

num_matched_total = 0
num_total = 0

cnt = 10  # how many gold/predicted tag lists are still to be echoed
for line in test_set:
    tokens = line.split()[1:] # drop the leading timestamp token
    if not tokens:
        continue
    pairs = [tk.split('/') for tk in tokens]
    words, tags = zip(*pairs)
    # A tag such as 'n]nt' closes a bracketed compound: keep the part before
    # ']' and ignore the compound's own tag.
    tags = [t.split(']')[0] for t in tags]
    assert len(words) == len(tags)

    max_score, tags_predict = tagging(words, prob_tag_by_tag.keys())
    if cnt > 0:
        print(tags)
        cnt -= 1
        print(tags_predict)
    assert len(tags) == len(tags_predict)

    # Token-level accuracy: positions where gold and predicted tags agree.
    num_matched_total += sum(1 for t1, t2 in zip(tags, tags_predict) if t1 == t2)
    num_total += len(tags)

print('{0}/{1}={2}'.format(num_matched_total, num_total, float(num_matched_total)/num_total))

['w', 'nx', 'w', 'nx', 'w']
['w', 'nx', 'w', 'nx', 'w']
['r', 'v', 'ns', 'n', 'i', 'u', 'n', 'w', 'v', 'c', 'v', 'ns', 'n', 'v', 'n', 'a', 'u', 'n', 'w', 'v', 'c', 'v', 'ns', 'n', 'u', 'n', 'n', 'w', 'ad', 'v', 'ns', 'n', 'v', 'u', 'n', 'w', 'c', 'c', 'n', 'w', 'j', 'n', 'w', 'c', 'n', 'w', 'n', 'w', 'n', 'n', 'w', 'n', 'w', 'c', 'c', 'v', 'p', 'ns', 'u', 'r', 'n', 'n', 'c', 'f', 'n', 'n', 'w', 'c', 'v', 'p', 'ns', 'w', 'ns', 'c', 's', 'u', 'ns', 'n', 'w', 'c', 'c', 'v', 'v', 'n', 'n', 'c', 'n', 'vn', 'u', 'n', 'w', 'c', 'v', 'v', 'w', 'v', 'c', 'n', 'n', 'd', 'v', 'a', 'w', 'vn', 'n', 'u', 'n', 'w', 'r', 'd', 'd', 'p', 'r', 'ad', 'v', 'w', 'v', 'v', 'w', 'v', 'v', 'w', 'v', 'v', 'w', 't', 'p', 'w', 'j', 'w', 'u', 'n', 'v', 'n', 'n', 'vn', 'w', 'p', 'ns', 'n', 'u', 'a', 'n', 'c', 'a', 'n', 'v', 'r', 'vn', 'w', 'v', 'p', 'ns', 'p', 'w', 'j', 'w', 'u', 'n', 'f', 'v', 'an', 'an', 'w', 'd', 'a', 'u', 'ns', 'n', 'd', 'd', 'v', 'p', 'w', 'j', 'w', 'u', 'vn', 'c', 'vn', 'w']
['z', 'z', 'z', '

In [21]:
# POS Tagging v1.1: add sentences containing unknown words 

def is_known_word(w, all_tags):
    """Return True if *w* has a finite emission log-probability under any tag.

    Looks the word up in the module-level ``prob_word_by_tag`` table for each
    tag in *all_tags*. The lookup uses ``dict.get`` so that checking a word
    does not store a -inf placeholder for it under every tag, and ``any``
    stops at the first tag that knows the word.
    """
    neg_inf = -float('inf')
    return any(prob_word_by_tag[t].get(w, neg_inf) != neg_inf for t in all_tags)


def tagging(words, all_tags):
    """Viterbi-decode *words*, scoring unknown words by transitions only.

    Reads the module-level log-probability tables ``prob_word_by_tag``
    (emission, tag -> word) and ``prob_tag_by_tag`` (transition,
    prev_tag -> tag). A word is "unknown" when its emission is -inf under
    every candidate tag; for such a word the emission term is left out, so
    each tag is scored by the transition from the previous tag alone (and an
    unknown first word starts every tag at 0.0).

    Args:
        words: sequence of word strings.
        all_tags: iterable of candidate tags; each must be a key of both
            tables.

    Returns:
        ``(max_score, tags)``: the best path's log-score and its tag list,
        one tag per word. Empty input yields ``(0.0, [])``.
    """
    neg_inf = -float('inf')
    all_tags = list(all_tags)
    if not words:
        return 0.0, []

    # All table reads go through dict.get(): it never triggers the
    # defaultdict factory, so decoding leaves the model tables untouched.
    def trans(prev, tag):
        return prob_tag_by_tag[prev].get(tag, neg_inf)

    # Per-tag emission scores for a word, or None when the word is unknown
    # (no tag gives it a finite emission).
    def emissions_for(word):
        row = {t: prob_word_by_tag[t].get(word, neg_inf) for t in all_tags}
        if all(e == neg_inf for e in row.values()):
            return None
        return row

    # scores[t]: best log-score of any path ending in t; paths[t]: that path.
    first = emissions_for(words[0])
    scores = {t: (0.0 if first is None else first[t]) for t in all_tags}
    paths = {t: [t] for t in all_tags}

    for w in words[1:]:
        emissions = emissions_for(w)
        new_scores, new_paths = {}, {}
        for t in all_tags:
            # Candidates are (score, prev_tag) tuples, so equal scores are
            # resolved by the larger tag string.
            if emissions is not None:
                e = emissions[t]
                score, pt = max((s + e + trans(pt, t), pt) for pt, s in scores.items())
            else:
                # unknown word: transition from the previous tag only
                score, pt = max((s + trans(pt, t), pt) for pt, s in scores.items())
            new_scores[t] = score
            new_paths[t] = paths[pt] + [t]
        scores, paths = new_scores, new_paths

    max_score, final_tag = max((s, t) for t, s in scores.items())
    return max_score, paths[final_tag]
    

num_matched_total = 0
num_total = 0

cnt = 10  # how many gold/predicted tag lists are still to be echoed
for line in test_set:
    tokens = line.split()[1:] # drop the leading timestamp token
    if not tokens:
        continue
    pairs = [tk.split('/') for tk in tokens]
    words, tags = zip(*pairs)
    # A tag such as 'n]nt' closes a bracketed compound: keep the part before
    # ']' and ignore the compound's own tag.
    tags = [t.split(']')[0] for t in tags]
    assert len(words) == len(tags)

    max_score, tags_predict = tagging(words, prob_tag_by_tag.keys())
    if cnt > 0:
        print(tags)
        cnt -= 1
        print(tags_predict)
    assert len(tags) == len(tags_predict)

    # Token-level accuracy: positions where gold and predicted tags agree.
    num_matched_total += sum(1 for t1, t2 in zip(tags, tags_predict) if t1 == t2)
    num_total += len(tags)

print('{0}/{1}={2}'.format(num_matched_total, num_total, float(num_matched_total)/num_total))

['w', 'nx', 'w', 'nx', 'w']
['w', 'nx', 'w', 'nx', 'w']
['r', 'v', 'ns', 'n', 'i', 'u', 'n', 'w', 'v', 'c', 'v', 'ns', 'n', 'v', 'n', 'a', 'u', 'n', 'w', 'v', 'c', 'v', 'ns', 'n', 'u', 'n', 'n', 'w', 'ad', 'v', 'ns', 'n', 'v', 'u', 'n', 'w', 'c', 'c', 'n', 'w', 'j', 'n', 'w', 'c', 'n', 'w', 'n', 'w', 'n', 'n', 'w', 'n', 'w', 'c', 'c', 'v', 'p', 'ns', 'u', 'r', 'n', 'n', 'c', 'f', 'n', 'n', 'w', 'c', 'v', 'p', 'ns', 'w', 'ns', 'c', 's', 'u', 'ns', 'n', 'w', 'c', 'c', 'v', 'v', 'n', 'n', 'c', 'n', 'vn', 'u', 'n', 'w', 'c', 'v', 'v', 'w', 'v', 'c', 'n', 'n', 'd', 'v', 'a', 'w', 'vn', 'n', 'u', 'n', 'w', 'r', 'd', 'd', 'p', 'r', 'ad', 'v', 'w', 'v', 'v', 'w', 'v', 'v', 'w', 'v', 'v', 'w', 't', 'p', 'w', 'j', 'w', 'u', 'n', 'v', 'n', 'n', 'vn', 'w', 'p', 'ns', 'n', 'u', 'a', 'n', 'c', 'a', 'n', 'v', 'r', 'vn', 'w', 'v', 'p', 'ns', 'p', 'w', 'j', 'w', 'u', 'n', 'f', 'v', 'an', 'an', 'w', 'd', 'a', 'u', 'ns', 'n', 'd', 'd', 'v', 'p', 'w', 'j', 'w', 'u', 'vn', 'c', 'vn', 'w']
['r', 'v', 'ns', 

In [24]:
# POS Tagging v1.2: p(t|t_prev) in start or end place in a sentence

def is_known_word(w, all_tags):
    """Return True if *w* has a finite emission log-probability under any tag.

    Looks the word up in the module-level ``prob_word_by_tag`` table for each
    tag in *all_tags*. The lookup uses ``dict.get`` so that checking a word
    does not store a -inf placeholder for it under every tag, and ``any``
    stops at the first tag that knows the word.
    """
    neg_inf = -float('inf')
    return any(prob_word_by_tag[t].get(w, neg_inf) != neg_inf for t in all_tags)


def tagging(words, all_tags):
    """Viterbi-decode *words* with sentence start/end transitions.

    Reads the module-level log-probability tables ``prob_word_by_tag``
    (emission, tag -> word) and ``prob_tag_by_tag`` (transition,
    prev_tag -> tag). The first word is scored with the transition out of the
    start symbol ``'^'`` and the final choice adds the transition into the end
    symbol ``'$'``. A word is "unknown" when its emission is -inf under every
    candidate tag; for such a word the emission term is left out and only the
    transition is used.

    Args:
        words: sequence of word strings.
        all_tags: iterable of candidate tags; each must be a key of both
            tables, and ``prob_tag_by_tag`` must have a ``'^'`` row.

    Returns:
        ``(max_score, tags)``: the best path's log-score and its tag list,
        one tag per word. Empty input yields the ``'^' -> '$'`` transition
        score and an empty list.
    """
    neg_inf = -float('inf')
    all_tags = list(all_tags)

    # All table reads go through dict.get(): it never triggers the
    # defaultdict factory, so decoding leaves the model tables untouched.
    def trans(prev, tag):
        return prob_tag_by_tag[prev].get(tag, neg_inf)

    if not words:
        return trans('^', '$'), []

    # Per-tag emission scores for a word, or None when the word is unknown
    # (no tag gives it a finite emission).
    def emissions_for(word):
        row = {t: prob_word_by_tag[t].get(word, neg_inf) for t in all_tags}
        if all(e == neg_inf for e in row.values()):
            return None
        return row

    # scores[t]: best log-score of any path ending in t; paths[t]: that path.
    first = emissions_for(words[0])
    scores = {}
    for t in all_tags:
        # adding start symbol
        if first is not None:
            scores[t] = trans('^', t) + first[t]
        else:
            scores[t] = trans('^', t)
    paths = {t: [t] for t in all_tags}

    for w in words[1:]:
        emissions = emissions_for(w)
        new_scores, new_paths = {}, {}
        for t in all_tags:
            # Candidates are (score, prev_tag) tuples, so equal scores are
            # resolved by the larger tag string.
            if emissions is not None:
                e = emissions[t]
                score, pt = max((s + e + trans(pt, t), pt) for pt, s in scores.items())
            else:
                # unknown word: transition from the previous tag only
                score, pt = max((s + trans(pt, t), pt) for pt, s in scores.items())
            new_scores[t] = score
            new_paths[t] = paths[pt] + [t]
        scores, paths = new_scores, new_paths

    # adding end symbol
    max_score, final_tag = max((s + trans(t, '$'), t) for t, s in scores.items())
    return max_score, paths[final_tag]
    

############## TRAIN ##############
f = '199801/199801.txt'
with open(f, 'rb') as fin:
    all = fin.read().decode('gbk').split('\n')

# Positional split: the leading `fold` share of the lines is used for
# training, the remainder is held out for evaluation.
fold = 0.9
train_len = int(len(all) * fold)
train_set = all[:train_len]
test_set = all[train_len:]

from collections import defaultdict # matrix as nested dict
prob_word_by_tag = dict() # emit matrix: tag -> {word: count}
prob_tag_by_tag = dict()  # tran matrix: prev_tag -> {tag: count}

# The start symbol emits nothing; this empty row only exists so that '^' can
# be looked up like any other tag.
prob_word_by_tag['^'] = defaultdict(lambda: -float('inf'))

for line in train_set:
    tokens = line.split()[1:] # ignore 0th timestamp
    if not tokens:
        # A line without tokens is not a sentence. Counting it would record a
        # '^' -> '$' transition and take probability mass away from every
        # p(t|^); the evaluation loop skips such lines as well.
        continue

    prev_tag = '^' # start symbol
    for token in tokens:
        # NOTE(review): tokens are only split on '/', so a word that opens a
        # bracketed compound keeps its leading '[' as part of the word.
        word, tag = token.split('/')
        # A tag such as 'n]nt' closes a bracketed compound: keep the part
        # before ']' and ignore the compound's own tag.
        tag = tag.split(']')[0]

        # Emission count for (tag, word).
        if not prob_word_by_tag.get(tag, None):
            prob_word_by_tag[tag] = defaultdict(lambda: -float('inf'))
        prob_word_by_tag[tag][word] = prob_word_by_tag[tag].get(word, 0) + 1

        # Transition count for (prev_tag, tag); prev_tag is '^' for the first
        # token of the line.
        if not prob_tag_by_tag.get(prev_tag, None):
            prob_tag_by_tag[prev_tag] = defaultdict(lambda: -float('inf'))
        prob_tag_by_tag[prev_tag][tag] = prob_tag_by_tag[prev_tag].get(tag, 0) + 1
        prev_tag = tag

    # end symbol: transition from the last tag of the line into '$'
    if not prob_tag_by_tag.get(prev_tag, None):
        prob_tag_by_tag[prev_tag] = defaultdict(lambda: -float('inf'))
    prob_tag_by_tag[prev_tag]['$'] = prob_tag_by_tag[prev_tag].get('$', 0) + 1


# normalize
import math
import sys

# Replace every raw count with log(count / row_total), in place. Working with
# logs keeps a long chain product of probabilities from collapsing to zero.
# An empty row (such as the '^' emission placeholder) is left as it is.
for words_cnt in prob_word_by_tag.values():
    ntotal = sum(words_cnt.values())
    for w in list(words_cnt):
        words_cnt[w] = math.log(float(words_cnt[w]) / ntotal)
for tags_cnt in prob_tag_by_tag.values():
    ntotal = sum(tags_cnt.values())
    for t in list(tags_cnt):
        tags_cnt[t] = math.log(float(tags_cnt[t]) / ntotal)
    
    
############## TEST ##############
num_matched_total = 0
num_total = 0

cnt = 10  # how many gold/predicted tag lists are still to be echoed
for line in test_set:
    tokens = line.split()[1:] # drop the leading timestamp token
    if not tokens:
        continue
    pairs = [tk.split('/') for tk in tokens]
    words, tags = zip(*pairs)
    # A tag such as 'n]nt' closes a bracketed compound: keep the part before
    # ']' and ignore the compound's own tag.
    tags = [t.split(']')[0] for t in tags]
    assert len(words) == len(tags)

    max_score, tags_predict = tagging(words, prob_tag_by_tag.keys())
    if cnt > 0:
        print(tags)
        cnt -= 1
        print(tags_predict)
    assert len(tags) == len(tags_predict)

    # Token-level accuracy: positions where gold and predicted tags agree.
    num_matched_total += sum(1 for t1, t2 in zip(tags, tags_predict) if t1 == t2)
    num_total += len(tags)

print('{0}/{1}={2}'.format(num_matched_total, num_total, float(num_matched_total)/num_total))

['w', 'nx', 'w', 'nx', 'w']
['w', 'nx', 'w', 'nx', 'w']
['r', 'v', 'ns', 'n', 'i', 'u', 'n', 'w', 'v', 'c', 'v', 'ns', 'n', 'v', 'n', 'a', 'u', 'n', 'w', 'v', 'c', 'v', 'ns', 'n', 'u', 'n', 'n', 'w', 'ad', 'v', 'ns', 'n', 'v', 'u', 'n', 'w', 'c', 'c', 'n', 'w', 'j', 'n', 'w', 'c', 'n', 'w', 'n', 'w', 'n', 'n', 'w', 'n', 'w', 'c', 'c', 'v', 'p', 'ns', 'u', 'r', 'n', 'n', 'c', 'f', 'n', 'n', 'w', 'c', 'v', 'p', 'ns', 'w', 'ns', 'c', 's', 'u', 'ns', 'n', 'w', 'c', 'c', 'v', 'v', 'n', 'n', 'c', 'n', 'vn', 'u', 'n', 'w', 'c', 'v', 'v', 'w', 'v', 'c', 'n', 'n', 'd', 'v', 'a', 'w', 'vn', 'n', 'u', 'n', 'w', 'r', 'd', 'd', 'p', 'r', 'ad', 'v', 'w', 'v', 'v', 'w', 'v', 'v', 'w', 'v', 'v', 'w', 't', 'p', 'w', 'j', 'w', 'u', 'n', 'v', 'n', 'n', 'vn', 'w', 'p', 'ns', 'n', 'u', 'a', 'n', 'c', 'a', 'n', 'v', 'r', 'vn', 'w', 'v', 'p', 'ns', 'p', 'w', 'j', 'w', 'u', 'n', 'f', 'v', 'an', 'an', 'w', 'd', 'a', 'u', 'ns', 'n', 'd', 'd', 'v', 'p', 'w', 'j', 'w', 'u', 'vn', 'c', 'vn', 'w']
['r', 'v', 'ns', 