# In [2]:
"""
Created on the 13th June 2018

@author : woshihaozhaojun@sina.com
"""
import numpy as np

def viterbi_algo(text, transition, emission, ind2tag, word2ind):
    '''
    Viterbi decoding for a part-of-speech tagger.

    The first token of ``text`` is treated as a start marker: it is assigned
    tag 0 with probability 1 and is never looked up in ``word2ind``. The most
    likely tag for every token is printed from the last token back to the
    first.

    Args:
        text(iterables)      :- the sentence to tag, as an indexable sequence
                                of words of length l
        transition(np.array) :- tag transition matrix,
                                element [i,j] is the probability of moving
                                from tag i to tag j,
                                shape [n, n], n being the number of tags
        emission(np.array)   :- word emission matrix,
                                element [i,k] is the probability that tag i
                                generates word k,
                                shape [n, v], v being the vocabulary size
        ind2tag(iterables)   :- the i-th element is the name of tag i
        word2ind(dict)       :- maps a word (key) to its index k (value)
    Returns:
        paths(np.array)      :- element [i,w] is the tag index of the previous
                                word on the best path where word w has tag i
                                (stored as floats), shape [n, l]
        viterbi(np.array)    :- element [i,w] is the probability of the best
                                path ending with word w having tag i,
                                shape [n, l]
    Raises:
        KeyError: if a word other than the first one is missing from word2ind.

    '''
    cols = len(text)
    rows = transition.shape[0]

    paths = np.zeros((rows, cols))
    viterbi = np.zeros((rows, cols))
    if cols == 0:
        # Nothing to decode: return the empty tables instead of failing on
        # the initialisation of column 0.
        return paths, viterbi

    viterbi[0, 0] = 1  # the start marker has tag 0 with certainty
    for j in range(1, cols):
        word = word2ind[text[j]]  # same word for every candidate tag
        for i in range(rows):
            # prob[p] = P(best path ending in tag p at j-1) * P(p -> i) * P(word | i);
            # a vector of length `rows`, one entry per previous tag.
            prob = viterbi[:, j - 1] * transition[:, i] * emission[i, word]
            # argsort(...)[-1] is kept (rather than argmax) so that ties are
            # resolved exactly as before and the returned paths are unchanged.
            best = np.argsort(prob)[-1]
            paths[i, j] = best
            viterbi[i, j] = prob[best]

    # Index the last column explicitly instead of reusing the loop variable,
    # which is unbound when the sentence has a single token.
    final = cols - 1
    last = int(np.argsort(viterbi[:, final])[-1])  # most probable tag of the last word

    report = "词为{}, 词性为{}, 概率为{}%"
    print(report.format(text[final], ind2tag[last], viterbi[last, final] * 100))
    # Follow the back-pointers from the last word to the first.
    for pos in range(final, 0, -1):
        last = int(paths[last, pos])  # tag of the previous word
        print(report.format(text[pos - 1], ind2tag[last], viterbi[last, pos - 1] * 100))

    return paths, viterbi

def demo():
    """Run the Viterbi tagger on a small hand-built example.

    Builds a four-tag, four-word model and decodes one six-token sentence;
    the decoded tags are printed by ``viterbi_algo``.
    """
    # Tag inventory: sentence start, subject, verb, object.
    ind2tag = ['b', 'sub', 'verb', 'obj']

    # Vocabulary; a word's position is its column in the emission matrix.
    vocabulary = ['b', 'I', 'love', 'you']
    word2ind = {word: index for index, word in enumerate(vocabulary)}

    # Row i holds the probabilities of moving from tag i to each tag.
    transition = np.array([
        [0, 0.4, 0.6,  0],     # from b to b, sub, verb, obj
        [0, 0,   0.85, 0.15],  # from sub
        [0, 0.3, 0,    0.7],   # from verb
        [0, 0.3, 0.3,  0.4],   # from obj
    ])

    # Row i holds the probabilities of tag i emitting each vocabulary word.
    emission = np.array([
        [1, 0,    0,    0],    # b
        [0, 0.4,  0,    0.6],  # sub
        [0, 0.45, 0.55, 0],    # verb
        [0, 0,    0,    1],    # obj
    ])

    # Sentence to tag; the leading 'b' is the start marker.
    text = ['b', 'I', 'love', 'you', 'love', 'I']

    viterbi_algo(text, transition, emission, ind2tag, word2ind)

# Run the worked example only when executed as a script, not on import.
if __name__ == "__main__":
    demo()

# Output:
# 词为I, 词性为sub, 概率为0.1036728%
# 词为love, 词性为verb, 概率为0.86394%
# 词为you, 词性为obj, 概率为5.236000000000001%
# 词为love, 词性为verb, 概率为7.48%
# 词为I, 词性为sub, 概率为16.000000000000004%
# 词为b, 词性为b, 概率为100.0%
