https://www.cnblogs.com/xlturing/p/8467033.html

https://github.com/xlturing/machine-learning-journey/tree/master/seg_hmm

In [26]:
# -*- coding: utf-8 -*-
import sys
import re
import getopt
import json

# Sentinel score used as the default whenever a transition or an observed
# character has no entry in the model tables. Scores are summed, so this acts
# as "practically impossible" in log space.
MIN_FLOAT = -3.14e100

# File names for the three model tables. They are not referenced by the
# visible cells, which import the tables as Python modules instead.
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"

# For each tag, the tags that viterbi() tries as its immediate predecessor.
PrevStatus = dict(B='ES', M='MB', S='SE', E='BM')

# Words that cut() breaks back into single characters after segmentation.
Force_Split_Words = set()

from prob_start import P as start_P
from prob_trans import P as trans_P
from prob_emit import P as emit_P

### 初始状态概率分布 π（取值为对数概率）

In [27]:
# Pretty-print the initial-state scores as indented JSON, keeping the table's
# own key order and showing non-ASCII characters verbatim.
print(json.dumps(
    start_P,
    indent=4,
    separators=(',', ':'),
    sort_keys=False,
    ensure_ascii=False,
))

{
    "B":-0.26268660809250016,
    "E":-3.14e+100,
    "M":-3.14e+100,
    "S":-1.4652633398537678
}


### 状态转移矩阵 A（取值为对数概率）

In [28]:
# Pretty-print the state-transition scores as indented JSON, keeping the
# table's own key order and showing non-ASCII characters verbatim.
print(json.dumps(
    trans_P,
    indent=4,
    separators=(',', ':'),
    sort_keys=False,
    ensure_ascii=False,
))

{
    "B":{
        "E":-0.51082562376599,
        "M":-0.916290731874155
    },
    "E":{
        "B":-0.5897149736854513,
        "S":-0.8085250474669937
    },
    "M":{
        "E":-0.33344856811948514,
        "M":-1.2603623820268226
    },
    "S":{
        "B":-0.7211965654669841,
        "S":-0.6658631448798212
    }
}


### 发射概率矩阵 B（取值为对数概率）

In [55]:
# Preview the emission table: print only the first two entries and stop,
# because each per-state table is very large.
for cnt, (key, value) in enumerate(emit_P.items(), 1):
    if cnt > 2:
        break
    print("{}:{}".format(key, value))

B:{'一': -3.6544978750449433, '丁': -8.125041941842026, '七': -7.817392401429855, '万': -6.3096425804013165, '丈': -8.866689067453933, '三': -5.932085850549891, '上': -5.739552583325728, '下': -5.997089097239644, '不': -4.274262055936421, '与': -8.355569307500769, '丐': -9.985251083961709, '丑': -10.200388187382178, '专': -6.373950868459459, '且': -10.194028865473886, '世': -6.142051005803147, '丘': -8.88385596143092, '丙': -10.895131537474946, '业': -7.835434989100892, '丛': -9.507241006822232, '东': -6.004002008087405, '丝': -8.859361493580714, '丞': -10.149094892994627, '丟': -9.766619303111161, '丢': -9.766619303111161, '两': -5.616014902721218, '严': -6.980651489152666, '並': -7.383967650611162, '丧': -9.218662207363717, '个': -7.167361665594661, '丫': -9.55250867414066, '中': -4.596743315282086, '丰': -7.604157975048104, '串': -10.424728626930234, '临': -7.845071440995644, '丸': -11.936319207057291, '丹': -8.755441313832726, '为': -6.33937311455966, '主': -5.35605669160454, '丽': -9.720663223766415, '举': -7.0566759769

In [50]:
def viterbi(obs, states, start_p, trans_p, emit_p):
    """Find the highest-scoring state sequence for ``obs`` (Viterbi decoding).

    Scores are combined by addition, so the tables are expected to hold
    log-probabilities. A missing emission or transition entry is scored as
    ``MIN_FLOAT``. Only the predecessors listed in ``PrevStatus`` are tried
    for each state, and the final state is restricted to ``'E'`` or ``'S'``,
    so ``states`` must contain both.

    Args:
        obs: Sequence of observations (e.g. the characters of a text block).
        states: Iterable of state labels, e.g. ``'BMES'``.
        start_p: Mapping ``state -> initial score``.
        trans_p: Mapping ``previous state -> {state -> transition score}``.
        emit_p: Mapping ``state -> {observation -> emission score}``.

    Returns:
        A tuple ``(score, path)`` where ``path`` is a list holding one state
        per observation. For an empty ``obs`` the result is ``(0.0, [])``
        rather than an ``IndexError``.
    """
    # An empty sequence has exactly one (empty) labelling; its log-score is 0.
    if not obs:
        return (0.0, [])

    # V[t][y]: best score of any state sequence that ends in state y at step t.
    V = [{y: start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT) for y in states}]
    # back[t][y]: predecessor of y on that best sequence (unused for t == 0).
    back = [{}]

    for t in range(1, len(obs)):
        scores = {}
        best_prev = {}
        for y in states:
            em_p = emit_p[y].get(obs[t], MIN_FLOAT)
            # Ties on score are broken by the larger state label, because the
            # (score, state) tuples are compared lexicographically.
            scores[y], best_prev[y] = max(
                (V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0)
                for y0 in PrevStatus[y])
        V.append(scores)
        back.append(best_prev)

    # A segmentation must finish on a word end ('E') or a single-char word ('S').
    (prob, state) = max((V[-1][y], y) for y in 'ES')

    # Follow the back-pointers from the last step to rebuild the path once,
    # instead of copying a full path for every state at every step.
    route = [state]
    for t in range(len(obs) - 1, 0, -1):
        state = back[t][state]
        route.append(state)
    route.reverse()

    return (prob, route)


def __cut(sentence):
    """Yield the words of ``sentence`` according to its best BMES tag path.

    The tag path comes from ``viterbi`` run with the module-level model
    tables. The path score and the tag list are printed before the first
    word is produced. A word is emitted at every 'E' tag (spanning back to
    the most recent 'B') and at every 'S' tag (a single character); any
    characters left after the last emitted word are yielded as one final
    piece.
    """
    score, tags = viterbi(sentence, 'BMES', start_P, trans_P, emit_P)
    print (score)
    print (tags)

    word_start = 0   # index of the most recent 'B'
    consumed = 0     # index just past the last emitted word
    for idx in range(len(sentence)):
        tag = tags[idx]
        if tag == 'B':
            word_start = idx
            continue
        if tag == 'E':
            yield sentence[word_start:idx + 1]
        elif tag == 'S':
            yield sentence[idx]
        else:
            # 'M' (inside a word): nothing to emit yet.
            continue
        consumed = idx + 1

    if consumed < len(sentence):
        yield sentence[consumed:]

# Runs of Han characters in U+4E00..U+9FD5; cut() sends these spans to the
# HMM segmenter. The capture group makes split() keep the matched runs.
re_han = re.compile("([\u4E00-\u9FD5]+)")
# Alphanumeric tokens with an optional decimal part and optional percent sign
# (e.g. "CBA", "3.5%"). Written as a raw string so that `\.` and `\d` reach the
# regex engine without triggering Python's invalid-escape-sequence warning.
re_skip = re.compile(r"([a-zA-Z0-9]+(?:\.\d+)?%?)")

def cut(sentence):
    """Segment ``sentence`` into a list of tokens.

    The text is split into alternating Han and non-Han chunks with
    ``re_han``. Han chunks are segmented by ``__cut``; a resulting word that
    is listed in ``Force_Split_Words`` is added character by character.
    Non-Han chunks are split with ``re_skip`` and every non-empty piece is
    kept as-is. The token list built so far is printed after each word or
    piece is processed.

    Returns:
        The list of tokens in their original order.
    """
    tokens = []
    for chunk in re_han.split(sentence):
        if not re_han.match(chunk):
            # Non-Han text: keep alphanumeric tokens and the text between them.
            for piece in re_skip.split(chunk):
                if piece:
                    tokens.append(piece)
                print (tokens)
            continue

        # Han text: run the HMM segmenter on this chunk.
        for word in __cut(chunk):
            if word in Force_Split_Words:
                tokens.extend(word)   # one entry per character
            else:
                tokens.append(word)
            print (tokens)
    return tokens

In [51]:
# Demo on an all-Han sentence. Besides returning the token list, cut() prints
# the Viterbi score, the tag path and the token list after each step.
cut("小明硕士毕业于中国科学院计算所")

[]
-101.63238958952303
['B', 'E', 'B', 'E', 'B', 'M', 'E', 'B', 'E', 'B', 'M', 'E', 'B', 'E', 'S']
['小明']
['小明', '硕士']
['小明', '硕士', '毕业于']
['小明', '硕士', '毕业于', '中国']
['小明', '硕士', '毕业于', '中国', '科学院']
['小明', '硕士', '毕业于', '中国', '科学院', '计算']
['小明', '硕士', '毕业于', '中国', '科学院', '计算', '所']
['小明', '硕士', '毕业于', '中国', '科学院', '计算', '所']


['小明', '硕士', '毕业于', '中国', '科学院', '计算', '所']

In [52]:
# Demo on a longer all-Han sentence.
# NOTE(review): "人居" is possibly a typo for "人均" — confirm before changing,
# since the recorded output below was produced from this exact string.
cut("上海计划到本世纪末实现人居国内生产总值五千美元")

[]
-156.64826994522744
['B', 'M', 'M', 'E', 'S', 'B', 'M', 'E', 'S', 'B', 'E', 'B', 'M', 'E', 'S', 'B', 'E', 'B', 'E', 'B', 'M', 'M', 'E']
['上海计划']
['上海计划', '到']
['上海计划', '到', '本世纪']
['上海计划', '到', '本世纪', '末']
['上海计划', '到', '本世纪', '末', '实现']
['上海计划', '到', '本世纪', '末', '实现', '人居国']
['上海计划', '到', '本世纪', '末', '实现', '人居国', '内']
['上海计划', '到', '本世纪', '末', '实现', '人居国', '内', '生产']
['上海计划', '到', '本世纪', '末', '实现', '人居国', '内', '生产', '总值']
['上海计划', '到', '本世纪', '末', '实现', '人居国', '内', '生产', '总值', '五千美元']
['上海计划', '到', '本世纪', '末', '实现', '人居国', '内', '生产', '总值', '五千美元']


['上海计划', '到', '本世纪', '末', '实现', '人居国', '内', '生产', '总值', '五千美元']

In [54]:
# Demo on mixed Han/Latin text: each Han run is segmented by its own Viterbi
# pass, while the alphanumeric run "CBA" is kept whole via re_skip.
cut('姚明在CBA打篮球')

[]
-20.06035827781581
['B', 'E', 'S']
['姚明']
['姚明', '在']
['姚明', '在']
['姚明', '在', 'CBA']
['姚明', '在', 'CBA']
-25.52699866443856
['B', 'M', 'E']
['姚明', '在', 'CBA', '打篮球']
['姚明', '在', 'CBA', '打篮球']


['姚明', '在', 'CBA', '打篮球']