In [1]:
import numpy as np

In [2]:
states = ('B', 'M', 'E', 'S')

In [3]:
def incr_vector_counter(counter, state):
    if state not in counter:
        counter[state] = 1
    else:
        counter[state] += 1

def incr_matrix_counter(counter, start, end):
    if start not in counter:
        counter[start] = {end: 1}
    elif end not in counter[start]:
        counter[start][end] = 1
    else:
        counter[start][end] += 1

In [4]:
def feed_train_line(line, pi_counter, trans_counter, emit_counter):
    words = [i.strip() for i in line.split() if i.strip()]

    if len(words) == 0:
        return

    line_states = []
    for word in words:
        cnt = len(word)
        if cnt == 1:
            line_states.append('S')
        else:
            line_states.append('B')
            line_states.extend(['M'] * (cnt - 2))
            line_states.append('E')
#     print(line.strip())
#     print(line_states)

    # for start prob
    incr_vector_counter(pi_counter, line_states[0])

    # for transition prob
    for start, end in zip(line_states[:-1], line_states[1:]):
        incr_matrix_counter(trans_counter, start, end)

    # for emission prob
    for s, o in zip(line_states, ''.join(words)):
        incr_matrix_counter(emit_counter, s, o)

In [5]:
# processed = 0

start_counter = {}
transition_counter = {}
emission_counter = {}
vocabulary = set()

# processed = 0
with open('../data/pku_training.utf8') as f:
    for line in f:
        feed_train_line(line, start_counter, transition_counter, emission_counter)
        vocabulary.update(line)
#         processed += 1
#         if processed > 10:
#             break

print('start_counter: ', start_counter)
print('transition_counter: ', transition_counter)
print('vocabulary cap: ', len(vocabulary))
# print('emission_counter: ', emission_counter['S'])

start_counter:  {'B': 12091, 'S': 6963}
transition_counter:  {'B': {'E': 499329, 'M': 85897}, 'E': {'B': 282224, 'S': 299226}, 'S': {'S': 218532, 'B': 290911}, 'M': {'M': 45378, 'E': 85897}}
vocabulary cap:  4700


In [6]:
def generate_index_map(lables):
    id2label = {}
    label2id = {}
    for idx, label in enumerate(lables):
        id2label[idx] = label
        label2id[label] = idx
    return id2label, label2id
 
states_id2label, states_label2id = generate_index_map(states)
vocabulary_id2label, vocabulary_label2id = generate_index_map(vocabulary)
print(states_id2label, states_label2id)
# print(vocabulary_id2label, vocabulary_label2id)

{0: 'B', 1: 'M', 2: 'E', 3: 'S'} {'B': 0, 'M': 1, 'E': 2, 'S': 3}


In [7]:
def convert_1d_counter_to_prob(counter_map):
    total = sum(counter_map.values())
    return {k: 1.0 * v / total for k, v in counter_map.items()}


def convert_2d_counter_to_prob(counter_map):
    return {k: convert_1d_counter_to_prob(v) for k, v in counter_map.items()}


start_probability = convert_1d_counter_to_prob(start_counter)
transition_probability = convert_2d_counter_to_prob(transition_counter)
emission_probability = convert_2d_counter_to_prob(emission_counter)

print('start_probability: ', start_probability)
print('transition_probability: ', transition_probability)

start_probability:  {'B': 0.6345649207515482, 'S': 0.36543507924845176}
transition_probability:  {'B': {'E': 0.8532242244876338, 'M': 0.14677577551236617}, 'E': {'B': 0.48537965431249463, 'S': 0.5146203456875054}, 'S': {'S': 0.42896261210773334, 'B': 0.5710373878922667}, 'M': {'M': 0.3456713007046277, 'E': 0.6543286992953723}}


In [8]:
def convert_map_to_vector(probs_map, col_label2id):
    """conver to 1d vector"""
    v = np.zeros(len(col_label2id), dtype=float)
    for col, value in probs_map.items():
        v[col_label2id[col]] = value
    return v


def convert_map_to_matrix(probs_map, row_label2id, col_label2id):
    """convert to matrix"""
    m = np.zeros((len(row_label2id), len(col_label2id)), dtype=float)
    for row, cols in probs_map.items():
        for col, value in cols.items():
            m[row_label2id[row]][col_label2id[col]] = value
    return m

print(states_id2label)
A = convert_map_to_matrix(transition_probability, states_label2id, states_label2id)
print('状态转移矩阵 A: ', A)
pi = convert_map_to_vector(start_probability, states_label2id)
print('初始状态概率向量 π: ', pi)
B = convert_map_to_matrix(emission_probability, states_label2id, vocabulary_label2id)
print('观测概率矩阵 B: ', B)

{0: 'B', 1: 'M', 2: 'E', 3: 'S'}
状态转移矩阵 A:  [[0.         0.14677578 0.85322422 0.        ]
 [0.         0.3456713  0.6543287  0.        ]
 [0.48537965 0.         0.         0.51462035]
 [0.57103739 0.         0.         0.42896261]]
初始状态概率向量 π:  [0.63456492 0.         0.         0.36543508]
观测概率矩阵 B:  [[1.60963457e-03 2.92194810e-04 2.05048990e-05 ... 3.41748316e-06
  4.95535058e-05 2.05048990e-05]
 [1.63778328e-03 2.66615883e-04 5.33231765e-05 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.19611911e-03 6.83496632e-05 6.83496632e-06 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.73425497e-04 6.47963394e-05 7.62309875e-06 ... 1.90577469e-06
  1.90577469e-06 1.90577469e-06]]


In [9]:
import json
with open('./hmm-zh-segmentation.json', 'w', encoding='utf8') as f:
    json.dump({
        'states': states,
        'start_probability': start_probability,
        'transition_probability': transition_probability,
        'emission_probability': emission_probability,
    }, f, indent=4)

初始概率向量 pi
转移矩阵 A
观测概率矩阵 B
状态 states

均已知，给出 observations 即可用 Viterbi 算法计算最匹配 state 序列。

In [10]:
observations = '小明是个好学生'
observations_index = [vocabulary_label2id[o] for o in observations]
print(observations_index)

[1108, 2689, 505, 4547, 756, 1473, 3139]


In [11]:
B_partial = np.zeros([len(states), len(observations_index)], dtype=float)
for i, o in enumerate(observations_index):
    B_partial[:, i] = B[:, o]

print('初始概率 pi')
print(', '.join(map(lambda x: '%.4f' % x, pi)))
print('转移矩阵 A')
for row in A:
    print(', '.join(map(lambda x: '%.4f' % x, row)))
print('观测矩阵 B')
for row in B_partial:
    print(', '.join(map(lambda x: '%.4f' % x, row)))

初始概率 pi
0.6346, 0.0000, 0.0000, 0.3654
转移矩阵 A
0.0000, 0.1468, 0.8532, 0.0000
0.0000, 0.3457, 0.6543, 0.0000
0.4854, 0.0000, 0.0000, 0.5146
0.5710, 0.0000, 0.0000, 0.4290
观测矩阵 B
0.0027, 0.0015, 0.0003, 0.0008, 0.0006, 0.0031, 0.0048
0.0034, 0.0012, 0.0002, 0.0006, 0.0005, 0.0051, 0.0029
0.0003, 0.0028, 0.0022, 0.0052, 0.0020, 0.0030, 0.0038
0.0011, 0.0001, 0.0191, 0.0050, 0.0024, 0.0003, 0.0001


In [12]:
observations = '清华大学'
observations_index = [vocabulary_label2id[o] for o in observations]
print(observations_index)

[1070, 552, 2884, 1473]


In [13]:
B_partial = np.zeros([len(states), len(observations_index)], dtype=float)
for i, o in enumerate(observations_index):
    B_partial[:, i] = B[:, o]

print('初始概率 pi')
print(', '.join(map(lambda x: '%.4f' % x, pi)))
print('转移矩阵 A')
for row in A:
    print(', '.join(map(lambda x: '%.4f' % x, row)))
print('观测矩阵 B')
for row in B_partial:
    print(', '.join(map(lambda x: '%.4f' % x, row)))

初始概率 pi
0.6346, 0.0000, 0.0000, 0.3654
转移矩阵 A
0.0000, 0.1468, 0.8532, 0.0000
0.0000, 0.3457, 0.6543, 0.0000
0.4854, 0.0000, 0.0000, 0.5146
0.5710, 0.0000, 0.0000, 0.4290
观测矩阵 B
0.0009, 0.0013, 0.0086, 0.0031
0.0003, 0.0108, 0.0055, 0.0051
0.0005, 0.0013, 0.0049, 0.0030
0.0001, 0.0002, 0.0042, 0.0003
