In [1]:
import numpy as np
from collections import defaultdict

def estimate_hmm_parameters(train_file):
    """Estimate add-one-smoothed HMM parameters from a tagged training file.

    The file is read as UTF-8. Every non-blank line is split on whitespace;
    the first token is the word (lower-cased before counting) and the second
    token is its tag. Lines with fewer than two tokens are ignored. A blank
    line ends the current sentence, and a sentence still pending when the file
    ends is counted as well, so a missing trailing blank line no longer drops
    the last sentence.

    Args:
        train_file: Path of the training file.

    Returns:
        A tuple ``(pi_prob, A_prob, B_prob)`` of plain dicts:
        ``pi_prob[tag]`` is the smoothed probability that a sentence starts
        with ``tag``; ``A_prob[prev_tag][tag]`` is the smoothed transition
        probability; ``B_prob[tag][word]`` is the smoothed emission
        probability for every word seen in training.
    """
    pi = defaultdict(int)  # sentence-initial tag counts
    A = defaultdict(lambda: defaultdict(int))  # tag -> next-tag counts
    B = defaultdict(lambda: defaultdict(int))  # tag -> word counts

    vocab = set()
    tag_counts = defaultdict(int)  # total occurrences of each tag
    total_sentences = 0

    def iter_sentences(lines):
        """Yield each non-empty sentence as a list of (word, tag) pairs."""
        sentence = []
        for raw_line in lines:
            parts = raw_line.split()
            if not parts:
                # Blank line: sentence boundary.
                if sentence:
                    yield sentence
                    sentence = []
            elif len(parts) >= 2:
                sentence.append((parts[0].lower(), parts[1]))
        # Flush the final sentence when the file has no trailing blank line.
        if sentence:
            yield sentence

    with open(train_file, "r", encoding="utf-8") as f:
        for sentence in iter_sentences(f):
            total_sentences += 1
            pi[sentence[0][1]] += 1

            prev_tag = None
            for word, tag in sentence:
                vocab.add(word)
                tag_counts[tag] += 1
                B[tag][word] += 1
                if prev_tag is not None:
                    A[prev_tag][tag] += 1
                prev_tag = tag

    # Convert counts to probabilities with add-one (Laplace) smoothing.
    vocab_size = len(vocab)
    num_tags = len(tag_counts)

    # Initial-state distribution.
    pi_prob = {
        tag: (pi.get(tag, 0) + 1) / (total_sentences + num_tags)
        for tag in tag_counts
    }

    # Transition distribution, one row per previous tag.
    A_prob = {}
    for prev_tag in tag_counts:
        total_transitions = sum(A[prev_tag].values())
        A_prob[prev_tag] = {
            tag: (A[prev_tag].get(tag, 0) + 1) / (total_transitions + num_tags)
            for tag in tag_counts
        }

    # Emission distribution over the training vocabulary, one row per tag.
    B_prob = {}
    for tag in tag_counts:
        total_words = tag_counts[tag]
        B_prob[tag] = {
            word: (B[tag].get(word, 0) + 1) / (total_words + vocab_size)
            for word in vocab
        }

    return pi_prob, A_prob, B_prob

In [2]:
def viterbi_decode(obs_sequence, states, pi, A, B):
    """Return the most likely tag sequence for ``obs_sequence`` under an HMM.

    Scores are accumulated in log space so that long sequences do not
    underflow to zero. Any probability missing from ``pi``, ``A`` or ``B``
    falls back to ``1e-6``.

    When the tag set contains ``M-``/``E-`` tags (BMES scheme), ill-formed
    tag bigrams are excluded: a ``B-`` or ``M-`` tag must be followed by an
    ``M-`` or ``E-`` tag, and an ``E-`` tag must not be followed by an ``M-``
    or ``E-`` tag. Tag sets without ``M-``/``E-`` tags (e.g. BIO) are decoded
    without hard constraints, because the BMES rules would forbid every
    successor of a ``B-`` tag there.

    Args:
        obs_sequence: Sequence of observed words.
        states: List of tag names; defines the index order used internally.
        pi: Mapping tag -> initial probability.
        A: Mapping prev_tag -> {tag -> transition probability}.
        B: Mapping tag -> {word -> emission probability}.

    Returns:
        A list of tags from ``states``, one per observation. An empty
        ``obs_sequence`` yields an empty list.
    """
    T = len(obs_sequence)
    N = len(states)
    if T == 0:
        return []

    def log_prob(p):
        # Guard against log(0); a zero probability becomes -inf.
        return np.log(p) if p > 0 else -np.inf

    def bmes_transition_allowed(prev_s, s):
        continues_entity = s.startswith(("M-", "E-"))
        if prev_s.startswith(("B-", "M-")):
            return continues_entity
        if prev_s.startswith("E-"):
            return not continues_entity
        return True

    def log_emissions(word):
        return np.array([log_prob(B.get(s, {}).get(word, 1e-6)) for s in states])

    # log_A[i, j]: log transition score from states[i] to states[j].
    log_A = np.array(
        [[log_prob(A.get(prev_s, {}).get(s, 1e-6)) for s in states] for prev_s in states],
        dtype=float,
    ).reshape(N, N)

    # Only enforce BMES well-formedness when the tag set actually is BMES.
    if any(s.startswith(("M-", "E-")) for s in states):
        for i, prev_s in enumerate(states):
            for j, s in enumerate(states):
                if not bmes_transition_allowed(prev_s, s):
                    log_A[i, j] = -np.inf

    delta = np.full((T, N), -np.inf)  # best log score ending in each state
    psi = np.zeros((T, N), dtype=int)  # backpointers

    # Step 1: initialisation with the first observation.
    log_pi = np.array([log_prob(pi.get(s, 1e-6)) for s in states])
    delta[0] = log_pi + log_emissions(obs_sequence[0])

    # Step 2: dynamic programming over the remaining observations.
    for t in range(1, T):
        scores = delta[t - 1][:, None] + log_A  # rows: previous, cols: current
        psi[t] = np.argmax(scores, axis=0)  # first maximum wins on ties
        delta[t] = np.max(scores, axis=0) + log_emissions(obs_sequence[t])

    # Step 3: backtrack from the best final state.
    best_path = [int(np.argmax(delta[-1]))]
    for t in range(T - 1, 0, -1):
        best_path.append(int(psi[t][best_path[-1]]))
    best_path.reverse()

    return [states[i] for i in best_path]


# Tag a validation file sentence by sentence and write the predictions.
def process_validation_file(input_file, output_file, states, pi, A, B):
    """Decode every sentence of ``input_file`` and write ``word tag`` lines.

    Both files are opened as UTF-8. The first whitespace-separated token of
    each non-blank input line is taken as the word; a blank line ends the
    current sentence. Words are lower-cased for decoding, while the output
    keeps the original spelling. Each decoded sentence is written as one
    ``"{word} {tag}"`` line per word followed by an empty line. A sentence
    still pending at end of file is decoded and written too, so input without
    a trailing blank line no longer loses its last sentence.

    Args:
        input_file: Path of the file to tag.
        output_file: Path of the file to write (overwritten).
        states, pi, A, B: Passed unchanged to ``viterbi_decode``.
    """
    def write_sentence(sentence_words, fout):
        if not sentence_words:
            return
        lowered = [word.lower() for word in sentence_words]
        predicted_tags = viterbi_decode(lowered, states, pi, A, B)
        for word, tag in zip(sentence_words, predicted_tags):
            fout.write(f"{word} {tag}\n")
        fout.write("\n")

    current_sentence = []
    with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
        for line in fin:
            parts = line.split()
            if parts:
                # Non-blank line: keep only the word column.
                current_sentence.append(parts[0])
            else:
                # Blank line: the sentence is complete.
                write_sentence(current_sentence, fout)
                current_sentence = []
        # Flush the final sentence when the file has no trailing blank line.
        write_sentence(current_sentence, fout)

In [3]:
# Chinese data: BMES tag set, one row per entity type (B, M, E, S).
states = [
    "O",
    "B-NAME", "M-NAME", "E-NAME", "S-NAME",
    "B-CONT", "M-CONT", "E-CONT", "S-CONT",
    "B-EDU", "M-EDU", "E-EDU", "S-EDU",
    "B-TITLE", "M-TITLE", "E-TITLE", "S-TITLE",
    "B-ORG", "M-ORG", "E-ORG", "S-ORG",
    "B-RACE", "M-RACE", "E-RACE", "S-RACE",
    "B-PRO", "M-PRO", "E-PRO", "S-PRO",
    "B-LOC", "M-LOC", "E-LOC", "S-LOC",
]

# Fit the HMM on the training split, then tag the validation split.
pi, A, B = estimate_hmm_parameters(train_file="Chinese/train.txt")
process_validation_file(
    input_file="Chinese/validation.txt",
    output_file="Chinese/validation_HMM.txt",
    states=states,
    pi=pi,
    A=A,
    B=B,
)

In [6]:
# English data: BIO tag set, one row per entity type (B, I).
# Note: this rebinds states, pi, A and B.
states = [
    "O",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
    "B-MISC", "I-MISC",
]

# Fit the HMM on the training split, then tag the validation split.
pi, A, B = estimate_hmm_parameters(train_file="English/train.txt")
process_validation_file(
    input_file="English/validation.txt",
    output_file="English/validation_HMM.txt",
    states=states,
    pi=pi,
    A=A,
    B=B,
)