In [11]:
import numpy as np
from collections import defaultdict


class CRF:
    def __init__(self, feature_templates, tags):
        """
        :param feature_templates: 特征模板列表
        :param tags: 标签列表
        """
        self.feature_templates = feature_templates
        self.tags = tags
        self.tag_to_idx = {tag: i for i, tag in enumerate(tags)}
        self.idx_to_tag = {i: tag for tag, i in self.tag_to_idx.items()}

        # 模型参数
        self.weights = defaultdict(float)  # 特征权重
        self.transition = np.zeros((len(tags), len(tags)))  # 转移矩阵

    def extract_features(self, sentence, position, prev_tag, current_tag):
        """根据特征模板提取特征"""
        features = []
        for template in self.feature_templates:
            # 处理Unigram特征 U08:%x[0,0]/%x[1,0]
            if template.startswith("U"):
                parts = template.split(":")[1].split("/")
                context = []
                for part in parts:
                    offset = int(part[3:-1].split(",")[0])
                    idx = position + offset

                    if idx < 0:
                        context.append("<BEG>")
                    elif idx >= len(sentence):
                        context.append("<END>")
                    else:
                        context.append(sentence[idx])
                features.append(f"{current_tag}::{template}:{'/'.join(context)}")

            # 处理Bigram特征 B08:%x[0,0]/%x[1,0]
            elif template.startswith("B"):
                if prev_tag is not None:
                    parts = template.split(":")[1].split("/")
                    context = []
                    for part in parts:
                        offset = int(part[3:-1].split(",")[0])
                        idx = position + offset
                        if idx < 0:
                            context.append("<BEG>")
                        elif idx >= len(sentence):
                            context.append("<END>")
                        else:
                            context.append(sentence[idx])
                    features.append(f"{prev_tag}→{current_tag}::{template}:{'/'.join(context)}")
        return features

    def forward_backward(self, sentence):
        """
        前向-后向算法计算特征期望
        :param sentence: 输入句子（单词列表）
        :return: (alpha, beta, log_Z)
        """
        T = len(sentence)
        N = len(self.tags)
        
        # 初始化
        alpha = np.zeros((T, N))
        beta = np.zeros((T, N))
        
        # 前向算法
        for t in range(T):
            for j in range(N):
                if t == 0:
                    # 初始状态
                    features = self.extract_features(sentence, t, None, self.idx_to_tag[j])
                    score = sum(self.weights[f] for f in features)
                    alpha[t][j] = score
                else:
                    sum_exp = 0
                    for i in range(N):
                        trans_score = self.transition[i][j]
                        features = self.extract_features(sentence, t, self.idx_to_tag[i], self.idx_to_tag[j])
                        emit_score = sum(self.weights[f] for f in features)
                        sum_exp += np.exp(alpha[t-1][i] + trans_score + emit_score)
                    alpha[t][j] = np.log(sum_exp) if sum_exp > 0 else -np.inf
        
        # 后向算法
        for t in reversed(range(T)):
            for i in range(N):
                if t == T - 1:
                    beta[t][i] = 0  # 终止状态
                else:
                    sum_exp = 0
                    for j in range(N):
                        trans_score = self.transition[i][j]
                        features = self.extract_features(sentence, t+1, self.idx_to_tag[i], self.idx_to_tag[j])
                        emit_score = sum(self.weights[f] for f in features)
                        sum_exp += np.exp(trans_score + emit_score + beta[t+1][j])
                    beta[t][i] = np.log(sum_exp) if sum_exp > 0 else -np.inf
        
        # 计算配分函数
        log_Z = np.log(sum(np.exp(alpha[-1]))) if any(alpha[-1] != -np.inf) else -np.inf
        
        return alpha, beta, log_Z

    def compute_gradient(self, sentence, true_tags):
        """计算梯度"""
        # 提取真实路径特征
        true_features = set()
        for t in range(len(sentence)):
            prev_tag = true_tags[t-1] if t > 0 else None
            features = self.extract_features(sentence, t, prev_tag, true_tags[t])
            true_features.update(features)
        
        # 计算模型期望特征（修正：只传入sentence）
        alpha, beta, log_Z = self.forward_backward(sentence)
        T = len(sentence)
        expected_features = defaultdict(float)
        
        for t in range(T):
            for i in range(len(self.tags)):
                for j in range(len(self.tags)):
                    if t == 0:
                        prev_tag = None
                    else:
                        prev_tag = self.idx_to_tag[i]
                    
                    current_tag = self.idx_to_tag[j]
                    features = self.extract_features(sentence, t, prev_tag, current_tag)
                    
                    # 计算概率
                    if t == 0:
                        prob = np.exp(alpha[t][j] + beta[t][j] - log_Z) if log_Z != -np.inf else 0
                    else:
                        trans_score = self.transition[i][j]
                        emit_score = sum(self.weights[f] for f in features)
                        prob = np.exp(alpha[t-1][i] + trans_score + emit_score + beta[t][j] - log_Z) if log_Z != -np.inf else 0
                    
                    # 累加特征期望
                    for f in features:
                        expected_features[f] += prob
        
        # 计算梯度
        gradient = defaultdict(float)
        for f in true_features:
            gradient[f] += 1  # 真实特征计数
        for f in expected_features:
            gradient[f] -= expected_features[f]  # 模型期望
        
        return gradient, log_Z

    def train(self, sentences, true_tag_sequences, max_iter=10, learning_rate=0.1):
        """训练CRF模型"""
        for iteration in range(max_iter):
            total_loss = 0
            for sentence, tags in zip(sentences, true_tag_sequences):
                # 计算梯度和损失
                gradient, log_Z = self.compute_gradient(sentence, tags)

                # 更新权重
                for f in gradient:
                    self.weights[f] += learning_rate * gradient[f]

                # 计算真实路径得分
                true_score = 0
                for t in range(len(sentence)):
                    prev_tag = tags[t - 1] if t > 0 else None
                    features = self.extract_features(sentence, t, prev_tag, tags[t])
                    true_score += sum(self.weights[f] for f in features)
                    if t > 0:
                        i = self.tag_to_idx[tags[t - 1]]
                        j = self.tag_to_idx[tags[t]]
                        true_score += self.transition[i][j]

                # 累加损失
                total_loss += log_Z - true_score

            print(f"Iteration {iteration}, Loss: {total_loss/len(sentences)}")

    def viterbi_decode(self, sentence):
        """Viterbi算法解码"""
        T = len(sentence)
        N = len(self.tags)

        # 初始化
        viterbi = np.zeros((T, N))
        backpointers = np.zeros((T, N), dtype=int)

        # 初始步
        for j in range(N):
            features = self.extract_features(sentence, 0, None, self.idx_to_tag[j])
            viterbi[0][j] = sum(self.weights[f] for f in features)

        # 递推
        for t in range(1, T):
            for j in range(N):
                max_score = -np.inf
                best_tag = 0
                for i in range(N):
                    trans_score = self.transition[i][j]
                    features = self.extract_features(sentence, t, self.idx_to_tag[i], self.idx_to_tag[j])
                    emit_score = sum(self.weights[f] for f in features)
                    score = viterbi[t - 1][i] + trans_score + emit_score
                    if score > max_score:
                        max_score = score
                        best_tag = i
                viterbi[t][j] = max_score
                backpointers[t][j] = best_tag

        # 回溯
        best_path = [np.argmax(viterbi[-1])]
        for t in reversed(range(1, T)):
            best_path.append(backpointers[t][best_path[-1]])
        best_path.reverse()

        return [self.idx_to_tag[i] for i in best_path]


# 示例使用
# 定义特征模板
feature_templates = [
    "U00:%x[-2,0]",
    "U01:%x[-1,0]",
    "U02:%x[0,0]",
    "U03:%x[1,0]",
    "U04:%x[2,0]",
    "U05:%x[-2,0]/%x[-1,0]",
    "U06:%x[-1,0]/%x[0,0]",
    "U07:%x[-1,0]/%x[1,0]",
    "U08:%x[0,0]/%x[1,0]",
    "U09:%x[1,0]/%x[2,0]",
]

# 定义标签集
tags = ["O", "B-NAME", "I-NAME", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

# 初始化CRF
crf = CRF(feature_templates, tags)

# 训练数据示例
train_sentences = [["张", "三", "在", "北京", "工作"], ["李", "四", "是", "腾讯", "员工"]]
train_tags = [["B-NAME", "I-NAME", "O", "B-LOC", "O"], ["B-NAME", "I-NAME", "O", "B-ORG", "O"]]

# 训练模型
crf.train(train_sentences, train_tags, max_iter=50)

# 预测新句子
test_sentence = ["王", "五", "来自", "上海"]
predicted_tags = crf.viterbi_decode(test_sentence)
print("预测结果:", list(zip(test_sentence, predicted_tags)))

Iteration 0, Loss: 6.344877557594868
Iteration 1, Loss: 4.212741980601052
Iteration 2, Loss: 2.958379688950451
Iteration 3, Loss: 2.5774879850168917
Iteration 4, Loss: 2.465733598601348
Iteration 5, Loss: 2.358909231774292
Iteration 6, Loss: 2.3489671006052513
Iteration 7, Loss: 2.3193085228320305
Iteration 8, Loss: 2.307462776857801
Iteration 9, Loss: 2.30196952852915
Iteration 10, Loss: 2.2933017430975684
Iteration 11, Loss: 2.2908562881073937
Iteration 12, Loss: 2.2870125057730535
Iteration 13, Loss: 2.284607761580781
Iteration 14, Loss: 2.2828926062658046
Iteration 15, Loss: 2.28114039884063
Iteration 16, Loss: 2.2800066395394145
Iteration 17, Loss: 2.278914452616242
Iteration 18, Loss: 2.27804057008705
Iteration 19, Loss: 2.2773135240320537
Iteration 20, Loss: 2.2766628336976553
Iteration 21, Loss: 2.2761230381940383
Iteration 22, Loss: 2.2756417992257987
Iteration 23, Loss: 2.275222947220656
Iteration 24, Loss: 2.274854428011335
Iteration 25, Loss: 2.2745252291664055
Iteration 26