In [1]:
import dill
import numpy as np
from collections import defaultdict

In [2]:
class CRF:
    def __init__(self, tags, feature_templates):
        """
        :param feature_templates: 特征模板列表
        :param tags: 标签列表
        """
        self.tags = tags
        self.feature_templates = feature_templates
        self.tag_to_idx = {tag: i for i, tag in enumerate(tags)}
        self.idx_to_tag = {i: tag for tag, i in self.tag_to_idx.items()}

        # 模型参数
        self.weights = defaultdict(lambda: np.random.randn() * 0.01)  # 特征权重
        self.transition = np.random.randn(len(tags), len(tags)) * 0.01  # 转移矩阵

    def legal_transition(self, i, j):
        i_tag = self.idx_to_tag[i]
        j_tag = self.idx_to_tag[j]
        if i_tag.startswith("B-") and not (j_tag.startswith("M-") or j_tag.startswith("E-")):
            return False
        if i_tag.startswith("M-") and not (j_tag.startswith("M-") or j_tag.startswith("E-")):
            return False
        if i_tag.startswith("E-") and (j_tag.startswith("M-") or j_tag.startswith("E-")):
            return False
        if i_tag.startswith("S-") and (j_tag.startswith("M-") or j_tag.startswith("E-")):
            return False
        return True

    @staticmethod
    def extract_features_helper(sentence, position, template):
        """提取特征模板中的偏移量"""
        context = []  # U08:%x[0,0]/%x[1,0]
        parts = template.split(":")[1].split("/")
        for part in parts:
            offset = int(part[3:-1].split(",")[0])
            idx = position + offset
            if idx < 0:
                context.append("<BEG>")
            elif idx >= len(sentence):
                context.append("<END>")
            else:
                context.append(sentence[idx])
        return context

    def extract_features(self, sentence, position, prev_tag, cur_tag):
        """根据特征模板提取特征"""
        features = []
        for template in self.feature_templates:
            # 处理Unigram特征
            if template.startswith("U"):
                context = self.extract_features_helper(sentence, position, template)
                features.append(f"{cur_tag}::{template}:{'/'.join(context)}")

            # 处理Bigram特征
            elif template.startswith("B"):
                if prev_tag is not None:
                    context = self.extract_features_helper(sentence, position, template)
                    features.append(f"{prev_tag}→{cur_tag}::{template}:{'/'.join(context)}")
        return features

    def forward_backward(self, sentence):
        """前向后向算法"""
        T = len(sentence)
        N = len(self.tags)

        # 初始化
        alpha = np.zeros((T, N))
        beta = np.zeros((T, N))

        # 前向算法
        # alpha_t(j)=\sum_{i=1}^N alpha_{t-1}(i) * T(i,j) * E_t(j)
        for t in range(T):
            for j in range(N):
                if t == 0:
                    features = self.extract_features(sentence, t, None, self.idx_to_tag[j])
                    alpha[t][j] = sum(self.weights[f] for f in features)
                else:
                    log_probs = []
                    for i in range(N):
                        trans = self.transition[i][j]  # T(i,j)
                        features = self.extract_features(sentence, t, self.idx_to_tag[i], self.idx_to_tag[j])
                        emit = sum(self.weights[f] for f in features)  # E_t(j)
                        log_probs.append(alpha[t - 1][i] + trans + emit)
                    alpha[t][j] = np.logaddexp.reduce(log_probs) if log_probs else -np.inf

        # 后向算法
        # beta_t(i)=\sum_{j=1}^N T(i,j) * E_{t+1}(j) * beta_{t+1}(j)
        for t in reversed(range(T)):
            for i in range(N):
                if t == T - 1:
                    beta[t][i] = 0
                else:
                    log_probs = []
                    for j in range(N):
                        trans = self.transition[i][j]  # T(i,j)
                        features = self.extract_features(sentence, t + 1, self.idx_to_tag[i], self.idx_to_tag[j])
                        emit = sum(self.weights[f] for f in features)  # E_{t+1}(j)
                        log_probs.append(trans + emit + beta[t + 1][j])
                    beta[t][i] = np.logaddexp.reduce(log_probs) if log_probs else -np.inf

        # 配分函数
        # Z=\sum_{j=1}^N alpha_T(j)
        log_Z = np.log(sum(np.exp(alpha[-1]))) if any(np.isfinite(alpha[-1])) else -np.inf
        return alpha, beta, log_Z

    def compute_gradient(self, sentence, true_tags):
        """计算梯度"""
        # 提取真实路径特征
        true_features = set()
        for t in range(len(sentence)):
            prev_tag = true_tags[t - 1] if t > 0 else None
            features = self.extract_features(sentence, t, prev_tag, true_tags[t])
            true_features.update(features)

        # 前向后向算法
        alpha, beta, log_Z = self.forward_backward(sentence)
        expected_features = defaultdict(float)

        # 计算特征期望
        # P(y_0=j|x)            = alpha_0(j) * E_0(j) / Z
        # P(y_{t-1}=i, y_t=j|x) = alpha_{t-1}(i) * T(i,j) * E_t(j) * beta_t(j) / Z
        for t in range(len(sentence)):
            for i in range(len(self.tags)):
                for j in range(len(self.tags)):
                    # 提取特征
                    cur_tag = self.idx_to_tag[j]
                    prev_tag = self.idx_to_tag[i] if t > 0 else None
                    features = self.extract_features(sentence, t, prev_tag, cur_tag)

                    # 计算概率
                    if t == 0:
                        prob = np.exp(alpha[t][j] + beta[t][j] - log_Z) if log_Z != -np.inf else 0
                    else:
                        trans_score = self.transition[i][j]
                        emit_score = sum(self.weights[f] for f in features)
                        prob = np.exp(alpha[t - 1][i] + trans_score + emit_score + beta[t][j] - log_Z) if log_Z != -np.inf else 0

                    # 累加特征期望
                    for f in features:
                        expected_features[f] += prob

        # 计算权重梯度
        # weight_grad[f] = true - expected
        weight_grad = defaultdict(float)
        for f in true_features:
            weight_grad[f] += 1
        for f in expected_features:
            weight_grad[f] -= expected_features[f]

        # 计算转移矩阵梯度
        # transition_grad[f] = true - expected
        transition_grad = np.zeros_like(self.transition)
        for t in range(1, len(sentence)):
            i = self.tag_to_idx[true_tags[t - 1]]
            j = self.tag_to_idx[true_tags[t]]
            transition_grad[i][j] += 1

            for i_model in range(len(self.tags)):
                for j_model in range(len(self.tags)):
                    features = self.extract_features(sentence, t, self.idx_to_tag[i_model], self.idx_to_tag[j_model])
                    prob = np.exp(alpha[t - 1][i_model] + self.transition[i_model][j_model] + sum(self.weights[f] for f in features) + beta[t][j_model] - log_Z) if log_Z != -np.inf else 0
                    transition_grad[i_model][j_model] -= prob

        return weight_grad, transition_grad, log_Z

    def train(self, sentences, true_tags_seq, batch_size, max_iter, learning_rate):
        for iteration in range(max_iter):
            total_loss = 0
            batch_indices = range(0, len(sentences), batch_size)

            for start_idx in batch_indices:
                end_idx = start_idx + batch_size
                batch_sentences = sentences[start_idx:end_idx]
                batch_tags = true_tags_seq[start_idx:end_idx]
                print(f"- 批次 {start_idx}-{end_idx}/{len(sentences)}", end="  ")

                # 初始化累积变量
                batch_weights_grad = defaultdict(float)
                batch_transition_grad = np.zeros_like(self.transition)
                batch_loss = 0.0

                # 计算批次内所有样本的梯度
                for sentence, tags in zip(batch_sentences, batch_tags):
                    # 计算单个样本的梯度
                    weights_grad, transition_grad, log_Z = self.compute_gradient(sentence, tags)

                    # 累积权重梯度
                    for f in weights_grad:
                        batch_weights_grad[f] += weights_grad[f]

                    # 累积转移矩阵梯度
                    batch_transition_grad += transition_grad

                    # 计算单个样本的损失
                    true_score = self._compute_single_score(sentence, tags)
                    batch_loss += log_Z - true_score

                # 计算批次平均梯度
                batch_size_actual = len(batch_sentences)
                for f in batch_weights_grad:
                    batch_weights_grad[f] /= batch_size_actual
                batch_transition_grad /= batch_size_actual
                batch_loss /= batch_size_actual

                # 使用平均梯度更新参数
                for f in batch_weights_grad:
                    self.weights[f] += learning_rate * batch_weights_grad[f]

                for i in range(len(self.tags)):
                    for j in range(len(self.tags)):
                        if self.legal_transition(i, j):
                            self.transition[i][j] += learning_rate * batch_transition_grad[i][j]

                loss = batch_loss * batch_size_actual
                print(f"Loss={loss/batch_size_actual:.2f}")
                total_loss += loss

            print(f"迭代次数 {iteration}, Loss={total_loss/len(sentences):.2f}")

    def _compute_single_score(self, sentence, tags):
        """计算单个样本的真实路径得分"""
        score = 0
        for t in range(len(sentence)):
            prev_tag = tags[t - 1] if t > 0 else None
            features = self.extract_features(sentence, t, prev_tag, tags[t])
            score += sum(self.weights[f] for f in features)
            if t > 0:
                i = self.tag_to_idx[tags[t - 1]]
                j = self.tag_to_idx[tags[t]]
                score += self.transition[i][j]
        return score

    def viterbi_decode(self, sentence):
        T, N = len(sentence), len(self.tags)
        viterbi = np.full((T, N), -np.inf)  # 初始为负无穷
        backpointers = np.zeros((T, N), dtype=int)

        # 初始步：开头仅允许B-,S-,O
        for j in range(N):
            tag = self.idx_to_tag[j]
            if tag.startswith(("B-", "S-")) or tag == "O":
                features = self.extract_features(sentence, 0, None, tag)
                viterbi[0][j] = sum(self.weights[f] for f in features)

        # 递推步：仅允许合法转移
        for t in range(1, T):
            for j in range(N):
                max_score = -np.inf
                best_i = -1
                for i in range(N):
                    if not self.legal_transition(i, j):  # 跳过非法转移
                        continue
                    score = viterbi[t - 1][i] + self.transition[i][j]
                    features = self.extract_features(sentence, t, self.idx_to_tag[i], self.idx_to_tag[j])
                    score += sum(self.weights[f] for f in features)
                    if score > max_score:
                        max_score = score
                        best_i = i
                if best_i != -1:  # 确保存在合法前驱
                    viterbi[t][j] = max_score
                    backpointers[t][j] = best_i

        # 回溯
        best_path = [np.argmax(viterbi[-1])]
        for t in reversed(range(1, T)):
            best_path.append(backpointers[t][best_path[-1]])
        best_path.reverse()

        return [self.idx_to_tag[i] for i in best_path]

In [3]:
# 读取训练集
def train_dataset(train_file):
    train_sentences, train_tags = [], []
    with open(train_file, "r", encoding="utf-8") as f:
        cur_sentence = []
        for line in f:
            line = line.strip()
            if not line:
                if cur_sentence:
                    train_sentences.append([word for word, _ in cur_sentence])
                    train_tags.append([tag for _, tag in cur_sentence])
                    cur_sentence = []
            else:
                parts = line.split()
                cur_sentence.append((parts[0], parts[1]))

    return train_sentences, train_tags

In [4]:
def process_validation_file(input_file, output_file, crf):
    current_sentence = []
    with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
        for line in fin:
            if line.strip() == "":
                # 处理一个完整句子
                if current_sentence:
                    words = [word.lower() for word, _ in current_sentence]
                    predicted_tags = crf.viterbi_decode(words)
                    for (word, _), tag in zip(current_sentence, predicted_tags):
                        fout.write(f"{word} {tag}\n")
                    fout.write("\n")
                    current_sentence = []
            else:
                # 非空行，读取单词
                parts = line.split()
                word = parts[0]
                current_sentence.append((word, None))

# Chinese

In [None]:
# 重新初始化CRF模型
# tags = ["O", "B-NAME", "M-NAME", "E-NAME", "S-NAME", "B-CONT", "M-CONT", "E-CONT", "S-CONT", "B-EDU", "M-EDU", "E-EDU", "S-EDU", "B-TITLE", "M-TITLE", "E-TITLE", "S-TITLE", "B-ORG", "M-ORG", "E-ORG", "S-ORG", "B-RACE", "M-RACE", "E-RACE", "S-RACE", "B-PRO", "M-PRO", "E-PRO", "S-PRO", "B-LOC", "M-LOC", "E-LOC", "S-LOC"]
# feature_templates = [
#     "U01:%x[-1,0]",  # 前一个词
#     "U02:%x[0,0]",  # 当前词
# ]
# crf = CRF(tags, feature_templates)

In [18]:
# 从文件加载CRF模型
with open("crf_Chinese.pkl", "rb") as f:
    crf = dill.load(f)

In [None]:
# 读取中文训练集
train_sentences, train_tags = train_dataset("Chinese/train_cut.txt")
# train_sentences, train_tags = train_dataset("Chinese/train.txt")

In [7]:
crf.train(train_sentences, train_tags, batch_size=16, max_iter=1, learning_rate=0.1)

- 批次 0-16/3820  Loss=14.17
- 批次 16-32/3820  Loss=11.43
- 批次 32-48/3820  Loss=16.58
- 批次 48-64/3820  Loss=12.62
- 批次 64-80/3820  Loss=13.77
- 批次 80-96/3820  

  log_Z = np.log(sum(np.exp(alpha[-1]))) if any(np.isfinite(alpha[-1])) else -np.inf


Loss=inf
- 批次 96-112/3820  Loss=15.71
- 批次 112-128/3820  Loss=23.76
- 批次 128-144/3820  Loss=27.57
- 批次 144-160/3820  Loss=25.44
- 批次 160-176/3820  Loss=18.39
- 批次 176-192/3820  Loss=14.77
- 批次 192-208/3820  Loss=18.83
- 批次 208-224/3820  Loss=19.18
- 批次 224-240/3820  Loss=15.80
- 批次 240-256/3820  Loss=21.92
- 批次 256-272/3820  Loss=21.69
- 批次 272-288/3820  Loss=18.95
- 批次 288-304/3820  Loss=23.98
- 批次 304-320/3820  Loss=13.92
- 批次 320-336/3820  Loss=inf
- 批次 336-352/3820  Loss=27.40
- 批次 352-368/3820  Loss=29.62
- 批次 368-384/3820  Loss=24.77
- 批次 384-400/3820  Loss=17.87
- 批次 400-416/3820  Loss=22.94
- 批次 416-432/3820  Loss=13.38
- 批次 432-448/3820  Loss=13.61
- 批次 448-464/3820  Loss=18.04
- 批次 464-480/3820  Loss=21.27
- 批次 480-496/3820  Loss=inf
- 批次 496-512/3820  Loss=15.02
- 批次 512-528/3820  Loss=19.92
- 批次 528-544/3820  Loss=14.23
- 批次 544-560/3820  Loss=13.87
- 批次 560-576/3820  Loss=15.60
- 批次 576-592/3820  Loss=10.02
- 批次 592-608/3820  Loss=11.21
- 批次 608-624/3820  Loss=9.01
- 批次 62

In [8]:
# 将CRF模型保存到文件
dill.dump(crf, open("crf_Chinese.pkl", "wb"))

In [19]:
# 测试中文模型
# process_validation_file("Chinese/validation.txt", "Chinese/validation_CRF.txt", crf)
process_validation_file("Chinese/chinese_test.txt", "Chinese/chinese_test_CRF.txt", crf)

# English

In [None]:
# 初始化CRF模型
tags = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]
feature_templates = [
    "U01:%x[-1,0]",  # 前一个词
    "U02:%x[0,0]",  # 当前词
]
crf = CRF(tags, feature_templates)

In [21]:
with open("crf_English.pkl", "rb") as f:
    crf = dill.load(f)

In [None]:
# 读取英文训练集
train_sentences, train_tags = train_dataset("English/train.txt")

In [14]:
crf.train(train_sentences, train_tags, batch_size=16, max_iter=5, learning_rate=0.2)


- 批次 0-16/14041  Loss=33.49
- 批次 16-32/14041  Loss=10.49
- 批次 32-48/14041  Loss=12.77
- 批次 48-64/14041  Loss=9.98
- 批次 64-80/14041  Loss=13.26
- 批次 80-96/14041  Loss=13.00
- 批次 96-112/14041  Loss=9.48
- 批次 112-128/14041  Loss=10.68
- 批次 128-144/14041  Loss=8.87
- 批次 144-160/14041  Loss=9.82
- 批次 160-176/14041  Loss=12.36
- 批次 176-192/14041  Loss=13.24
- 批次 192-208/14041  Loss=9.11
- 批次 208-224/14041  Loss=11.60
- 批次 224-240/14041  Loss=11.35
- 批次 240-256/14041  Loss=10.10
- 批次 256-272/14041  Loss=6.54
- 批次 272-288/14041  Loss=10.35
- 批次 288-304/14041  Loss=11.36
- 批次 304-320/14041  Loss=7.51
- 批次 320-336/14041  Loss=8.78
- 批次 336-352/14041  Loss=12.07
- 批次 352-368/14041  Loss=11.75
- 批次 368-384/14041  Loss=10.51
- 批次 384-400/14041  Loss=14.10
- 批次 400-416/14041  Loss=10.90
- 批次 416-432/14041  Loss=9.62
- 批次 432-448/14041  Loss=11.12
- 批次 448-464/14041  Loss=15.21
- 批次 464-480/14041  Loss=12.16
- 批次 480-496/14041  Loss=7.59
- 批次 496-512/14041  Loss=13.63
- 批次 512-528/14041  Loss=7.36
- 

KeyboardInterrupt: 

In [15]:
# 将CRF模型保存到文件
dill.dump(crf, open("crf_English.pkl", "wb"))

In [22]:
# 测试英文模型
# process_validation_file("English/validation.txt", "English/validation_CRF.txt", crf)
process_validation_file("English/english_test.txt", "English/english_test_CRF.txt", crf)