In [1]:
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch-first input.

    A table of shape ``(1, max_len, d_model)`` is precomputed and stored as the
    non-trainable buffer ``pe``. Even feature columns hold ``sin`` values and
    odd feature columns hold ``cos`` values.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine columns,
        # so trim div_term to the number of odd columns to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, feature); its position dimension
        is used to slice the precomputed table.
        """
        x = x + self.pe[:, : x.size(1)]
        return x

In [3]:
class TransformerCRF(nn.Module):
    """Transformer encoder with a linear-chain CRF layer for sequence tagging.

    ``transitions[i, j]`` is the score of moving FROM tag ``i`` TO tag ``j``;
    every method in this class indexes the matrix with that convention.

    Args:
        vocab_size: Number of rows in the token embedding table.
        tag_to_ix: Mapping from tag string to index. Must contain the special
            tags ``"<START>"``, ``"<STOP>"`` and ``"<PAD>"``.
        d_model: Embedding / encoder feature size.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    # Score assigned to forbidden transitions and to unreachable start states.
    NEG_SCORE = -10000.0

    def __init__(self, vocab_size, tag_to_ix, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.ix_to_tag = {v: k for k, v in tag_to_ix.items()}
        self.tagset_size = len(tag_to_ix)

        # Transformer encoder
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)

        # Emission layer
        self.hidden2tag = nn.Linear(d_model, self.tagset_size)

        # CRF transition scores (init_transitions creates the parameter)
        self.init_transitions()

    def forward(self, x, mask):
        """Compute emission scores.

        Args:
            x: Token-id tensor, (batch, seq_len).
            mask: Boolean tensor, (batch, seq_len); True marks real tokens. Its
                negation is passed to the encoder as ``src_key_padding_mask``.

        Returns:
            Tensor of shape (batch, seq_len, tagset_size).
        """
        embeds = self.embedding(x)  # (batch, seq_len, d_model)
        embeds = self.pos_encoder(embeds)
        transformer_out = self.transformer(embeds, src_key_padding_mask=~mask)
        return self.hidden2tag(transformer_out)

    def legal_transition(self, i, j):
        """Return whether tag index ``i`` may be followed by tag index ``j``.

        The rules are keyed on the ``O`` / ``B-`` / ``M-`` / ``E-`` prefixes:

        * ``O`` and ``E-*`` may not be followed by ``M-*`` or ``E-*``.
        * ``B-X`` and ``M-X`` may only be followed by ``M-X`` or ``E-X``.
        * Any other source tag (e.g. the special tags) is unrestricted here.

        Note that with a tag set that has no ``M-`` / ``E-`` tags, every
        successor of a ``B-`` tag is reported as illegal.
        """
        i_tag = self.ix_to_tag[i]
        j_tag = self.ix_to_tag[j]
        j_continues_entity = j_tag.startswith(("M-", "E-"))

        if i_tag == "O" or i_tag.startswith("E-"):
            # Outside an entity: the next tag cannot continue one.
            return not j_continues_entity

        if i_tag.startswith(("B-", "M-")):
            # Inside an entity: the next tag must continue the same type.
            return j_continues_entity and i_tag[2:] == j_tag[2:]

        return True

    def init_transitions(self):
        """(Re)create ``self.transitions`` with random scores and hard constraints.

        Illegal tag-to-tag moves and all moves touching the special tags in the
        wrong direction are set to ``NEG_SCORE``. The values are only an
        initialisation: the matrix is a trainable parameter.
        """
        start = self.tag_to_ix["<START>"]
        stop = self.tag_to_ix["<STOP>"]
        pad = self.tag_to_ix["<PAD>"]

        trans = torch.randn(self.tagset_size, self.tagset_size)

        # Forbid transitions that break the tagging scheme.
        for i in range(self.tagset_size):
            for j in range(self.tagset_size):
                if not self.legal_transition(i, j):
                    trans[i, j] = self.NEG_SCORE

        # Special-tag constraints, expressed in the [from, to] convention used
        # by neg_log_likelihood / _compute_log_partition / viterbi_decode.
        trans[:, start] = self.NEG_SCORE  # nothing may transition INTO <START>
        trans[stop, :] = self.NEG_SCORE  # nothing may transition OUT OF <STOP>
        trans[pad, :] = self.NEG_SCORE  # <PAD> cannot be left
        trans[:, pad] = self.NEG_SCORE  # <PAD> cannot be entered

        self.transitions = nn.Parameter(trans)

    def neg_log_likelihood(self, sentences, tags, masks):
        """Mean CRF negative log-likelihood of the gold tag sequences.

        Args:
            sentences: Token ids, (batch, seq_len).
            tags: Gold tag ids, (batch, seq_len).
            masks: Boolean mask, (batch, seq_len). The gold score reads the
                first ``masks[i].sum()`` positions of each row, i.e. real
                tokens are expected at the start of the row.

        Returns:
            Scalar tensor: mean over the batch of ``log Z - gold_score``.
        """
        emissions = self.forward(sentences, masks)
        batch_size = sentences.size(0)
        start = self.tag_to_ix["<START>"]
        stop = self.tag_to_ix["<STOP>"]

        # Score of the gold path for every sentence.
        score = torch.zeros(batch_size).to(sentences.device)
        for i in range(batch_size):
            # Number of real tokens in this row.
            length = masks[i].sum().item()
            if length == 0:
                continue

            # <START> -> first tag, plus the first emission.
            score[i] = self.transitions[start, tags[i, 0]]
            score[i] += emissions[i, 0, tags[i, 0]]

            # Remaining transitions and emissions.
            for t in range(1, length):
                score[i] += self.transitions[tags[i, t - 1], tags[i, t]] + emissions[i, t, tags[i, t]]

            # Last tag -> <STOP>.
            score[i] += self.transitions[tags[i, length - 1], stop]

        # Log partition function over all paths.
        log_Z = self._compute_log_partition(emissions, masks)

        return (log_Z - score).mean()

    def _compute_log_partition(self, emissions, masks):
        """Forward algorithm: log-sum-exp score of all tag paths per sentence.

        Args:
            emissions: (batch, seq_len, tagset_size).
            masks: Boolean, (batch, seq_len); positions that are False leave the
                running scores unchanged.

        Returns:
            Tensor of shape (batch,).
        """
        batch_size, seq_len, _ = emissions.shape
        device = emissions.device

        # All probability mass starts on <START>.
        alpha = torch.full((batch_size, self.tagset_size), self.NEG_SCORE, device=device)
        alpha[:, self.tag_to_ix["<START>"]] = 0.0

        trans_expanded = self.transitions.unsqueeze(0)  # (1, from, to)

        for t in range(seq_len):
            mask_t = masks[:, t].unsqueeze(1)  # (batch, 1)
            current_emissions = emissions[:, t]  # (batch, to)

            # alpha[to] = logsumexp_from(alpha[from] + transitions[from, to]) + emission[to]
            log_prob = alpha.unsqueeze(2) + trans_expanded + current_emissions.unsqueeze(1)
            new_alpha = torch.logsumexp(log_prob, dim=1)

            # Only advance rows whose position t is a real token.
            alpha = torch.where(mask_t, new_alpha, alpha)

        # Close every path with the transition into <STOP>.
        alpha = alpha + self.transitions[:, self.tag_to_ix["<STOP>"]].unsqueeze(0)
        return torch.logsumexp(alpha, dim=1)

    def viterbi_decode(self, emissions, mask):
        """Return the highest-scoring tag path for every sentence.

        Args:
            emissions: (batch_size, seq_len, tagset_size).
            mask: Boolean, (batch_size, seq_len); positions that are False are
                skipped both in the recursion and in the back-trace.

        Returns:
            List with one list of tag indices per sentence, one index per
            unmasked position.
        """
        batch_size, seq_len, _ = emissions.shape
        device = emissions.device

        # Running best scores and back-pointers.
        viterbi = torch.full((batch_size, self.tagset_size), self.NEG_SCORE, device=device)
        viterbi[:, self.tag_to_ix["<START>"]] = 0.0
        backpointers = torch.zeros((batch_size, seq_len, self.tagset_size), dtype=torch.long, device=device)

        for t in range(seq_len):
            mask_t = mask[:, t].unsqueeze(1)  # (batch_size, 1)

            # scores[b, from, to] for every possible step.
            scores = viterbi.unsqueeze(2) + self.transitions.unsqueeze(0)
            scores += emissions[:, t].unsqueeze(1)  # broadcast emission of the target tag

            # Best predecessor for each target tag.
            best_scores, best_tags = torch.max(scores, dim=1)

            # Only advance rows whose position t is a real token (same masking
            # as the forward algorithm).
            viterbi = torch.where(mask_t, best_scores, viterbi)
            backpointers[:, t] = best_tags

        # Close every path with the transition into <STOP>.
        scores = viterbi + self.transitions[:, self.tag_to_ix["<STOP>"]].unsqueeze(0)
        _, best_tags = torch.max(scores, dim=1)

        # Follow the back-pointers from the last real position to the first.
        best_paths = []
        for i in range(batch_size):
            path = [best_tags[i].item()]
            for t in reversed(range(seq_len)):
                if not mask[i, t]:
                    continue  # padded position: nothing was recorded for it
                path.append(backpointers[i, t, path[-1]].item())
            path.reverse()
            best_paths.append(path[1:])  # drop the predecessor of position 0

        return best_paths

In [4]:
def train_model(model, dataloader, optimizer, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module exposing ``neg_log_likelihood(sentences, tags, masks)``.
        dataloader: Iterable of batches whose first three items are
            ``(sentences, tags, masks)``. Extra trailing items (such as the
            ``lengths`` tensor a collate function may append) are ignored.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches processed,
        or ``0.0`` when the dataloader yields no batch.
    """
    model.train()
    total_loss = 0
    num_batches = 0

    for batch in dataloader:
        # Accept both 3-item and longer batches.
        sentences, tags, masks = batch[:3]
        sentences, tags, masks = sentences.to(device), tags.to(device), masks.to(device)

        optimizer.zero_grad()
        loss = model.neg_log_likelihood(sentences, tags, masks)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches avoids a ZeroDivisionError on an empty loader.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

In [5]:
def predict(model, sentences, masks, device):
    """Decode the best tag path for a batch with gradients disabled.

    Switches ``model`` to evaluation mode, moves ``sentences`` and ``masks`` to
    ``device``, computes the emissions by calling the model, and returns the
    result of ``model.viterbi_decode``.
    """
    model.eval()
    with torch.no_grad():
        batch_tokens = sentences.to(device)
        emissions = model(batch_tokens, masks.to(device))
        decoded = model.viterbi_decode(emissions, masks.to(device))
    return decoded

In [6]:
# Read the training set
def train_dataset(train_file):
    """Parse a column-formatted training file into sentences and tag sequences.

    Each non-blank line is split on whitespace; the first field is the token
    and the second field is its tag. Blank lines separate sentences. A final
    sentence that is not followed by a blank line is kept as well.

    Args:
        train_file: Path to a UTF-8 text file.

    Returns:
        Tuple ``(sentences, tags)`` of two parallel lists of lists of strings.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    train_sentences, train_tags = [], []
    cur_words, cur_tags = [], []

    with open(train_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                cur_words.append(parts[0])
                cur_tags.append(parts[1])
            elif cur_words:
                train_sentences.append(cur_words)
                train_tags.append(cur_tags)
                cur_words, cur_tags = [], []

    # Flush the last sentence when the file does not end with a blank line.
    if cur_words:
        train_sentences.append(cur_words)
        train_tags.append(cur_tags)

    return train_sentences, train_tags

In [7]:
# Sequence preparation helper
def prepare_sequence(seq, to_ix, is_tags=False):
    """Convert a sequence of strings into a ``torch.long`` index tensor.

    With ``is_tags=True`` every item must be a key of ``to_ix``. Otherwise
    items missing from ``to_ix`` are mapped to ``to_ix["<UNK>"]``.
    """
    indices = []
    for item in seq:
        if is_tags:
            indices.append(to_ix[item])
        else:
            indices.append(to_ix.get(item, to_ix["<UNK>"]))
    return torch.tensor(indices, dtype=torch.long)

# Dataset wrapper
class NERDataset(Dataset):
    """Dataset of (token-id tensor, tag-id tensor) pairs.

    All sentences and tag sequences are converted with ``prepare_sequence``
    once, at construction time.
    """

    def __init__(self, sentences, tags, vocab, tag_to_ix):
        encoded_sentences = []
        for sentence in sentences:
            encoded_sentences.append(prepare_sequence(sentence, vocab))

        encoded_tags = []
        for tag_seq in tags:
            encoded_tags.append(prepare_sequence(tag_seq, tag_to_ix, is_tags=True))

        self.sentences = encoded_sentences
        self.tags = encoded_tags

    def __len__(self):
        """Number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the ``(sentence, tags)`` pair stored at ``idx``."""
        sentence = self.sentences[idx]
        tag_seq = self.tags[idx]
        return sentence, tag_seq

In [8]:
def _write_tagged_sentence(words, fout, model, vocab, ix_to_tag, device):
    """Tag one sentence with ``model`` and write ``word tag`` lines plus a blank line."""
    word_ids = [vocab.get(word, vocab["<UNK>"]) for word in words]

    # Single-sentence batch; every position is a real token.
    input_seq = torch.tensor([word_ids], dtype=torch.long).to(device)
    input_mask = torch.ones_like(input_seq).bool().to(device)

    with torch.no_grad():
        emissions = model(input_seq, input_mask)
        predicted_tag_ids = model.viterbi_decode(emissions, input_mask)[0]

    predicted_tags = [ix_to_tag[tag_id] for tag_id in predicted_tag_ids]

    for word, tag in zip(words, predicted_tags):
        fout.write(f"{word} {tag}\n")
    fout.write("\n")


def process_validation_file(input_file, output_file, model, vocab, tag_to_ix, device="cpu"):
    """Tag every sentence of ``input_file`` and write the result to ``output_file``.

    The input is read line by line: the first whitespace-separated field of a
    non-blank line is the token, and blank lines separate sentences. For every
    sentence the output receives one ``token tag`` line per token followed by a
    blank line. A final sentence without a trailing blank line is tagged too.

    The model is put into evaluation mode while tagging and its previous
    train/eval mode is restored afterwards.

    Args:
        input_file: Path of the UTF-8 file to tag.
        output_file: Path of the UTF-8 file to write.
        model: Module that is callable as ``model(ids, mask)`` and exposes
            ``viterbi_decode(emissions, mask)``.
        vocab: Token-to-index mapping containing ``"<UNK>"``.
        tag_to_ix: Tag-to-index mapping; inverted to turn predictions into tags.
        device: Device the input tensors are created on.
    """
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}

    # Evaluation mode, restored on exit, so tagging is not run in train mode.
    was_training = model.training
    model.eval()
    try:
        with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
            current_words = []

            for line in fin:
                line = line.strip()
                if line:
                    current_words.append(line.split()[0])
                elif current_words:
                    _write_tagged_sentence(current_words, fout, model, vocab, ix_to_tag, device)
                    current_words = []

            # Flush the last sentence when the file does not end with a blank line.
            if current_words:
                _write_tagged_sentence(current_words, fout, model, vocab, ix_to_tag, device)
    finally:
        model.train(was_training)

# Chinese

In [9]:
train_sentences, train_tags = train_dataset("Chinese/train_cut.txt")

# Build the Chinese vocabulary: ids are assigned in order of first appearance,
# after the two reserved entries.
vocab = {"<PAD>": 0, "<UNK>": 1}
for word in (w for sent in train_sentences for w in sent):
    vocab.setdefault(word, len(vocab))

# Build the tag index the same way, after the three special tags.
tag_to_ix = {"<PAD>": 0, "<START>": 1, "<STOP>": 2}
for tag in (t for tag_seq in train_tags for t in tag_seq):
    tag_to_ix.setdefault(tag, len(tag_to_ix))


def collate_fn(batch):
    """Pad a list of ``(sentence, tags)`` tensor pairs into batch tensors.

    Returns ``(sentences_padded, tags_padded, masks, lengths)``. Sentences are
    padded with ``vocab["<PAD>"]`` and tags with ``tag_to_ix["<PAD>"]``;
    ``masks`` is True wherever the padded sentence differs from the pad id,
    and ``lengths`` holds the unpadded length of every sentence.
    """
    token_seqs, tag_seqs = zip(*batch)
    pad_token = vocab["<PAD>"]
    pad_tag = tag_to_ix["<PAD>"]

    lengths = torch.tensor(list(map(len, token_seqs)))
    sentences_padded = pad_sequence(token_seqs, batch_first=True, padding_value=pad_token)
    tags_padded = pad_sequence(tag_seqs, batch_first=True, padding_value=pad_tag)
    masks = sentences_padded != pad_token
    return sentences_padded, tags_padded, masks, lengths


# Wrap the encoded Chinese corpus and batch it two sentences at a time.
dataset = NERDataset(
    sentences=train_sentences,
    tags=train_tags,
    vocab=vocab,
    tag_to_ix=tag_to_ix,
)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)

In [10]:
# Chinese tagger: 2 encoder layers, 8 heads, 256-dimensional features.
model = TransformerCRF(
    vocab_size=len(vocab),
    tag_to_ix=tag_to_ix,
    d_model=256,
    nhead=8,
    num_layers=2,
)
model = model.to(device)

In [11]:
# Load the model parameters from file. map_location remaps the stored tensors
# onto the active device, so a checkpoint saved on GPU also loads when the
# notebook falls back to CPU.
model.load_state_dict(torch.load("crf_tf_Chinese.pth", map_location=device))

<All keys matched successfully>

In [None]:
# Training hyper-parameters
n_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training loop
model.train()  # make sure dropout is active even if the model was put in eval mode earlier
for epoch in range(n_epochs):
    total_loss = 0
    # Batch variables get their own names so the corpus lists `train_sentences`
    # / `train_tags` are not overwritten by tensors.
    for batch_sentences, batch_tags, batch_masks, _ in dataloader:
        batch_sentences = batch_sentences.to(device)
        batch_tags = batch_tags.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        loss = model.neg_log_likelihood(batch_sentences, batch_tags, batch_masks)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")

Epoch 1, Loss: 21.6030
Epoch 2, Loss: 10.6194
Epoch 3, Loss: 7.8497
Epoch 4, Loss: 6.4398
Epoch 5, Loss: 4.9064
Epoch 6, Loss: 3.9778
Epoch 7, Loss: 3.5068
Epoch 8, Loss: 3.2058
Epoch 9, Loss: 2.9666
Epoch 10, Loss: 2.8307
测试句子: ['张', '建', '民', '是', '上', '海', '银', '行', '的', '总', '裁', '和', '工', '程', '师']
预测标签: ['O', 'O', 'O', 'O', 'B-ORG', 'M-ORG', 'M-ORG', 'E-ORG', 'B-TITLE', 'M-TITLE', 'E-TITLE', 'O', 'B-TITLE', 'M-TITLE', 'E-TITLE']
张: O
建: O
民: O
是: O
上: B-ORG
海: M-ORG
银: M-ORG
行: E-ORG
的: B-TITLE
总: M-TITLE
裁: E-TITLE
和: O
工: B-TITLE
程: M-TITLE
师: E-TITLE


In [14]:
# Persist the Chinese model's parameters to disk.
torch.save(
    obj=model.state_dict(),
    f="crf_tf_Chinese.pth",
)

In [12]:
# Tag the Chinese test file with the trained model.
# To tag the validation split instead, use "Chinese/validation.txt" as the
# input and "Chinese/validation_CRF_TF.txt" as the output.
process_validation_file(
    input_file="Chinese/chinese_test.txt",
    output_file="Chinese/chinese_test_CRF_TF.txt",
    model=model,
    vocab=vocab,
    tag_to_ix=tag_to_ix,
    device=device,
)

# English

In [9]:
train_sentences, train_tags = train_dataset("English/train_cut.txt")

# Build the English vocabulary: ids are assigned in order of first appearance,
# after the two reserved entries.
vocab = {"<PAD>": 0, "<UNK>": 1}
for word in (w for sent in train_sentences for w in sent):
    vocab.setdefault(word, len(vocab))

# Build the tag index the same way, after the three special tags.
tag_to_ix = {"<PAD>": 0, "<START>": 1, "<STOP>": 2}
for tag in (t for tag_seq in train_tags for t in tag_seq):
    tag_to_ix.setdefault(tag, len(tag_to_ix))


def collate_fn(batch):
    """Pad a list of ``(sentence, tags)`` tensor pairs into batch tensors.

    Returns ``(sentences_padded, tags_padded, masks, lengths)``. Sentences are
    padded with ``vocab["<PAD>"]`` and tags with ``tag_to_ix["<PAD>"]``;
    ``masks`` is True wherever the padded sentence differs from the pad id,
    and ``lengths`` holds the unpadded length of every sentence.
    """
    token_seqs, tag_seqs = zip(*batch)
    pad_token = vocab["<PAD>"]
    pad_tag = tag_to_ix["<PAD>"]

    lengths = torch.tensor(list(map(len, token_seqs)))
    sentences_padded = pad_sequence(token_seqs, batch_first=True, padding_value=pad_token)
    tags_padded = pad_sequence(tag_seqs, batch_first=True, padding_value=pad_tag)
    masks = sentences_padded != pad_token
    return sentences_padded, tags_padded, masks, lengths


# Wrap the encoded English corpus and batch it two sentences at a time.
dataset = NERDataset(
    sentences=train_sentences,
    tags=train_tags,
    vocab=vocab,
    tag_to_ix=tag_to_ix,
)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)

In [10]:
# English tagger: 2 encoder layers, 8 heads, 256-dimensional features.
model = TransformerCRF(
    vocab_size=len(vocab),
    tag_to_ix=tag_to_ix,
    d_model=256,
    nhead=8,
    num_layers=2,
)
model = model.to(device)

In [None]:
# Load the model parameters from file. map_location remaps the stored tensors
# onto the active device, so a checkpoint saved on GPU also loads when the
# notebook falls back to CPU.
model.load_state_dict(torch.load("crf_tf_English.pth", map_location=device))

<All keys matched successfully>

In [None]:
# Training hyper-parameters
# NOTE(review): lr=0.01 is 100x the rate used for the Chinese model, and the
# recorded loss for this cell plateaus around 1e4 — confirm this is intended.
n_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
model.train()  # make sure dropout is active even if the model was put in eval mode earlier
for epoch in range(n_epochs):
    total_loss = 0
    # Batch variables get their own names so the corpus lists `train_sentences`
    # / `train_tags` are not overwritten by tensors.
    for batch_sentences, batch_tags, batch_masks, _ in dataloader:
        batch_sentences = batch_sentences.to(device)
        batch_tags = batch_tags.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        loss = model.neg_log_likelihood(batch_sentences, batch_tags, batch_masks)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")

Epoch 1, Loss: 14997.9312
Epoch 2, Loss: 12488.0388
Epoch 3, Loss: 10970.1028
Epoch 4, Loss: 10595.4564
Epoch 5, Loss: 10478.6881
Epoch 6, Loss: 10647.0677
Epoch 7, Loss: 10669.6169
Epoch 8, Loss: 10518.8772
Epoch 9, Loss: 10486.9362
Epoch 10, Loss: 10425.5425


In [12]:
# Persist the English model's parameters to disk.
torch.save(
    obj=model.state_dict(),
    f="crf_tf_English.pth",
)

In [14]:
# Tag the English test file with the trained model.
# To tag the validation split instead, use "English/validation.txt" as the
# input and "English/validation_CRF_TF.txt" as the output.
process_validation_file(
    input_file="English/english_test.txt",
    output_file="English/english_test_CRF_TF.txt",
    model=model,
    vocab=vocab,
    tag_to_ix=tag_to_ix,
    device=device,
)