In [3]:
from pprint import pprint
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from utils import Indexer, read_ner_data_from_connl, load_embedding_dict
from metrics import MetricsHandler

In [4]:
# read_ner_data_from_connl is used below as if it returns two flat, parallel
# lists (words, tags); this helper regroups them into segments.
def group_into_sentences(words, tags):
    """Split parallel word/tag lists into segments at '-DOCSTART-' markers.

    Words and tags are paired positionally with ``zip`` (so the longer list is
    silently truncated to the shorter one). Every '-DOCSTART-' token closes
    the segment collected so far and is itself dropped together with its tag;
    consecutive markers never produce empty segments. Words are lower-cased,
    tags are kept unchanged.

    NOTE(review): the only boundary recognised here is '-DOCSTART-'. If the
    reader does not emit any other separator, each segment is a whole
    document rather than a single sentence -- confirm against the reader.

    Returns:
        (sentences, tag_sentences): two lists of lists with matching shapes.
    """
    sentences, tag_sentences = [], []
    pending = []  # (lower-cased word, tag) pairs of the segment being built

    def _flush():
        # Emit the pending segment, if any, and start a fresh one.
        if pending:
            sentences.append([token for token, _ in pending])
            tag_sentences.append([label for _, label in pending])
            pending.clear()

    for token, label in zip(words, tags):
        if token == '-DOCSTART-':
            _flush()
            continue
        pending.append((token.lower(), label))

    # The final segment has no trailing marker to close it.
    _flush()
    return sentences, tag_sentences

In [5]:
# Read the three CoNLL splits (train, dev, test -- in that order).
(train_words, train_tags), (dev_words, dev_tags), (test_words, test_tags) = map(
    read_ner_data_from_connl,
    ('data/train.txt', 'data/dev.txt', 'data/test.txt'),
)

# Regroup each flat split into per-segment word / tag lists.
train_sentences, train_tag_sentences = group_into_sentences(train_words, train_tags)
dev_sentences, dev_tag_sentences = group_into_sentences(dev_words, dev_tags)
test_sentences, test_tag_sentences = group_into_sentences(test_words, test_tags)

# Build the vocabularies from the training split only.
all_train_words = [token for segment in train_sentences for token in segment]
word_indexer = Indexer(all_train_words)
tag_indexer = Indexer([label for segment_tags in train_tag_sentences for label in segment_tags])

In [6]:
# Sentence-level dataset: converts words and tags to integer ids on access.
class NERDataset:
    """Indexable collection of (word ids, tag ids, mask) triples.

    ``sentences`` and ``tag_sentences`` are parallel lists of token lists.
    The two indexers only need to expose ``element_to_index(element)``.
    """

    def __init__(self, sentences, tag_sentences, word_indexer, tag_indexer):
        self.sentences, self.tag_sentences = sentences, tag_sentences
        self.word_indexer, self.tag_indexer = word_indexer, tag_indexer

    def __len__(self):
        """Number of sentences held by the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_indices, tag_indices, mask)`` for sentence ``idx``.

        All three are plain Python lists; ``mask`` holds one ``1`` per word.
        """
        tokens, labels = self.sentences[idx], self.tag_sentences[idx]
        word_indices = list(map(self.word_indexer.element_to_index, tokens))
        tag_indices = list(map(self.tag_indexer.element_to_index, labels))
        return word_indices, tag_indices, [1] * len(tokens)

# Batch generator: yields right-padded tensors for consecutive dataset slices.
def get_batch(dataset, batch_size):
    """Yield ``(word_indices, tag_indices, masks)`` tensors batch by batch.

    ``dataset`` must support ``len()`` and integer indexing, each item being a
    ``(word_ids, tag_ids, mask)`` triple of Python lists. Items are taken in
    order, ``batch_size`` at a time (the last batch may be smaller). Within a
    batch every list is right-padded with ``0`` up to the longest word
    sequence. Words and tags become ``torch.long`` tensors, masks become
    ``torch.float``.
    """
    def _pad(seq, width):
        # Append zeros so that ``seq`` reaches ``width`` entries.
        return seq + [0] * (width - len(seq))

    for start in range(0, len(dataset), batch_size):
        stop = min(start + batch_size, len(dataset))
        words, tags, masks = zip(*(dataset[k] for k in range(start, stop)))

        width = max(map(len, words))
        yield (
            torch.tensor([_pad(seq, width) for seq in words], dtype=torch.long),
            torch.tensor([_pad(seq, width) for seq in tags], dtype=torch.long),
            torch.tensor([_pad(seq, width) for seq in masks], dtype=torch.float),
        )

In [7]:
# Wrap every split in a dataset and materialise its padded batches up front.
batch_size = 128
train_dataset = NERDataset(train_sentences, train_tag_sentences, word_indexer, tag_indexer)
dev_dataset = NERDataset(dev_sentences, dev_tag_sentences, word_indexer, tag_indexer)
test_dataset = NERDataset(test_sentences, test_tag_sentences, word_indexer, tag_indexer)

train_loader = [*get_batch(train_dataset, batch_size)]
dev_loader = [*get_batch(dev_dataset, batch_size)]
test_loader = [*get_batch(test_dataset, batch_size)]

# Build the embedding matrix from GloVe vectors; rows of words that have no
# GloVe entry stay all-zero.
glove_path = "data/glove.6B.100d.txt"
embeddings_index = load_embedding_dict(glove_path)
embedding_dim = 100
embedding_matrix = np.zeros((len(word_indexer), embedding_dim))
for word, idx in word_indexer.get_element_to_index_dict().items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[idx] = embedding_vector

100%|██████████| 400000/400000 [00:02<00:00, 149431.03it/s]


In [8]:
class CRF(nn.Module):
    """Linear-chain CRF layer on top of per-token emission scores.

    ``transition[i, j]`` is the score of moving from tag ``i`` at step ``t-1``
    to tag ``j`` at step ``t``. The same orientation is used by the gold-path
    score, the partition function and Viterbi decoding.

    Masks are expected to be right-padded: for every sequence the valid
    positions form a prefix and position 0 is always valid. Padded steps
    contribute nothing to the loss, so no explicit penalty on transitions
    into or out of a padding tag is needed.
    """

    def __init__(self, num_tags):
        super().__init__()
        self.num_tags = num_tags
        self.transition = nn.Parameter(torch.randn(num_tags, num_tags))

    @staticmethod
    def _valid_positions(emissions, mask):
        """Return a bool ``(batch, seq_len)`` tensor marking real tokens."""
        batch_size, seq_len, _ = emissions.shape
        if mask is None:
            return torch.ones(batch_size, seq_len, dtype=torch.bool, device=emissions.device)
        return mask.to(emissions.device).bool()

    def forward(self, emissions, tags, mask=None):
        """Mean negative log-likelihood of ``tags`` under the CRF.

        Args:
            emissions: ``(batch, seq_len, num_tags)`` float scores.
            tags: ``(batch, seq_len)`` long tensor of gold tag ids.
            mask: optional ``(batch, seq_len)`` tensor, non-zero on real
                tokens. ``None`` means every position is real.

        Returns:
            Scalar tensor: ``mean(log Z - score(gold path))`` over the batch.
        """
        batch_size, seq_len, _ = emissions.shape
        valid = self._valid_positions(emissions, mask)
        valid_f = valid.to(emissions.dtype)
        batch_idx = torch.arange(batch_size, device=emissions.device)

        # Gold-path score. Padded steps are multiplied by 0 so that the
        # numerator covers exactly the same positions as the partition below.
        score = emissions[batch_idx, 0, tags[:, 0]]
        for t in range(1, seq_len):
            step = self.transition[tags[:, t - 1], tags[:, t]] + emissions[batch_idx, t, tags[:, t]]
            score = score + step * valid_f[:, t]

        # Partition function (forward algorithm) in log space.
        alpha = emissions[:, 0]  # (batch, num_tags)
        for t in range(1, seq_len):
            # scores[b, i, j] = alpha[b, i] + transition[i, j]  (i = previous tag)
            scores = alpha.unsqueeze(2) + self.transition.unsqueeze(0)
            alpha_t = torch.logsumexp(scores, dim=1) + emissions[:, t]
            # Padded steps keep the previous alpha unchanged.
            alpha = torch.where(valid[:, t].unsqueeze(1), alpha_t, alpha)

        total_score = torch.logsumexp(alpha, dim=1)  # (batch,)
        return (total_score - score).mean()

    def decode(self, emissions, mask=None):
        """Viterbi-decode the best tag sequence for every batch element.

        Args:
            emissions: ``(batch, seq_len, num_tags)`` float scores.
            mask: optional ``(batch, seq_len)`` tensor, non-zero on real
                tokens. ``None`` means every position is real.

        Returns:
            ``(batch, seq_len)`` long tensor of tag ids. Padded positions are
            set to 0, the value ``get_batch`` uses to pad the tag targets.
        """
        batch_size, seq_len, num_tags = emissions.shape
        device = emissions.device
        valid = self._valid_positions(emissions, mask)
        batch_idx = torch.arange(batch_size, device=device)
        # Backpointer used on padded steps: every tag points to itself, so the
        # backtrace walks through padding without changing the chosen tag.
        identity = torch.arange(num_tags, device=device).unsqueeze(0).expand(batch_size, num_tags)

        viterbi = emissions[:, 0, :]  # best score of any path ending in each tag
        backpointers = torch.zeros(batch_size, seq_len, num_tags, dtype=torch.long, device=device)
        for t in range(1, seq_len):
            # candidates[b, i, j] = viterbi[b, i] + transition[i, j]
            candidates = viterbi.unsqueeze(2) + self.transition.unsqueeze(0)
            best_prev_score, best_prev = candidates.max(dim=1)
            step_valid = valid[:, t].unsqueeze(1)
            viterbi = torch.where(step_valid, best_prev_score + emissions[:, t, :], viterbi)
            backpointers[:, t, :] = torch.where(step_valid, best_prev, identity)

        best_tag = viterbi.argmax(dim=1)  # (batch,)
        path = [best_tag]
        for t in range(seq_len - 1, 0, -1):
            # Index per batch element; slicing the batch dimension here would
            # return a (batch, batch) tensor instead of (batch,).
            best_tag = backpointers[batch_idx, t, best_tag]
            path.append(best_tag)
        path.reverse()
        best_path = torch.stack(path, dim=1)  # (batch, seq_len)
        return best_path * valid.long()

# BiLSTM + CRF tagger
class BiLSTM_CRF_NER(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> linear emissions -> CRF.

    Args:
        vocab_size: kept for interface compatibility; unused, because the
            embedding table takes its size from ``embedding_matrix``.
        embedding_dim: width of the embedding vectors (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        tagset_size: number of tags scored by the linear layer and the CRF.
        embedding_matrix: array-like of pretrained vectors, fine-tuned during
            training (``freeze=False``).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)
        self.crf = CRF(tagset_size)

    def _encode(self, x, mask):
        """Run the embedding + BiLSTM and return ``(batch, seq_len, 2*hidden)``.

        When ``mask`` is given the batch is packed, so the LSTM never reads
        padding. Without packing, the backward direction starts on the pad
        tokens and its outputs at real tokens depend on the amount of padding.
        ``mask`` must be right-padded and every row must contain at least one
        valid position (``pack_padded_sequence`` rejects zero lengths).
        """
        embedded = self.embedding(x)
        if mask is None:
            lstm_out, _ = self.lstm(embedded)
            return lstm_out

        # pack_padded_sequence requires the lengths on the CPU.
        lengths = mask.sum(dim=1).long().cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        # total_length restores the original padded width so that the
        # emissions line up with ``tags`` and ``mask``.
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=x.size(1)
        )
        return lstm_out

    def forward(self, x, tags=None, mask=None):
        """Return the CRF loss when ``tags`` is given, else the decoded tags.

        Args:
            x: ``(batch, seq_len)`` long tensor of word ids.
            tags: optional ``(batch, seq_len)`` long tensor of gold tag ids.
            mask: optional ``(batch, seq_len)`` tensor, non-zero on real tokens.
        """
        emissions = self.fc(self._encode(x, mask))
        if tags is not None:
            return self.crf(emissions, tags, mask)
        return self.crf.decode(emissions, mask)

In [9]:
# Instantiate the tagger on the available device and set up its optimizer.
vocab_size, tagset_size = len(word_indexer), len(tag_indexer)
hidden_dim = 256
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BiLSTM_CRF_NER(vocab_size, embedding_dim, hidden_dim, tagset_size, embedding_matrix).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
# Training loop
def train_model(model, train_loader, dev_loader, optimizer, num_epochs=10, device=None):
    """Train ``model`` and evaluate it on the dev set after every epoch.

    Args:
        model: module called as ``model(inputs, targets, masks)`` for the loss
            and ``model(inputs, mask=masks)`` for decoded tag ids.
        train_loader: sized sequence of ``(inputs, targets, masks)`` tensor
            batches (``len()`` is used for the progress message).
        dev_loader: iterable of batches passed to ``evaluate_model``.
        optimizer: optimizer over the model parameters.
        num_epochs: number of passes over ``train_loader``.
        device: target device for the batches; defaults to the device of the
            model's first parameter.

    Returns:
        ``(model, dev_f1_scores)`` with one dev F1 value per epoch.

    Reads the notebook-level ``tagset_size`` and calls ``evaluate_model``,
    so both must be defined before this function is called.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    dev_f1_scores = []
    labels_int = list(range(tagset_size))

    for epoch in range(num_epochs):
        # evaluate_model() leaves the model in eval mode, so training mode
        # has to be restored at the start of every epoch.
        model.train()
        total_loss = 0.0
        train_metrics = MetricsHandler(labels_int)

        for batch_num, batch in enumerate(train_loader, 1):
            inputs, targets, masks = [b.to(device) for b in batch]
            optimizer.zero_grad()
            loss = model(inputs, targets, masks)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Decoding is only needed for the metrics: skip autograd tracking.
            with torch.no_grad():
                preds = model(inputs, mask=masks)
            # Score real tokens only; padded positions are not predictions.
            # NOTE(review): a label that only ever occurs at padded positions
            # now has zero support -- confirm MetricsHandler tolerates that.
            valid = masks.bool()
            train_metrics.update(preds[valid].cpu().numpy(), targets[valid].cpu().numpy())

            print(f"Batch: {batch_num}/{len(train_loader)} | Loss: {loss.item():.4f}")

        train_f1 = train_metrics.get_metrics()["F1-score"][-1]
        dev_metrics = evaluate_model(model, dev_loader, device)
        dev_f1 = dev_metrics.get_metrics()["F1-score"][-1]
        dev_f1_scores.append(dev_f1)

        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {total_loss:.4f} | Train F1: {train_f1:.4f} | Dev F1: {dev_f1:.4f}")

    return model, dev_f1_scores

In [None]:
# Evaluation
def evaluate_model(model, data_loader, device):
    """Decode every batch of ``data_loader`` and accumulate tagging metrics.

    Args:
        model: module called as ``model(inputs, mask=masks)``; must return a
            ``(batch, seq_len)`` tensor of predicted tag ids.
        data_loader: iterable of ``(inputs, targets, masks)`` tensor batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        The ``MetricsHandler`` updated with all predictions.

    Switches the model to eval mode and leaves it there. Reads the
    notebook-level ``tagset_size`` for the label set.
    """
    model.eval()
    labels_int = list(range(tagset_size))
    metrics = MetricsHandler(labels_int)

    with torch.no_grad():
        for batch in data_loader:
            inputs, targets, masks = [b.to(device) for b in batch]
            preds = model(inputs, mask=masks)
            # Score real tokens only; padded positions are not predictions.
            # NOTE(review): a label that only ever occurs at padded positions
            # now has zero support -- confirm MetricsHandler tolerates that.
            valid = masks.bool()
            metrics.update(preds[valid].cpu().numpy(), targets[valid].cpu().numpy())

    return metrics

: 

In [None]:
# Main run: train, save the weights, report dev/test F1 and compare the
# result with the greedy-search baseline.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

num_epochs = 10
print("开始训练...")

model, dev_f1_scores = train_model(model, train_loader, dev_loader, optimizer, num_epochs=num_epochs, device=device)

model_path = "bilstm_crf_model.pth"
torch.save(model.state_dict(), model_path)
print(f"模型参数已保存到 {model_path}")

# Dev F1 of the first five epochs (fewer if training was shorter).
print("\n前5个周期的开发集F-1分数：")
for i, f1 in zip(range(1, 6), dev_f1_scores):
    print(f"Epoch {i}: Dev F1 Score: {f1:.4f}")

print("\n开始测试...")
test_metrics = evaluate_model(model, test_loader, device)
test_f1 = test_metrics.get_metrics()["F1-score"][-1]
print(f"Test F1 Score: {test_f1:.4f}")

# Placeholder: replace with the F1 actually measured for greedy search (step 3).
test_f1_greedy = 0.8245
print("\n性能比较：")
print(f"Test F1 Score (Greedy Search, Step 3): {test_f1_greedy:.4f}")
print(f"Test F1 Score (CRF with Viterbi): {test_f1:.4f}")
print(f"性能差异 (CRF - Greedy): {test_f1 - test_f1_greedy:.4f}")

开始训练...
