## CS310 Natural Language Processing
## Assignment 4. Long Short Term Memory (LSTM) Network for Named Entity Recognition (NER)
## CRF

### 0. Import Necessary Libraries

In [49]:
from pprint import pprint
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
from utils import Indexer, read_ner_data_from_connl, load_embedding_dict

In [50]:
train_words, train_tags = read_ner_data_from_connl('data/train.txt')
dev_words, dev_tags = read_ner_data_from_connl('data/dev.txt')
test_words, test_tags = read_ner_data_from_connl('data/test.txt')

train_words = [word.lower() for word in train_words]
dev_words = [word.lower() for word in dev_words]
test_words = [word.lower() for word in test_words]

In [51]:
word_indexer = Indexer(train_words)
tag_indexer = Indexer(train_tags)

In [52]:
class NERDataset:
    def __init__(self, words, tags, word_indexer, tag_indexer):
        self.words = words
        self.tags = tags
        self.word_indexer = word_indexer
        self.tag_indexer = tag_indexer
    
    def __len__(self):
        return len(self.words)
    
    def __getitem__(self, idx):
        word_idx = self.word_indexer.element_to_index(self.words[idx])
        tag_idx = self.tag_indexer.element_to_index(self.tags[idx])
        mask=1
        return word_idx, tag_idx,mask

In [53]:
train_dataset = NERDataset(train_words, train_tags, word_indexer, tag_indexer)
dev_dataset = NERDataset(dev_words, dev_tags, word_indexer, tag_indexer)
test_dataset = NERDataset(test_words, test_tags, word_indexer, tag_indexer)

In [54]:
# 调整数据加载器以返回mask
def get_batch(dataset, batch_size):
    for i in range(0, len(dataset), batch_size):
        batch_words = dataset.words[i:i + batch_size]
        batch_tags = dataset.tags[i:i + batch_size]
        word_indices = [dataset.word_indexer.element_to_index(w) for w in batch_words]
        tag_indices = [dataset.tag_indexer.element_to_index(t) for t in batch_tags]
        masks = [1] * len(batch_words)  # 简化处理，实际需根据填充符生成
        yield (
            torch.tensor(word_indices, dtype=torch.long),
            torch.tensor(tag_indices, dtype=torch.long),
            torch.tensor(masks, dtype=torch.float)
)

In [55]:
batch_size = 128
train_loader = list(get_batch(train_dataset, batch_size))
dev_loader = list(get_batch(dev_dataset, batch_size))
test_loader = list(get_batch(test_dataset, batch_size))

In [56]:
glove_path = "data/glove.6B.100d.txt"
embeddings_index = load_embedding_dict(glove_path)
embedding_dim = 100
embedding_matrix = np.zeros((len(word_indexer), embedding_dim))
for word, idx in word_indexer.get_element_to_index_dict().items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

100%|██████████| 400000/400000 [00:02<00:00, 150813.74it/s]


### 1. Build the Model

In [57]:
class CRF(nn.Module):
    def __init__(self, num_tags):
        super().__init__()
        self.num_tags = num_tags
        # 转移矩阵：transition[i][j]表示从标签i转移到j的分数
        self.transition = nn.Parameter(torch.randn(num_tags, num_tags))
        self.transition.data[:, 0] = -10000  # 禁止转移到起始标签（假设标签0为填充符）
        self.transition.data[0, :] = -10000  # 禁止从起始标签转移出

    def forward(self, emissions, tags, mask=None):
        # emissions: (batch_size, seq_len, num_tags) 来自LSTM的输出
        # tags: (batch_size, seq_len) 真实标签
        # mask: (batch_size, seq_len) 表示有效位置（非填充）
        batch_size, seq_len, _ = emissions.shape
        
        # 计算正确路径的分数
        score = torch.gather(emissions, 2, tags.unsqueeze(-1)).squeeze(-1)  # 发射分数
        score += self.transition[tags[:, :-1], tags[:, 1:]]  # 转移分数
        
        # 计算所有路径的总分数（配分函数）
        alpha = emissions[:, 0]  # 初始状态
        for t in range(1, seq_len):
            alpha_t = alpha.unsqueeze(2) + self.transition.unsqueeze(0)  # (batch_size, num_tags, num_tags)
            alpha_t = torch.logsumexp(alpha_t, dim=1) + emissions[:, t]
            alpha = alpha_t * mask[:, t].unsqueeze(-1) + alpha * (1 - mask[:, t]).unsqueeze(-1)
        
        total_score = torch.logsumexp(alpha, dim=1)
        return (total_score - score.sum(dim=1)).mean()  # 负对数似然损失

    def decode(self, emissions, mask=None):
        # 使用Viterbi算法解码最优路径
        batch_size, seq_len, _ = emissions.shape
        viterbi = torch.zeros_like(emissions)
        backpointers = torch.zeros((batch_size, seq_len, self.num_tags), dtype=torch.long)
        
        # 初始化
        viterbi[:, 0] = emissions[:, 0]
        for t in range(1, seq_len):
            max_scores, indices = (viterbi[:, t-1].unsqueeze(-1) + self.transition).max(dim=1)
            viterbi[:, t] = max_scores + emissions[:, t]
            backpointers[:, t] = indices
        
        # 回溯最优路径
        best_path = []
        best_score, best_tag = viterbi[:, -1].max(dim=1)
        best_path.append(best_tag)
        for t in reversed(range(seq_len-1)):
            best_tag = backpointers[:, t+1].gather(1, best_tag.unsqueeze(-1)).squeeze(-1)
            best_path.append(best_tag)
        best_path = torch.stack(best_path[::-1], dim=1)
        return best_path

In [58]:
class BiLSTM_CRF_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim*2, tagset_size)
        self.crf = CRF(tagset_size)

    def forward(self, x, tags=None, mask=None):
        x = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(x)  # (batch_size, seq_len, hidden_dim*2)
        emissions = self.fc(lstm_out)  # (batch_size, seq_len, tagset_size)
        
        if tags is not None:
            loss = self.crf(emissions, tags, mask)
            return loss
        else:
            return self.crf.decode(emissions, mask)

# 初始化模型
vocab_size = len(word_indexer)
tagset_size = len(tag_indexer)
hidden_dim = 256
model = BiLSTM_CRF_NER(vocab_size, embedding_dim, hidden_dim, tagset_size, embedding_matrix)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

### 2. Train and Evaluate

In [59]:
from utils import get_tag_indices_from_scores
from metrics import MetricsHandler
import torch.optim as optim

labels_str = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
labels_int = list(range(len(labels_str)))
train_metrics = MetricsHandler(labels_int)
dev_metrics = MetricsHandler(labels_int)
test_metrics = MetricsHandler(labels_int)

# 定义损失函数和优化器
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [60]:
# 训练函数
def train_model(model, train_loader, dev_loader, optimizer, num_epochs=10, device=None):
    """
    训练模型并在每个 epoch 后评估开发集
    """
    model.train()
    dev_f1_scores = []  # 存储每个 epoch 的开发集 F1 分数
    labels_int = list(range(tagset_size))  # 标签的整数表示
    
    for epoch in range(num_epochs):
        total_loss = 0.0
        train_metrics = MetricsHandler(labels_int)  # 训练集指标
        
        batch_num=1
        for batch in train_loader:
            inputs, targets, masks = [b.to(device) for b in batch]
            optimizer.zero_grad()
            loss = model(inputs, targets, masks)  # 计算 CRF 损失
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
            # 解码预测结果并更新指标
            preds = model(inputs, mask=masks)
            train_metrics.update(preds.cpu().numpy().flatten(), targets.cpu().numpy().flatten())
            print(f"Batch: {batch_num}/{len(train_loader)} | Loss: {loss.item():.4f}")
            batch_num+=1
        
        # 计算训练集 F1 分数
        train_f1 = train_metrics.get_metrics()["F1-score"][-1]
        
        # 评估开发集
        dev_metrics = evaluate_model(model, dev_loader, device)
        dev_f1 = dev_metrics.get_metrics()["F1-score"][-1]
        dev_f1_scores.append(dev_f1)
        
        # 打印当前 epoch 的结果
        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {total_loss:.4f} | Train F1: {train_f1:.4f} | Dev F1: {dev_f1:.4f}")
    
    return model, dev_f1_scores

In [61]:
# 评估函数
def evaluate_model(model, data_loader, device):
    """
    评估模型并返回指标
    """
    model.eval()
    labels_int = list(range(tagset_size))
    metrics = MetricsHandler(labels_int)
    
    with torch.no_grad():
        for batch in data_loader:
            inputs, targets, masks = [b.to(device) for b in batch]
            preds = model(inputs, mask=masks)  # 使用维特比解码
            metrics.update(preds.cpu().numpy().flatten(), targets.cpu().numpy().flatten())
    
    return metrics

In [62]:
# 主程序部分
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

num_epochs = 10
print("开始训练...")

# 训练模型并获取开发集 F-1 分数
model, dev_f1_scores = train_model(
    model, train_loader, dev_loader, optimizer, num_epochs=num_epochs, device=device
)

# 保存模型参数
model_path = "bilstm_crf_model.pth"
torch.save(model.state_dict(), model_path)
print(f"模型参数已保存到 {model_path}")

# 打印前 5 个周期的开发集 F-1 分数
print("\n前5个周期的开发集F-1分数：")
for i, f1 in enumerate(dev_f1_scores[:5], 1):
    print(f"Epoch {i}: Dev F1 Score: {f1:.4f}")

# 在测试集上评估
print("\n开始测试...")
test_metrics = evaluate_model(model, test_loader, device)
test_f1 = test_metrics.get_metrics()["F1-score"][-1]
print(f"Test F1 Score: {test_f1:.4f}")

# 与第 3 步的贪心搜索结果比较（假设第 3 步的 F1 分数已知）
test_f1_greedy = 0.8245  # 请替换为你在第 3 步中实际得到的 F1 分数
print(f"\n性能比较：")
print(f"Test F1 Score (Greedy Search, Step 3): {test_f1_greedy:.4f}")
print(f"Test F1 Score (CRF with Viterbi): {test_f1:.4f}")
print(f"性能差异 (CRF - Greedy): {test_f1 - test_f1_greedy:.4f}")

开始训练...


ValueError: not enough values to unpack (expected 3, got 2)

### 3. Other Experiments