# Project 3 BiLSTM-based NER

## Install the package

In [6]:
!pip install -r requirements.txt



## Data

In [3]:
from torch.utils.data import Dataset


class CHisIECDataset(Dataset):
    label_label_id_mapping = {
        "O": 0,
        "B-PER": 1,
        "I-PER": 2,
        "E-PER": 3,
        "S-PER": 4,
        "B-LOC": 5,
        "I-LOC": 6,
        "E-LOC": 7,
        "S-LOC": 8,
        "B-OFI": 9,
        "I-OFI": 10,
        "E-OFI": 11,
        "S-OFI": 12,
        "B-BOOK": 13,
        "I-BOOK": 14,
        "E-BOOK": 15,
        "S-BOOK": 16,
    }

    def __init__(self, path) -> None:
        super().__init__()
        self.data = []
        with open(path, "r", encoding="utf-8") as f:
            d = [[], []]
            while line := f.readline():
                line = line.strip()
                if line:
                    word, label = line.split()
                    d[0].append(word)
                    d[1].append(self.label_label_id_mapping[label])
                elif d[0]:
                    self.data.append(tuple(d))
                    d = [[], []]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [4]:
train_set = CHisIECDataset("./CHisIEC/train.txt")
dev_set = CHisIECDataset("./CHisIEC/dev.txt")
test_set = CHisIECDataset("./CHisIEC/test.txt")

### 构建词汇表

通过遍历整个训练数据集，构建词汇表（`vocab`），将每个出现的字符分配一个唯一的整数索引，同时为 `<PAD>` 和 `<UNK>`（未知字符）保留特殊的索引。

- `<PAD>`：用于补齐序列长度不足的字符
- `<UNK>`：表示词汇表中未出现的字符
- 词汇表由训练集中的所有字符构成，测试集中出现但未在训练集中出现的字符会被映射为 <UNK>。

In [16]:
from collections import defaultdict


def build_vocab_from_dataset(dataset):
    '''
    @description: 从数据集中构建词汇表
    
    @param dataset: 数据集
    @return: 词汇表和反向查找表
    '''
    char_freq = defaultdict(int)

    for i in range(len(dataset)):
        sentence, _ = dataset[i]
        for char in sentence:
            char_freq[char] += 1
    
    # 构建词汇表
    vocab = {
        '<PAD>': 0,   # 填充符
        '<UNK>': 1    # 未出现的词
    }
    for char in char_freq:
        vocab[char] = len(vocab)  # 为每个字符分配唯一索引
    
    # 反向查找表
    index_to_char = {idx: char for char, idx in vocab.items()}
    
    return vocab, index_to_char

# 从训练集构建词汇表
vocab, index_to_char = build_vocab_from_dataset(train_set)

# 输出词汇表大小
print("词汇表大小:", len(vocab))
print("词汇表内容:", vocab)

词汇表大小: 3474
词汇表内容: {'<PAD>': 0, '<UNK>': 1, '子': 2, '璋': 3, '本': 4, '名': 5, '胡': 6, '麻': 7, '愈': 8, '，': 9, '多': 10, '勇': 11, '略': 12, '通': 13, '女': 14, '直': 15, '、': 16, '契': 17, '丹': 18, '汉': 19, '字': 20, '。': 21, '年': 22, '十': 23, '八': 24, '左': 25, '副': 26, '元': 27, '帅': 28, '撒': 29, '离': 30, '喝': 31, '引': 32, '在': 33, '麾': 34, '下': 35, '以': 36, '事': 37, '如': 38, '京': 39, '师': 40, '见': 41, '梁': 42, '王': 43, '宗': 44, '弼': 45, '与': 46, '语': 47, '悦': 48, '之': 49, '震': 50, '幼': 51, '兄': 52, '显': 53, '祖': 54, '代': 55, '二': 56, '四': 57, '从': 58, '征': 59, '交': 60, '趾': 61, '陷': 62, '没': 63, '袭': 64, '职': 65, '授': 66, '金': 67, '符': 68, '昭': 69, '信': 70, '校': 71, '尉': 72, '管': 73, '军': 74, '上': 75, '千': 76, '户': 77, '延': 78, '祐': 79, '覃': 80, '恩': 81, '加': 82, '武': 83, '将': 84, '寻': 85, '进': 86, '阶': 87, '德': 88, '中': 89, '统': 90, '苴': 91, '日': 92, '入': 93, '觐': 94, '世': 95, '复': 96, '赐': 97, '虎': 98, '诏': 99, '领': 100, '大': 101, '理': 102, '善': 103, '阐': 104, '威': 105, '楚': 106, '失': 107, '会

In [22]:
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import one_hot


def get_dataloader(dataset, shuffle=True):
    def collect_fn(batch):
        # t = batch[0][0]
        t = [vocab.get(char, vocab['<UNK>']) for char in batch[0][0]]  # 字符索引化，未出现的字符用'<UNK>'表示
        t = torch.tensor(t, dtype=torch.long) 
        l = one_hot(torch.tensor(batch[0][1], dtype=torch.int64), 17).float()
        return t, l

    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=1,
        collate_fn=collect_fn,
    )

train_loader = get_dataloader(train_set)
val_loader = get_dataloader(dev_set, shuffle=False)
test_loader = get_dataloader(test_set, shuffle=False)

## Model

### 1. 原模型

In [23]:
import torchtext
from torch import nn
from torchtext.vocab import Vectors
from torch.nn import LSTM

torchtext.disable_torchtext_deprecation_warning()


class MyAwesomeModel(nn.Module):

    def __init__(self, embed_dim=50, hidden_dim=50) -> None:
        super().__init__()
        self.vectors = Vectors(
            name="gigaword_chn.all.a2b.uni.ite50.vec",
            cache=".",
        )
        self.lstm = LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.classifier = nn.Linear(hidden_dim * 2, 17)

    def forward(self, x: str):
        x = self.vectors.get_vecs_by_tokens(x)
        x, _ = self.lstm(x.unsqueeze(0))
        x = self.classifier(x[0])
        return x

### 2. BiLSTM

- 嵌入层 (Embedding)：将字符索引映射为向量表示。
- BiLSTM 层：双向 LSTM 层用于捕捉字符序列的上下文信息。
- 全连接层 (Fully Connected Layer)：将 LSTM 输出映射为实体类别。

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim

class BiLSTMNER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size, padding_idx, unk_idx):
        super(BiLSTMNER, self).__init__()
        
        # Embedding层
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        
        # BiLSTM层
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        
        # 输出层
        self.fc = nn.Linear(hidden_dim * 2, label_size)  # 2 for bidirectional

    def forward(self, x):
        '''
        @param x: 输入的句子，形状为(batch_size, seq_len)

        @return logits: 模型的预测结果，形状为(batch_size, seq_len, label_size)
        '''
        # 检查输入形状
        # print("输入形状:", x.shape)

        # 通过Embedding层
        x = self.embedding(x)
        
        # 检查Embedding层后的形状
        # print("Embedding后形状:", x.shape)

        # 通过BiLSTM层
        lstm_out, _ = self.bilstm(x)
        
        # 检查LSTM输出形状
        # print("LSTM输出形状:", lstm_out.shape)

        # 通过全连接层得到预测
        logits = self.fc(lstm_out)
        return logits


# 模型参数
embedding_dim = 128         # 词嵌入维度
hidden_dim = 256            # BiLSTM隐藏层维度
vocab_size = len(vocab)     # 词汇表大小
label_size = len(CHisIECDataset.label_label_id_mapping)     # 标签种类数
padding_idx = vocab['<PAD>']    # 填充符索引
unk_idx = vocab['<UNK>']        # 未知符索引


## Training

In [29]:
from torch.optim import Adam
from sklearn.metrics import accuracy_score, f1_score

# model = MyAwesomeModel()      # 原模型
model = BiLSTMNER(vocab_size, embedding_dim, hidden_dim, label_size, padding_idx, unk_idx)
optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()


def train(loader):
    model.train()
    epoch_loss = []
    for x, y in loader:
        # print("输入形状: ", x.shape)
        # print("标签形状: ", y.shape)
        
        optimizer.zero_grad()
        pred = model(x)
        label = y
        try:
            loss = loss_fn(pred, label)
        except:
            print(pred.shape, label.shape)
        epoch_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    return {"loss": sum(epoch_loss) / len(epoch_loss)}


def eval(loader):
    model.eval()
    pred = []
    target = []
    for x, y in loader:
        _pred = model(x).argmax(-1)
        pred += _pred.tolist()
        _target = y.argmax(-1)
        target += _target.tolist()
    return {
        "accuracy": accuracy_score(target, pred),
        "f1_macro": f1_score(target, pred, average="macro"),
    }

In [30]:
from tqdm import trange

for epoch in trange(5, desc="Epoch"):
    metrics = train(train_loader)
    with torch.no_grad():
        metrics = {**eval(val_loader), **metrics}
    print(metrics)

Epoch:  20%|██        | 1/5 [01:50<07:22, 110.65s/it]

{'accuracy': 0.8983477809883678, 'f1_macro': 0.5499205866880839, 'loss': 0.5821468562133247}


Epoch:  40%|████      | 2/5 [03:44<05:38, 112.72s/it]

{'accuracy': 0.9115358968659703, 'f1_macro': 0.6362621439309182, 'loss': 0.23235830961003429}


Epoch:  60%|██████    | 3/5 [05:38<03:45, 112.94s/it]

{'accuracy': 0.922427206045788, 'f1_macro': 0.6566692502614248, 'loss': 0.10816675712286766}


Epoch:  80%|████████  | 4/5 [07:27<01:51, 111.57s/it]

{'accuracy': 0.9201303993480032, 'f1_macro': 0.6731631300760345, 'loss': 0.05387039655968974}


Epoch: 100%|██████████| 5/5 [09:22<00:00, 112.57s/it]

{'accuracy': 0.9199822182707268, 'f1_macro': 0.6697606041639133, 'loss': 0.03397334248934893}





## Evaluation

In [31]:
print(eval(test_loader))

{'accuracy': 0.9237534414193943, 'f1_macro': 0.6876196662034736}


## Conclusion

`BiLSTMNER` 模型的 F1 分数达到了 0.68，相比原来的 `MyAwesomeModel` 模型的 F1 分数 0.56 有了显著提升。性能提升的原因可能是：

- **上下文表示能力：** `BiLSTM` 模型能够有效捕捉输入序列中字符的前后文信息，特别是在中文中，字符的意义往往依赖于上下文。相比原有模型，BiLSTM 能够更好地理解字符之间的关系。
- **双向 LSTM：** `BiLSTM` 模型不仅从前向（过去到未来），还从后向（未来到过去）对序列进行处理，这有助于模型更加准确地分类实体，特别是对于需要依赖全局上下文的实体识别任务。
- **更复杂的架构：** 与原来的 `MyAwesomeModel` 相比，`BiLSTMNER` 具有更丰富的架构，能够更深入地捕捉序列模式，因此更适合处理像 NER 这样的任务，尤其是当任务依赖序列中字符的上下文时。