# 序列标注任务
- 目标： 为文本的每一个token分配一个标签。
- 常见的序列标注任务有命名实体识别 NER (Named Entity Recognition) 和词性标注 POS (Part-Of-Speech tagging)。

```markdown
我们选择 1998 年人民日报语料库作为数据集，该语料库标注了大量的语言学信息，可以同时用于分词、NER 等任务。这里我们直接使用处理好的 NER 语料 china-people-daily-ner-corpus.tar.gz。
```

## readme for People's Daily(人民日报) dataset
### Task
Named Entity Recognition
### Description
**Tags**: LOC(地名), ORG(机构名), PER(人名)   
**Tag Strategy**：BIO  
**Split**: '*space*' (北 B-LOC)  
**Data Size**:  
Train data set ( [example.train](example.train) ):  

|句数|字符数|LOC数|ORG数|PER数|
|:-:|:-:|:-:|:-:|:-:|
|20864|979180|16571|9277|8144|

Dev data set ( [example.dev](example.dev) ):  

|句数|字符数|LOC数|ORG数|PER数|
|:-:|:-:|:-:|:-:|:-:|
|2318|109870|1951|984|884|

Test data set ( [example.test](example.test) )

|句数|字符数|LOC数|ORG数|PER数|
|:-:|:-:|:-:|:-:|:-:|
|4636|219197|3658|2185|1864|

**Reference**:   
<https://github.com/zjy-ucas/ChineseNER>

## 加载数据

In [1]:
from torch.utils.data import Dataset

categories = set()

class PeopleDaily(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        Data = {}
        with open (data_file, "rt", encoding="utf-8") as f:
            # 文本使用空行进行分割句子
            for idx, line in enumerate(f.read().split("\n\n")):
                if not line:
                    break
                sentence, labels = "", []
                for i, item in enumerate(line.split("\n")):
                    char, tag = item.split(" ")
                    sentence += char
                    if tag.startswith("B"):
                        labels.append([i, i, char, tag[2:]])   # Remove the B- or I-
                        categories.add(tag[2:])
                    elif tag.startswith("I"):
                        labels[-1][1] = i
                        labels[-1][2] += char
                Data[idx] = {
                    "sentence" : sentence,
                    "labels" : labels
                }
        return Data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]
    

```markdown
中 B-LOC
国 I-LOC
人 O
民 O
银 B-ORG
行 I-ORG```

- 对应的输出为

```python
{
    'sentence': '中国人民银行',
    'labels': [
        [0, 1, "中国", "LOC"],    # 位置0-1的"中国"是地点
        [4, 5, "银行", "ORG"]     # 位置4-5的"银行"是组织机构
    ]
}```

In [2]:
train_data = PeopleDaily('dataset/PeopleDaily/example.train')
valid_data = PeopleDaily('dataset/PeopleDaily/example.dev')
test_data = PeopleDaily('dataset/PeopleDaily/example.test')

print(train_data[0], "\n", train_data[1], "\n", categories)

{'sentence': '海钓比赛地点在厦门与金门之间的海域。', 'labels': [[7, 8, '厦门', 'LOC'], [10, 11, '金门', 'LOC']]} 
 {'sentence': '这座依山傍水的博物馆由国内一流的设计师主持设计，整个建筑群精美而恢宏。', 'labels': []} 
 {'PER', 'LOC', 'ORG'}


## 数据预处理

很容易的我们建立以下的标签mapping

In [3]:
id2label = {0 : "O"}
for l in list(sorted(categories)):
    id2label[len(id2label)] = f"B-{l}"   # 使用当前字典长度作为新索引
    id2label[len(id2label)] = f"I-{l}"   # 长度已增加，所以这是下一个索引
label2id = {v : k for k,v in id2label.items()} # kv互换

print(id2label)
print(label2id)

{0: 'O', 1: 'B-LOC', 2: 'I-LOC', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-PER', 6: 'I-PER'}
{'O': 0, 'B-LOC': 1, 'I-LOC': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-PER': 5, 'I-PER': 6}


In [4]:
# 示例，把尸体标签转化为实体编号
from transformers import AutoTokenizer
import numpy as np

checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = '海钓比赛地点在厦门与金门之间的海域。'
labels = [[7, 8, '厦门', 'LOC'], [10, 11, '金门', 'LOC']]

encoding = tokenizer(sentence, truncation=True)
tokens = encoding.tokens()
label = np.zeros(len(tokens), dtype=int)

for char_start, char_end, word, tag in labels:

    token_start = encoding.char_to_token(char_start)
    token_end = encoding.char_to_token(char_end)

    label[token_start] = label2id[f"B-{tag}"]
    label[token_start +1 : token_end +1] = label2id[f"I-{tag}"]


print(tokens)
print(label)
print([id2label[id] for id in label])


  from .autonotebook import tqdm as notebook_tqdm


['[CLS]', '海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。', '[SEP]']
[0 0 0 0 0 0 0 0 1 2 0 1 2 0 0 0 0 0 0 0]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [5]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import numpy as np

checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    batch_sentence, batch_tag = [], []
    for sample in batch_samples:
        batch_sentence.append(sample["sentence"])
        batch_tag.append(sample["labels"])
    batch_inputs = tokenizer(
        batch_sentence,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    batch_label = np.zeros(batch_inputs["input_ids"].shape, dtype=int)
    for s_idx, sentence in enumerate(batch_sentence):
        encoding = tokenizer(sentence, truncation=True)

        # 特殊标签[CLS] [SEP] ... 屏蔽
        batch_label[s_idx][0] = -100
        batch_label[s_idx][len(encoding.tokens())-1:] = -100

        for char_start, char_end, _, tag in batch_tag[s_idx]:

            token_start = encoding.char_to_token(char_start)
            token_end = encoding.char_to_token(char_end)

            batch_label[s_idx][token_start] = label2id[f"B-{tag}"]
            batch_label[s_idx][token_start +1 : token_end +1] = label2id[f"I-{tag}"]
    return batch_inputs, torch.tensor(batch_label)

In [6]:
train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=8, shuffle=False, collate_fn=collote_fn)
test_dataloader = DataLoader(test_data, batch_size=8, shuffle=False, collate_fn=collote_fn)


In [7]:
batch_X, batch_y = next(iter(train_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([8, 56]), 'token_type_ids': torch.Size([8, 56]), 'attention_mask': torch.Size([8, 56])}
batch_y shape: torch.Size([8, 56])
{'input_ids': tensor([[ 101,  754, 3221, 8024, 1072, 3300, 1126, 1282, 2399,  837, 5320, 4638,
         1921, 3823, 6639, 4413, 1762, 1059, 3173, 6225, 2573,  722,  678, 6672,
          677, 3173, 4638, 6629, 6651, 5296,  511,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 1745,  711, 1762, 6421, 1905, 1374, 2458, 4638, 2876, 2360, 3119,
         2530, 5041, 5276,  811, 2466,  677, 8024, 2530, 2475,  812, 5314, 2360,
          987, 2847,  677, 5273, 2506, 8024,  809, 6134, 2697, 6468,  722, 2658,
          511,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 1744, 7354, 6639, 5468, 3173,  712, 2375, 23

## 模型训练

In [8]:
from torch import nn
from transformers import AutoConfig
from transformers import BertPreTrainedModel, BertModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

class BertForNER(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(768, len(id2label))
        self.post_init()

    def forward(self, x):
        bert_output = self.bert(**x)
        sequence_output = bert_output.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        return logits

config = AutoConfig.from_pretrained(checkpoint)
model = BertForNER.from_pretrained(checkpoint, config=config).to(device)
print(model)

Using cuda device


Some weights of BertForNER were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForNER(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [9]:
outputs = model(batch_X.to(device))
print(outputs.shape)

torch.Size([8, 56, 7])


优化模型参数：
- 对于高维输出，交叉熵损失需要维度对齐 
- (batch, seq_len, label_num) $\rightarrow$ (batch, label_num, seq_len) 

In [10]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_batch_num = (epoch-1) * len(dataloader)
    
    model.train()
    for batch, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred.permute(0, 2, 1), y) # 交换维度

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
        progress_bar.update(1)
    return total_loss

验证/测试循环负责评估模型的性能。这里我们借助 seqeval 库进行评估，seqeval 是一个专门用于序列标注评估的 Python 库，支持 IOB、IOB、IOBES 等多种标注格式以及多种评估策略，例如：

In [11]:
import seqeval
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

y_true = [['O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'O'], ['B-PER', 'I-PER', 'O']]
y_pred = [['O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'O'], ['B-PER', 'I-PER', 'O']]

print(classification_report(y_true, y_pred,  mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

         LOC       0.50      0.50      0.50         2
         PER       1.00      1.00      1.00         1

   micro avg       0.67      0.67      0.67         3
   macro avg       0.75      0.75      0.75         3
weighted avg       0.67      0.67      0.67         3



这里几个具体指标的含义：
1. precision（精确率）
   - 定义：
     $$
     \text{Precision} = \frac{TP}{TP + FP}
     $$
   - TP（True Positive）：正确预测为该类的数量。
   - FP（False Positive）：错误预测为该类的数量。
   - 含义：在所有被预测为某个类别的实体中，有多少是真实正确的。

2. recall（召回率）
   - 定义：
     $$
     \text{Recall} = \frac{TP}{TP + FN}
     $$
   - FN（False Negative）：真实属于该类，但预测成别的类的数量。
   - 含义：在所有真实存在的某类实体中，模型能识别出来多少。
   - 作用：衡量模型覆盖真实标签的能力。高召回率意味着漏报少（例如，大部分真实人名都被模型识别出来）。

3. f1-score（F1 值）
   - 定义：精确率和召回率的调和平均：
     $$
     \text{F1} = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
     $$
   - 含义：平衡考虑了“少报”和“多报”两方面。
   - 作用：当 precision 和 recall 需要综合考虑时，F1 是一个更公平的评价指标。对于 NER 这种要求同时“发现实体”又“分类正确”的任务，F1 是最常用指标。

4. support
   - 定义：数据集中该类别真实样本的数量。

可以看到，对于第一个地点实体，模型虽然预测正确了其中 2 个 token 的标签，但是仍然判为识别错误，只有当预测的**起始和结束位置都正确时才算识别正确**。

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

def test_loop(dataloader, model):
    true_labels, true_predictions = [], []

    model.eval()
    with torch.no_grad():
        for X, y in tqdm(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            predictions = pred.argmax(dim=-1).cpu().numpy().tolist()
            labels = y.cpu().numpy().tolist()
            true_labels += [[id2label[int(l)] for l in label if l != -100] for label in labels]
            true_predictions += [
                [id2label[int(p)] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
    print(classification_report(true_labels, true_predictions, mode='strict', scheme=IOB2))

In [19]:
type(train_data)

__main__.PeopleDaily

In [20]:
from torch.optim import AdamW
from transformers import get_scheduler

lr = 1e-5
epoch = 3
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch*len(train_dataloader)
)

total_loss = 0
for t in range(epoch):
    print(f"Epoch {t+1}/{epoch}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    test_loop(valid_dataloader, model)
print("Done!")


Epoch 1/3
-------------------------------


loss: 0.000000:   0%|                                                                         | 0/2608 [00:00<?, ?it/s]

loss: 0.059594: 100%|██████████████████████████████████████████████████████████████| 2608/2608 [05:49<00:00,  7.46it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [00:11<00:00, 25.22it/s]


              precision    recall  f1-score   support

         LOC       0.94      0.95      0.94      1951
         ORG       0.89      0.90      0.89       984
         PER       0.97      0.98      0.97       884

   micro avg       0.94      0.94      0.94      3819
   macro avg       0.93      0.94      0.94      3819
weighted avg       0.94      0.94      0.94      3819

Epoch 2/3
-------------------------------


loss: 0.038600: 100%|██████████████████████████████████████████████████████████████| 2608/2608 [05:44<00:00,  7.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [00:11<00:00, 25.40it/s]


              precision    recall  f1-score   support

         LOC       0.97      0.96      0.96      1951
         ORG       0.92      0.91      0.92       984
         PER       0.98      0.98      0.98       884

   micro avg       0.96      0.95      0.96      3819
   macro avg       0.96      0.95      0.95      3819
weighted avg       0.96      0.95      0.96      3819

Epoch 3/3
-------------------------------


loss: 0.028850: 100%|██████████████████████████████████████████████████████████████| 2608/2608 [05:53<00:00,  7.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [00:14<00:00, 19.74it/s]


              precision    recall  f1-score   support

         LOC       0.97      0.96      0.97      1951
         ORG       0.93      0.92      0.92       984
         PER       0.99      0.98      0.98       884

   micro avg       0.96      0.96      0.96      3819
   macro avg       0.96      0.95      0.96      3819
weighted avg       0.96      0.96      0.96      3819

Done!


## 保存模型
在实际应用中，我们会根据每一轮模型在验证集上的性能来调整超参数以及选出最好的权重，最后将选出的模型应用于测试集以评估最终的性能。因此，我们首先在上面的验证/测试循环中返回 seqeval 库计算出的指标，然后在每一个 Epoch 中根据 macro-F1/micro-F1 指标保存在验证集上最好的模型：

In [21]:
def test_loop(dataloader, model):
    true_labels, true_predictions = [], []

    model.eval()
    with torch.no_grad():
        for X, y in tqdm(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            predictions = pred.argmax(dim=-1).cpu().numpy().tolist()
            labels = y.cpu().numpy().tolist()
            true_labels += [[id2label[int(l)] for l in label if l != -100] for label in labels]
            true_predictions += [
                [id2label[int(p)] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
    print(classification_report(true_labels, true_predictions, mode='strict', scheme=IOB2))
    return classification_report(
      true_labels, 
      true_predictions, 
      mode='strict', 
      scheme=IOB2, 
      output_dict=True
    )

total_loss = 0.
best_f1 = 0.
for t in range(epoch):
    print(f"Epoch {t+1}/{epoch}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    metrics = test_loop(valid_dataloader, model)
    valid_macro_f1, valid_micro_f1 = metrics['macro avg']['f1-score'], metrics['micro avg']['f1-score']
    valid_f1 = metrics['weighted avg']['f1-score']
    if valid_f1 > best_f1:
        best_f1 = valid_f1
        print('saving new weights...\n')
        torch.save(
            model.state_dict(), 
            f'epoch_{t+1}_valid_macrof1_{(100*valid_macro_f1):0.3f}_microf1_{(100*valid_micro_f1):0.3f}_weights.bin'
        )
print("Done!")

Epoch 1/3
-------------------------------


loss: 0.006918: 100%|██████████████████████████████████████████████████████████████| 2608/2608 [06:13<00:00,  6.98it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [00:12<00:00, 23.97it/s]


              precision    recall  f1-score   support

         LOC       0.97      0.96      0.97      1951
         ORG       0.93      0.92      0.92       984
         PER       0.99      0.98      0.98       884

   micro avg       0.96      0.96      0.96      3819
   macro avg       0.96      0.95      0.96      3819
weighted avg       0.96      0.96      0.96      3819

saving new weights...

Epoch 2/3
-------------------------------


loss: 0.006885: 100%|██████████████████████████████████████████████████████████████| 2608/2608 [06:22<00:00,  6.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [00:12<00:00, 23.35it/s]


              precision    recall  f1-score   support

         LOC       0.97      0.96      0.97      1951
         ORG       0.93      0.92      0.92       984
         PER       0.99      0.98      0.98       884

   micro avg       0.96      0.96      0.96      3819
   macro avg       0.96      0.95      0.96      3819
weighted avg       0.96      0.96      0.96      3819

Epoch 3/3
-------------------------------


loss: 0.006930: 100%|██████████████████████████████████████████████████████████████| 2608/2608 [05:55<00:00,  7.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [00:11<00:00, 25.17it/s]


              precision    recall  f1-score   support

         LOC       0.97      0.96      0.97      1951
         ORG       0.93      0.92      0.92       984
         PER       0.99      0.98      0.98       884

   micro avg       0.96      0.96      0.96      3819
   macro avg       0.96      0.95      0.96      3819
weighted avg       0.96      0.96      0.96      3819

Done!


## 测试模型 & 保存预测结果
模型的输出是一个由预测向量组成的列表，每个向量对应一个 token 的预测结果，只需要在输出 logits 值上运用 softmax 函数就可以获得实体类别的预测概率。

In [25]:
sentence = '日本外务省3月18日发布消息称，日本首相岸田文雄将于19至21日访问印度和柬埔寨。'

model.load_state_dict(
    torch.load('./model/epoch_1_valid_macrof1_95.786_microf1_95.912_weights.bin', map_location=torch.device(device))
)
model.eval()
results = []
with torch.no_grad():
    # offset_mapping（只有 fast tokenizer 且 return_offsets_mapping=True 时才有）。
    # offset_mapping 的形状通常是 (1, seq_len, 2)，
    # 每个 token 对应一个 (start_char, end_char) 的元组（字符级索引，基于原始 sentence）。
    inputs = tokenizer(sentence, truncation=True, return_tensors="pt", 
                       return_offsets_mapping=True)
    offsets = inputs.pop('offset_mapping').squeeze(0)
    inputs = inputs.to(device)
    pred = model(inputs)
    probabilities = torch.nn.functional.softmax(pred, dim=-1)[0].cpu().numpy().tolist()
    predictions = pred.argmax(dim=-1)[0].cpu().numpy().tolist()

    pred_label = []
    idx = 0
    while idx < len(predictions):
        pred = predictions[idx]
        label = id2label[pred]
        if label != "O":
            label = label[2:] # Remove the B- or I-
            start, end = offsets[idx]
            all_scores = [probabilities[idx][pred]]
            # Grab all the tokens labeled with I-label
            while (
                idx + 1 < len(predictions) and 
                id2label[predictions[idx + 1]] == f"I-{label}"
            ):
                all_scores.append(probabilities[idx + 1][predictions[idx + 1]])
                _, end = offsets[idx + 1]
                idx += 1

            score = np.mean(all_scores).item()
            start, end = start.item(), end.item()
            word = sentence[start:end]
            pred_label.append(
                {
                    "entity_group": label,
                    "score": score,
                    "word": word,
                    "start": start,
                    "end": end,
                }
            )
        idx += 1

  torch.load('./model/epoch_1_valid_macrof1_95.786_microf1_95.912_weights.bin', map_location=torch.device(device))


In [29]:
pred_label

[{'entity_group': 'ORG',
  'score': 0.9992446899414062,
  'word': '日本外务省',
  'start': 0,
  'end': 5},
 {'entity_group': 'LOC',
  'score': 0.9975488781929016,
  'word': '日本',
  'start': 16,
  'end': 18},
 {'entity_group': 'PER',
  'score': 0.9988918155431747,
  'word': '岸田文雄',
  'start': 20,
  'end': 24},
 {'entity_group': 'LOC',
  'score': 0.9993538856506348,
  'word': '印度',
  'start': 34,
  'end': 36},
 {'entity_group': 'LOC',
  'score': 0.9988286892573038,
  'word': '柬埔寨',
  'start': 37,
  'end': 40}]

还可以扩展上面的代码进行数据集（测试集）的处理，并把预测的结果保存在json格式的文件里

In [30]:
import json

model.load_state_dict(
    torch.load('./model/epoch_1_valid_macrof1_95.786_microf1_95.912_weights.bin', map_location=torch.device('cpu'))
)
model.eval()
with torch.no_grad():
    print('evaluating on test set...')
    true_labels, true_predictions = [], []
    for X, y in tqdm(test_dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        predictions = pred.argmax(dim=-1).cpu().numpy().tolist()
        labels = y.cpu().numpy().tolist()
        true_labels += [[id2label[int(l)] for l in label if l != -100] for label in labels]
        true_predictions += [
            [id2label[int(p)] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
    print(classification_report(true_labels, true_predictions, mode='strict', scheme=IOB2))
    results = []
    print('predicting labels...')
    for s_idx in tqdm(range(len(test_data))):
        example = test_data[s_idx]
        inputs = tokenizer(example['sentence'], truncation=True, return_tensors="pt")
        inputs = inputs.to(device)
        pred = model(inputs)
        probabilities = torch.nn.functional.softmax(pred, dim=-1)[0].cpu().numpy().tolist()
        predictions = pred.argmax(dim=-1)[0].cpu().numpy().tolist()

        pred_label = []
        inputs_with_offsets = tokenizer(example['sentence'], return_offsets_mapping=True)
        tokens = inputs_with_offsets.tokens()
        offsets = inputs_with_offsets["offset_mapping"]

        idx = 0
        while idx < len(predictions):
            pred = predictions[idx]
            label = id2label[pred]
            if label != "O":
                label = label[2:] # Remove the B- or I-
                start, end = offsets[idx]
                all_scores = [probabilities[idx][pred]]
                # Grab all the tokens labeled with I-label
                while (
                    idx + 1 < len(predictions) and 
                    id2label[predictions[idx + 1]] == f"I-{label}"
                ):
                    all_scores.append(probabilities[idx + 1][predictions[idx + 1]])
                    _, end = offsets[idx + 1]
                    idx += 1

                score = np.mean(all_scores).item()
                word = example['sentence'][start:end]
                pred_label.append(
                    {
                        "entity_group": label,
                        "score": score,
                        "word": word,
                        "start": start,
                        "end": end,
                    }
                )
            idx += 1
        results.append(
            {
                "sentence": example['sentence'], 
                "pred_label": pred_label, 
                "true_label": example['labels']
            }
        )
    with open('test_data_pred.json', 'wt', encoding='utf-8') as f:
        for exapmle_result in results:
            f.write(json.dumps(exapmle_result, ensure_ascii=False) + '\n')

  torch.load('./model/epoch_1_valid_macrof1_95.786_microf1_95.912_weights.bin', map_location=torch.device('cpu'))


evaluating on test set...


100%|████████████████████████████████████████████████████████████████████████████████| 580/580 [00:23<00:00, 24.51it/s]


              precision    recall  f1-score   support

         LOC       0.95      0.95      0.95      3658
         ORG       0.91      0.91      0.91      2185
         PER       0.98      0.98      0.98      1864

   micro avg       0.95      0.95      0.95      7707
   macro avg       0.95      0.95      0.95      7707
weighted avg       0.95      0.95      0.95      7707

predicting labels...


100%|██████████████████████████████████████████████████████████████████████████████| 4636/4636 [02:14<00:00, 34.36it/s]


In [None]:
from datasets import load_dataset

data = load_dataset("json", data_files="./dataset/test_data_pred.json", split="train", lines=True)

In [37]:
data = []
with open("./dataset/test_data_pred.json", 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.strip()))

In [39]:
data[1082]

{'sentence': '李瑞环向莫诺里介绍了中国人民政协的情况。',
 'pred_label': [{'entity_group': 'PER',
   'score': 0.9997104406356812,
   'word': '李瑞环',
   'start': 0,
   'end': 3},
  {'entity_group': 'PER',
   'score': 0.999658465385437,
   'word': '莫诺里',
   'start': 4,
   'end': 7},
  {'entity_group': 'ORG',
   'score': 0.9993860423564911,
   'word': '中国人民政协',
   'start': 10,
   'end': 16}],
 'true_label': [[0, 2, '李瑞环', 'PER'],
  [4, 6, '莫诺里', 'PER'],
  [10, 15, '中国人民政协', 'ORG']]}