# 1.数据标注体系
##  IOB2标注
IOB2标注体系，即Inside-Outside-Beginning(INS-OUTS-BEGS)。
- I表示实体内部, O表示非实体，B表示实体的开始。
- B/I-XXX：表示一个单词属于XXX实体
## IOBES标注
IOBES标注体系，即Inside-Outside-Beginning-Single-End(INS-OUTS-BEGS-SINGLE-END)。
- I表示实体内部, O表示非实体，B表示实体的开始，E表示实体的结束, S表示一个词单独形成一个命名实体。
- 有时会用M替代I(middle -> inside)

# 2.评估指标
- Precision = TP / (TP + FP)
- Recall = TP / (TP + FN)
- F1 = 2 * Precision * Recall / (Precision + Recall)

# 3.基于Transformers的解决办法：
## 模型结构：
- **ModleForTokenClassification*
## 评估函数：
- **seqeval**

In [1]:
# 导入相关包
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
from datasets import load_dataset

In [2]:
# 导入数据集
# dataset = load_dataset("lansinuote/peoples-daily-ner", cache_dir="../dataset")
dataset = load_dataset('../dataset/peoples-daily-ner')
dataset, dataset['train']

(DatasetDict({
     train: Dataset({
         features: ['id', 'tokens', 'ner_tags'],
         num_rows: 20865
     })
     validation: Dataset({
         features: ['id', 'tokens', 'ner_tags'],
         num_rows: 2319
     })
     test: Dataset({
         features: ['id', 'tokens', 'ner_tags'],
         num_rows: 4637
     })
 }),
 Dataset({
     features: ['id', 'tokens', 'ner_tags'],
     num_rows: 20865
 }))

In [3]:
dataset['train'].features

{'id': Value('string'),
 'tokens': List(Value('string')),
 'ner_tags': List(ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']))}

In [4]:
label_list = dataset['train'].features['ner_tags'].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [5]:
# 数据集预处理
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /hfl/chinese-macbert-base/resolve/main/tokenizer_config.json (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None)))"), '(Request ID: 79e830f0-2d52-43c1-a2f0-bc675408c45a)')' thrown while requesting HEAD https://huggingface.co/hfl/chinese-macbert-base/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /hfl/chinese-macbert-base/resolve/main/tokenizer_config.json (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None)))"), '(Request ID: 07623928-c045-4066-805d-8246ba87fdc0)')' thrown while requesting HEAD https://huggingface.co/hfl/chinese-macbert-base/resolve/main/tokenizer_config.json
Retrying in 2s [Retry 2/5].
'(MaxRetryError("HTTPSConnectionPool(h

In [6]:
# 一个词可能会拆开 因此之前的ner_tags可能并不使用于tokenized的句子
res = tokenizer('interesting World')
res, res.word_ids()

({'input_ids': [101, 10673, 12865, 12921, 8181, 8572, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]},
 [None, 0, 0, 0, 0, 1, None])

In [7]:
def process_function(examples):
    tokenized_examples = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, max_length=128)
    labels = []
    for i, label in enumerate(examples['ner_tags']):  #examples是按批次处理 i是批次索引
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:        #解决一个词会被拆分的问题
            if word_id is None:
                label_ids.append(-100)  #交叉熵损失函数默认不计算的损失值
            else:
                label_ids.append(label[word_id]) #获取标签
        labels.append(label_ids)
    tokenized_examples['labels'] = labels
    return tokenized_examples

dataset = dataset.map(process_function, batched=True,remove_columns=['tokens', 'ner_tags'])
dataset['train'][0]

{'id': '0',
 'input_ids': [101,
  3862,
  7157,
  3683,
  6612,
  1765,
  4157,
  1762,
  1336,
  7305,
  680,
  7032,
  7305,
  722,
  7313,
  4638,
  3862,
  1818,
  511,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100]}

In [None]:
# 创建模型
model = AutoModelForTokenClassification.from_pretrained("hfl/chinese-macbert-base",num_labels=len(label_list))


# 冻结BERT参数 只对分类头做微调
for param in model.bert.parameters():
    param.requires_grad = False

for name, param in model.named_parameters():
    print(f"{name}: requires_grad={param.requires_grad}, shape={param.shape}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert.embeddings.word_embeddings.weight: requires_grad=False, shape=torch.Size([21128, 768])
bert.embeddings.position_embeddings.weight: requires_grad=False, shape=torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight: requires_grad=False, shape=torch.Size([2, 768])
bert.embeddings.LayerNorm.weight: requires_grad=False, shape=torch.Size([768])
bert.embeddings.LayerNorm.bias: requires_grad=False, shape=torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight: requires_grad=False, shape=torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias: requires_grad=False, shape=torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight: requires_grad=False, shape=torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias: requires_grad=False, shape=torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight: requires_grad=False, shape=torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias: requires_grad=False, shape=torch.S

seqeval输入参数
1. suffix:指定实体类型标签在标记中的位置。
- False:默认模式，标签是前缀模式如B-PER, I-ORG, B-LOC, O
- True :后缀模式，如PER-B, ORG-I, LOC-O, O
2. mode:是否将实体类型正确但边界标记（I/B）错误的预测计入正确统计
- strict:严格模式，即只统计完全正确的预测
- None:默认模式，会将实体类型正确但边界标记（I/B）错误的预测也统计为正确

In [12]:
#创建评估函数
seqeval = evaluate.load("seqeval")
seqeval

EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': List(Value('string')), 'references': List(Value('string'))}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of shape (n_samples,), weights for individual samples. default: None
    zero_division: Which value to substitute as a

In [13]:
import numpy as np

def eval_metirc(pred):
    preds, labels = pred
    preds = np.argmax(preds, axis=-1)
    true_preds , true_labels = [], []
    for pred, label in zip(preds, labels):
        ps , ls = [], []
        for p, l in zip(pred, label): 
            if l != -100:                        # 不是填充值
                ps.append(label_list[p])         # 转换为标签
                ls.append(label_list[l])
        true_preds.append(ps)
        true_labels.append(ls)
    result = seqeval.compute(predictions=true_preds, references=true_labels, scheme='IOB2', mode='strict')
    return{
        'f1' : result['overall_f1'],
        'recall' : result['overall_recall'],
        'precision' : result['overall_precision']
    }

In [14]:
# 配置训练参数
args = TrainingArguments(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy='epoch',
    run_name='runs',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    logging_steps=50
)

In [15]:
# 创建训练器
trianer = Trainer(
    model = model,
    args = args,
    train_dataset = dataset['train'],
    eval_dataset = dataset['validation'],
    compute_metrics=eval_metirc,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

对于分类头只训练三轮还是可以看出来训练结果有点差的

In [16]:
trianer.train()

Epoch,Training Loss,Validation Loss,F1,Recall,Precision
1,0.3493,0.310229,0.044248,0.023243,0.459459
2,0.2426,0.221724,0.327681,0.235165,0.540201
3,0.2167,0.203672,0.388489,0.295324,0.567525


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1959, training_loss=0.39783096520373745, metrics={'train_runtime': 222.925, 'train_samples_per_second': 280.789, 'train_steps_per_second': 8.788, 'total_flos': 3668961649445598.0, 'train_loss': 0.39783096520373745, 'epoch': 3.0})

In [17]:
# 模型预测
from transformers import pipeline

In [31]:
pipe = pipeline('token-classification',model, tokenizer=tokenizer, aggregation_strategy='simple')

Device set to use cuda:0


In [32]:
model.config.id2label = {idx: label for idx, label in enumerate(label_list)}
model.config

BertConfig {
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type"

In [33]:
pipe('小明在北京')

[{'entity_group': 'LOC',
  'score': np.float32(0.38904288),
  'word': '北',
  'start': 3,
  'end': 4}]