In [1]:
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate  
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 加载hf中dataset
ds = load_dataset('ds_msra_ner')
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3443
    })
})

In [3]:
for items in ds['train']:
    print(items['tokens'])
    print(items['ner_tags'])
    break

['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [4]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

### 实体映射字典

'O':0   
'B-PER':1   
'I-PER':2   
'B-LOC':3   
'I-LOC':4   
'B-ORG':5   
'I-ORG':6   

In [5]:
# 验证tag标签数量
tags_id = set()
for items in ds['train']:
    tags_id.update(items['ner_tags'])
    
tags_id

{0, 1, 2, 3, 4, 5, 6}

In [6]:
# entity_index
entites = ['O'] + list({'PER', 'LOC', 'ORG'})
tags = ['O']
for entity in entites[1:]:
    tags.append('B-' + entity.upper())
    tags.append('I-' + entity.upper())

entity_index = {entity:i for i, entity in enumerate(entites)}


In [7]:
entity_index

{'O': 0, 'ORG': 1, 'PER': 2, 'LOC': 3}

In [8]:
tags

['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']

In [None]:
def data_input_proc(item):
    # 文本已经分为字符，且tag索引也已经提供
    # 所以数据预处理反而简单
    # 导入已拆分为字符的列表，需要设置参数is_split_into_words=True
    input_data = tokenizer(item['tokens'], 
                           truncation=True,
                           add_special_tokens=False, 
                           max_length=512, 
                           is_split_into_words=True,
                           return_offsets_mapping=True)
    
    labels = [lbl[:512] for lbl in item['ner_tags']]
    input_data['labels'] = labels
    return input_data

ds1 = ds.map(data_input_proc, batched=True)

In [10]:
ds1.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [11]:
for item in ds1['train']:
    print(item)
    break

{'input_ids': tensor([2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636,  674, 1036, 4997,
        2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768,
        7599, 3198, 8024,  791, 1921, 3300, 3119, 5966,  817,  966, 4638,  741,
         872, 3766,  743, 8024, 3209, 3189, 2218, 1373,  872, 2637,  679, 2496,
        1159, 8013]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])}


### 构建模型对象

In [12]:
id2lbl = {i:tag for i, tag in enumerate(tags)}
lbl2id = {tag:i for i, tag in enumerate(tags)}

model = AutoModelForTokenClassification.from_pretrained('bert-base-chinese', 
                                                        num_labels=len(tags),
                                                        id2label=id2lbl,
                                                        label2id=lbl2id)
model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

### 模型训练 TrainningArguments

In [13]:
args = TrainingArguments(
    output_dir="msra_ner_train",  # 模型训练工作目录（tensorboard，临时模型存盘文件，日志）
    num_train_epochs = 3,    # 训练 epoch
    # save_safetensors=False,  # 设置False保存文件可以通过torch.load加载
    per_device_train_batch_size=32,  # 训练批次
    per_device_eval_batch_size=32,
    report_to='tensorboard',  # 训练输出记录
    eval_strategy="epoch",
)

### 模型训练 Trainer

In [14]:
# metric 方法
def compute_metric(result):
    # result 是一个tuple (predicts, labels)
    
    # 获取评估对象
    seqeval = evaluate.load('seqeval')
    predicts,labels = result
    predicts = np.argmax(predicts, axis=2)
    
    # 准备评估数据
    predicts = [[tags[p] for p,l in zip(ps,ls) if l != -100]
                 for ps,ls in zip(predicts,labels)]
    labels = [[tags[l] for p,l in zip(ps,ls) if l != -100]
                 for ps,ls in zip(predicts,labels)]
    results = seqeval.compute(predictions=predicts, references=labels)

    return results

In [15]:
# import evaluate 
# evaluate.load('seqeval')

In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [17]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds1['train'],
    eval_dataset=ds1['test'],
    data_collator=data_collator,
    compute_metrics=compute_metric
)

**模型训练**

In [18]:
trainer.train()



Epoch,Training Loss,Validation Loss,Loc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0605,0.028575,"{'precision': 0.9412798874824191, 'recall': 0.9386395511921458, 'f1': 0.9399578651685394, 'number': 2852}","{'precision': 0.939894319682959, 'recall': 0.9467731204258151, 'f1': 0.9433211799801128, 'number': 1503}","{'precision': 0.8098542678695351, 'recall': 0.884090909090909, 'f1': 0.8453458891705904, 'number': 1320}",0.90826,0.928106,0.918076,0.991366
2,0.0195,0.029246,"{'precision': 0.9527195165303946, 'recall': 0.9396914446002805, 'f1': 0.9461606354810238, 'number': 2852}","{'precision': 0.9425587467362925, 'recall': 0.9607451763140386, 'f1': 0.9515650741350906, 'number': 1503}","{'precision': 0.856638418079096, 'recall': 0.918939393939394, 'f1': 0.8866959064327485, 'number': 1320}",0.926402,0.940441,0.933368,0.992588
3,0.0064,0.032138,"{'precision': 0.9519604380077711, 'recall': 0.9449509116409537, 'f1': 0.9484427239134259, 'number': 2852}","{'precision': 0.9466089466089466, 'recall': 0.8729208250166334, 'f1': 0.9082727587400484, 'number': 1503}","{'precision': 0.8678038379530917, 'recall': 0.925, 'f1': 0.8954895489548955, 'number': 1320}",0.929587,0.921233,0.925392,0.992335


Downloading builder script: 100%|██████████| 6.34k/6.34k [00:00<00:00, 16.3MB/s]
Trainer is attempting to log a value of "{'precision': 0.9412798874824191, 'recall': 0.9386395511921458, 'f1': 0.9399578651685394, 'number': 2852}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.939894319682959, 'recall': 0.9467731204258151, 'f1': 0.9433211799801128, 'number': 1503}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8098542678695351, 'recall': 0.884090909090909, 'f1': 0.8453458891705904, 'number': 1320}" of type <class 'dict'> for key "eval/PER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to

TrainOutput(global_step=2112, training_loss=0.02358800316737457, metrics={'train_runtime': 1111.0185, 'train_samples_per_second': 121.513, 'train_steps_per_second': 1.901, 'total_flos': 1.180990200098808e+16, 'train_loss': 0.02358800316737457, 'epoch': 3.0})

**模型推理**

In [19]:
from transformers import pipeline

In [20]:
pipeline = pipeline('token-classification', 'msra_ner_train/checkpoint-2112')

Device set to use cuda:0


In [21]:
pipeline('双方确定了今后发展中美关系的指导方针')

[{'entity': 'B-LOC',
  'score': np.float32(0.99652195),
  'index': 10,
  'word': '中',
  'start': 9,
  'end': 10},
 {'entity': 'B-LOC',
  'score': np.float32(0.99696547),
  'index': 11,
  'word': '美',
  'start': 10,
  'end': 11}]