## 命名实体识别（NER）任务

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification

In [2]:
import  evaluate
from datasets import load_dataset

## 加载数据集

In [4]:
# Load the "peoples_daily_ner" dataset, caching the downloaded files under ./data.
# trust_remote_code=True allows the dataset's own loading script to be executed.
ner_datasets = load_dataset("peoples_daily_ner",cache_dir="./data",trust_remote_code= True )

Downloading data: 100%|██████████| 6.28M/6.28M [01:02<00:00, 101kB/s] 
Downloading data: 100%|██████████| 702k/702k [00:10<00:00, 69.5kB/s] 
Downloading data: 100%|██████████| 1.41M/1.41M [00:12<00:00, 115kB/s] 
Generating train split: 100%|██████████| 20865/20865 [00:05<00:00, 3598.29 examples/s]
Generating validation split: 100%|██████████| 2319/2319 [00:00<00:00, 3513.82 examples/s]
Generating test split: 100%|██████████| 4637/4637 [00:01<00:00, 3710.24 examples/s]


In [5]:
# Show the available splits together with their columns and row counts.
ner_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

In [6]:
# Inspect the first training example: one token per character and one integer tag per token.
ner_datasets['train'][0]

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}

In [8]:
# Tag names declared by the dataset's "ner_tags" feature; an integer tag id is an
# index into this list (e.g. 5 -> 'B-LOC').
label_list = ner_datasets['train'].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

## 预处理

In [10]:
# Tokenizer for the same checkpoint ("hfl/chinese-macbert-base") that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
def process_function(example):
    """Tokenize a batch of pre-split sentences and align their NER tags to the tokens.

    Args:
        example: A batch with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of integer tag ids per sentence, parallel to
            "tokens").

    Returns:
        The tokenizer output for the batch with an extra "labels" entry. Each
        row of "labels" has one value per produced token: the tag id of the
        word the token came from, or -100 when the token has no source word
        (its word id is None). eval_metric filters out the -100 positions.
    """
    encoded = tokenizer(
        example["tokens"],
        max_length=128,
        is_split_into_words=True,
        truncation=True,
    )
    # word_ids(batch_index=row) maps every token of sentence `row` back to the
    # index of its source word, so the tag can be looked up directly.
    aligned = [
        [-100 if wid is None else tags[wid] for wid in encoded.word_ids(batch_index=row)]
        for row, tags in enumerate(example["ner_tags"])
    ]
    encoded["labels"] = aligned
    return encoded
        
        
    
    

In [15]:
# Run process_function over every split in batches. The original columns are kept
# and input_ids / token_type_ids / attention_mask / labels are added.
tokenized_dataset = ner_datasets.map(process_function,batched= True)
tokenized_dataset

Map: 100%|██████████| 20865/20865 [00:02<00:00, 8042.17 examples/s]
Map: 100%|██████████| 2319/2319 [00:00<00:00, 7089.06 examples/s]
Map: 100%|██████████| 4637/4637 [00:00<00:00, 7967.54 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4637
    })
})

In [22]:
# Sanity-check one processed example: "labels" should line up with "input_ids",
# with -100 at the positions that have no source word (first and last here).
print(tokenized_dataset['train'][0])

{'id': '0', 'tokens': ['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100]}


In [23]:
# Build the token-classification model on top of the pretrained encoder.
# The tag-name mappings are passed at construction time so they are stored in the
# model config from the start: checkpoints written during training then carry the
# real tag names (B-PER, I-LOC, ...) instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    "hfl/chinese-macbert-base",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 评估函数

In [24]:
# Load the "seqeval" metric through the `evaluate` library; displaying the object
# prints its usage text (expected inputs and options).
seqeval = evaluate.load("seqeval")
seqeval

EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

In [25]:
import numpy as np

def eval_metric(pred):
    """Compute the overall seqeval F1 score for one evaluation pass.

    Args:
        pred: A pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (the class is taken with argmax over the last axis);
            ``labels`` holds the reference tag ids, with -100 marking positions
            that must be left out of the score.

    Returns:
        A dict ``{"f1": <overall F1>}`` computed by seqeval with the IOB2
        scheme in strict mode.
    """
    logits, gold_ids = pred
    pred_ids = np.argmax(logits, axis=-1)

    true_pred = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept_pred = []
        kept_gold = []
        for p, g in zip(pred_row, gold_row):
            # Positions labelled -100 are dropped from both sequences.
            if g == -100:
                continue
            kept_pred.append(label_list[p])
            kept_gold.append(label_list[g])
        true_pred.append(kept_pred)
        true_labels.append(kept_gold)

    scores = seqeval.compute(
        predictions=true_pred,
        references=true_labels,
        scheme="IOB2",
        mode="strict",
    )
    return {"f1": scores["overall_f1"]}

## 训练参数配置

In [26]:
# Training configuration. The number of epochs is not set here, so the library
# default applies (the training log below shows 3 epochs).
args = TrainingArguments(
    output_dir= "model_for_ner",        # checkpoints are written under this directory
    per_device_train_batch_size= 64,
    per_device_eval_batch_size= 128,
    eval_strategy= "epoch",             # evaluate once per epoch
    save_strategy= "epoch",             # save once per epoch, matching eval_strategy
    metric_for_best_model= "f1",        # the "f1" key returned by eval_metric
    load_best_model_at_end= True,       # reload the best-scoring checkpoint when training ends
    logging_steps= 50                   # log the training loss every 50 steps
)

## 创建训练器

In [27]:
# Wire the model, data and metric together. Validation data drives the per-epoch
# evaluation; the test split is kept aside for the final evaluation cell.
trainer = Trainer(
    model= model,
    args= args,
    train_dataset= tokenized_dataset["train"],
    eval_dataset= tokenized_dataset["validation"],
    compute_metrics= eval_metric,
    # Token-classification collator: batches examples, padding the labels as well as the inputs.
    data_collator= DataCollatorForTokenClassification(tokenizer= tokenizer)
)

## 训练

In [28]:
# Fine-tune the model; validation loss and F1 are reported after each epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,0.0279,0.020639,0.944635
2,0.0127,0.017483,0.951952
3,0.0063,0.017506,0.9562




TrainOutput(global_step=981, training_loss=0.028723309695659914, metrics={'train_runtime': 23220.5867, 'train_samples_per_second': 2.696, 'train_steps_per_second': 0.042, 'total_flos': 3940951205762142.0, 'train_loss': 0.028723309695659914, 'epoch': 3.0})

In [29]:
# Final check on the held-out test split (not used during training).
trainer.evaluate(eval_dataset= tokenized_dataset["test"])



{'eval_loss': 0.023619234561920166,
 'eval_f1': 0.948868959978584,
 'eval_runtime': 554.1418,
 'eval_samples_per_second': 8.368,
 'eval_steps_per_second': 0.067,
 'epoch': 3.0}

In [30]:
# Attach the tag names to the model config so the pipeline reports entity types
# (e.g. ORG) instead of generic LABEL_<n> names. label2id is set together with
# id2label so both mappings stay consistent if the model is saved afterwards.
model.config.id2label = {idx : label for idx,label in enumerate(label_list)}
model.config.label2id = {label: idx for idx, label in enumerate(label_list)}

from transformers import pipeline
# aggregation_strategy="simple" groups the tokens of one entity into a single result
# carrying its entity_group, score and character offsets.
ner_pipe = pipeline("token-classification",model = model,tokenizer = tokenizer , aggregation_strategy = "simple")
ner_pipe("我在华南理工大学上学")

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'ORG',
  'score': 0.99864966,
  'word': '华 南 理 工 大 学',
  'start': 2,
  'end': 8}]