In [15]:
from transformers import TrainingArguments, Trainer
import torch
import numpy as np
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback
import random
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report


In [4]:
model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese', num_labels=2)

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [6]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [7]:
# 模型测试
message= "命名实体识别"
label = torch.tensor([0,1,0,2,5,4])

model_input = tokenizer([message], return_tensors='pt')
result = model(**model_input)

print(result.loss)
print(result.logits)

None
tensor([[[-0.1416, -0.1800],
         [-0.5181, -0.2601],
         [-0.2403, -0.1895],
         [-0.1746, -0.5615],
         [-0.0370, -0.3543],
         [ 0.3611, -0.1045],
         [ 0.0333, -0.3758],
         [ 0.1231,  0.1878]]], grad_fn=<ViewBackward0>)


In [8]:
# 加载hf中dataset
dataset = load_dataset('doushabao4766/msra_ner_k_V3')
dataset

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

(…)-00000-of-00001-42717a92413393f9.parquet:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

(…)-00000-of-00001-8899cab5fdab45bc.parquet:   0%|          | 0.00/946k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45001 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3443 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3443
    })
})

## 实体映射数据集词典准备

In [9]:
label_list = dataset["train"].features["ner_tags"].feature.names
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label[word_idx] != -100 else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/45001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

In [10]:
model = AutoModelForTokenClassification.from_pretrained(
    'google-bert/bert-base-chinese', 
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir="./ner-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to='tensorboard',  # 训练输出记录
    num_train_epochs=3,
    # weight_decay=0.01,
    # save_total_limit=2,
    save_safetensors=False,  # 设置False保存文件可以通过torch.load加载
    # load_best_model_at_end=True,
    # metric_for_best_model="f1",
    # logging_dir="./logs",
)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {
        "f1": seqeval.metrics.f1_score(true_labels, true_predictions),
        "precision": seqeval.metrics.precision_score(true_labels, true_predictions),
        "recall": seqeval.metrics.recall_score(true_labels, true_predictions),
    }


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.0055,0.041713,0.910263,0.900069,0.920691
2,0.0065,0.030682,0.945305,0.938815,0.951886
3,0.0021,0.036341,0.944957,0.941403,0.948537




TrainOutput(global_step=4221, training_loss=0.005060984551186416, metrics={'train_runtime': 1021.8536, 'train_samples_per_second': 132.116, 'train_steps_per_second': 4.131, 'total_flos': 9848563776930252.0, 'train_loss': 0.005060984551186416, 'epoch': 3.0})

In [20]:
def extract_entities(sentence, model, tokenizer, id2label):
    tokens = list(sentence)

    # 分词 + 放入模型设备
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # 模型预测
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

    # 标签解码
    labels = [id2label[idx] for idx in predictions]

    # BIO 解码成实体块
    entities = []
    entity = None
    for token, label in zip(tokens, labels):
        if label.startswith("B-"):
            if entity:
                entities.append(entity)
            entity = {"entity": label[2:], "content": token}
        elif label.startswith("I-") and entity and label[2:] == entity["entity"]:
            entity["content"] += token
        else:
            if entity:
                entities.append(entity)
                entity = None
    if entity:
        entities.append(entity)

    return entities


In [21]:
sentence = "双方确定了今后发展中美关系的指导方针。"
entities = extract_entities(sentence, model, tokenizer, id2label)
print(entities)


[{'entity': 'LOC', 'content': '美'}, {'entity': 'LOC', 'content': '关'}]
