In [1]:
import functools
import json
import os
import time

import paddle
from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorForTokenClassification
from paddlenlp.datasets import load_dataset
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.transformers import AutoModelForTokenClassification
from paddlenlp.transformers import AutoTokenizer

  resample=Image.BILINEAR,
  resample=Image.NEAREST,
  resample=Image.BICUBIC,
  resample=Image.BICUBIC,


In [2]:
# data arguments
# Paths are relative to the notebook's working directory.
train_file_path = "datasets/msra_ner/train.tsv"
test_file_path = "datasets/msra_ner/test.tsv"
label_map_file_path = "datasets/msra_ner/label_map.json"
# Forwarded to the tokenizer as `max_seq_len` inside `_preprocess`.
max_seq_length = 128

# model arguments
# Name passed to both AutoTokenizer and AutoModelForTokenClassification.
model_name = "ernie-3.0-medium-zh"

# paddle training arguments
batch_size = 32
learning_rate = 2e-5
epochs = 10
# Directory where the best model and the tokenizer files are saved during training.
ckpt_dir = "ernie_ckpt/ernie-3.0-medium-zh-msra-ner"

In [3]:
def read(data_path):
    """Yield one NER example per non-blank line of a TSV file.

    Each line must contain exactly two tab-separated fields: the tokens and
    the labels. Within a field, items are joined by the "\\002" control
    character.

    Args:
        data_path: Path of the TSV file to read. The file is decoded as UTF-8.

    Yields:
        dict: ``{"tokens": [...], "labels": [...]}`` with both values being
        lists of strings.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(data_path, "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.rstrip()
            # A blank line (e.g. a trailing newline at end of file) carries no
            # example; skip it instead of failing on the unpack below.
            if not line:
                continue
            tokens_str, labels_str = line.split("\t")
            tokens = tokens_str.split("\002")
            labels = labels_str.split("\002")
            yield {"tokens": tokens, "labels": labels}

In [4]:
# Disabled alternative: load the dataset by its registered name instead of from local files.
# train_ds, test_ds = load_dataset('msra_ner', splits=('train', 'test'), lazy=False)

# Build both splits from the local TSV files; `read` is the generator that
# parses them and `data_path` is forwarded to it as a keyword argument.
train_ds = load_dataset(read, data_path=train_file_path, lazy=False)
test_ds = load_dataset(read, data_path=test_file_path, lazy=False)

In [5]:
# Disabled alternative: derive the label mapping from the dataset's own label list.
# all_labels = train_ds.label_list
# label_2_label_id = {label: i for i, label in enumerate(all_labels)}
# label_id_2_label = {label_id: label for label, label_id in label_2_label_id.items()}
# num_classes = len(all_labels)

# Load the label -> label-id mapping from JSON. The `with` block guarantees the
# file handle is closed, and the explicit encoding keeps the read independent
# of the platform's default locale.
with open(label_map_file_path, "r", encoding="utf-8") as label_map_file:
    label_map = json.load(label_map_file)
# Number of distinct labels; used as the size of the model's classification output.
num_classes = len(label_map)

In [6]:
# Load the tokenizer and the token-classification model for the same pretrained name.
# NOTE(review): `use_faster=True` presumably requests the fast tokenizer backend — confirm
# against the installed paddlenlp version.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_faster=True)
# `num_classes` is the number of entries in `label_map`.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_classes=num_classes)

[32m[2022-09-28 14:43:52,908] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-09-28 14:43:52,909] [    INFO][0m - Already cached /Users/gerry.xu/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh_vocab.txt[0m
[32m[2022-09-28 14:43:52,925] [    INFO][0m - tokenizer config file saved in /Users/gerry.xu/.paddlenlp/models/ernie-3.0-medium-zh/tokenizer_config.json[0m
[32m[2022-09-28 14:43:52,926] [    INFO][0m - Special tokens file saved in /Users/gerry.xu/.paddlenlp/models/ernie-3.0-medium-zh/special_tokens_map.json[0m
[32m[2022-09-28 14:43:52,927] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForTokenClassification'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-09-28 14:43:52,928] [    INFO][0m - Already cached /Users/gerry.xu/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh.pdparams[0m


In [7]:
def _preprocess(example, tokenizer, label_map, max_seq_length=128):
    tokens = example["tokens"]  # list of tokens
    labels = [label_map[label] for label in example["labels"]]  # list of label ids
    no_entity_id = label_map["O"]

    tokens_encoded = tokenizer(tokens, return_length=True, is_split_into_words=True, max_seq_len=max_seq_length)

    input_ids_len = len(tokens_encoded["input_ids"])  # input_ids_len = max_seq_len
    # 如果 input_ids_len - 2 < len(labels)，说明输入的 tokens 的长度超过 max_seq_len，被截断了
    if input_ids_len - 2 < len(labels):
        labels = labels[:input_ids_len - 2]
    tokens_encoded["labels"] = [no_entity_id] + labels + [no_entity_id]
    tokens_encoded["labels"] += [no_entity_id] * (input_ids_len - len(tokens_encoded["labels"]))

    return tokens_encoded

In [8]:
# Bind the tokenizer, label map and length limit so the mapper takes only an example.
trans_func = functools.partial(_preprocess, tokenizer=tokenizer, label_map=label_map, max_seq_length=max_seq_length)
# NOTE(review): the dataset names are rebound to the mapped result, so re-running this cell
# would presumably feed already-processed examples back into _preprocess — confirm before re-executing.
train_ds = train_ds.map(trans_func)
test_ds = test_ds.map(trans_func)

# collate_fn pads sequences of different lengths to the longest sequence in the batch, then stacks the data.
# Labels are padded with -1, the same value the loss is configured to ignore.
collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, label_pad_token_id=-1)

In [9]:
# Training batches are drawn in shuffled order.
train_batch_sampler = BatchSampler(train_ds, batch_size=batch_size, shuffle=True)
train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)

# Evaluation batches keep the dataset order (no shuffling).
test_batch_sampler = BatchSampler(test_ds, batch_size=batch_size, shuffle=False)
test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=collate_fn)

In [10]:
# AdamW optimizer, cross-entropy loss, and ChunkEvaluator metric
optimizer = paddle.optimizer.AdamW(learning_rate=learning_rate, parameters=model.parameters())
# ignore_index=-1 matches the label_pad_token_id=-1 used by collate_fn.
loss_obj = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1)
# NOTE(review): this assumes the key order of label_map matches the label ids
# (first key -> id 0, and so on) — verify against label_map.json.
metric_obj = ChunkEvaluator(label_list=list(label_map.keys()))

In [11]:
@paddle.no_grad()
def evaluate(data_loader, model, metric):
    """Run the model over a data loader and report chunk-level metrics.

    The model is switched to eval mode for the pass and back to train mode
    before returning. The metric is reset first, so the result covers only
    the batches in ``data_loader``.

    Args:
        data_loader: Iterable of batches; each batch must provide
            'input_ids', 'token_type_ids', 'labels' and 'seq_len'.
        model: Callable as ``model(input_ids, token_type_ids)`` returning
            logits; must provide ``eval()`` and ``train()``.
        metric: Object providing ``reset()``, ``compute(lens, preds, labels)``,
            ``update(...)`` and ``accumulate()``.

    Returns:
        tuple: ``(precision, recall, f1_score)``. All three are 0 when the
        loader yields no batches.
    """
    model.eval()
    metric.reset()

    precision, recall, f1_score = 0, 0, 0
    saw_batch = False
    for batch in data_loader:
        input_ids, token_type_ids, labels, lens = batch['input_ids'], batch['token_type_ids'], batch['labels'], batch['seq_len']
        logits = model(input_ids, token_type_ids)
        # Predicted label id per position: the index of the largest logit.
        preds = paddle.argmax(logits, axis=-1)
        n_infer, n_label, n_correct = metric.compute(lens, preds, labels)
        metric.update(n_infer.numpy(), n_label.numpy(), n_correct.numpy())
        saw_batch = True

    # Only the totals over all batches are reported, so accumulate once after
    # the loop rather than after every batch.
    if saw_batch:
        precision, recall, f1_score = metric.accumulate()

    print("eval precision: %.6f - recall: %.6f - f1: %.6f" % (precision, recall, f1_score))
    model.train()
    return precision, recall, f1_score

In [12]:
global_step = 0  # total number of training iterations so far
best_step = 0  # global_step at which best_f1_score was reached
best_f1_score = 0  # highest evaluation F1 seen so far

tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # Compute the model output and the loss value.
        logits = model(input_ids, token_type_ids)
        loss = paddle.mean(loss_obj(logits, labels))

        # Every 10 iterations, print the loss value and the training speed.
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.6f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, 10 / (time.time() - tic_train))
            )
            tic_train = time.time()

        # Back-propagate the gradients and update the parameters.
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # Every 200 iterations, evaluate the current model and, if it is the best so far,
        # save its parameters together with the tokenizer files.
        if global_step % 200 == 0:
            eval_started = time.time()
            save_dir = ckpt_dir
            # exist_ok avoids the separate existence check and its check-then-create race.
            os.makedirs(save_dir, exist_ok=True)
            print('global_step', global_step, end=' ')
            _, _, eval_f1_score = evaluate(test_data_loader, model, metric_obj)
            if eval_f1_score > best_f1_score:
                best_f1_score = eval_f1_score
                best_step = global_step

                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
            # Shift the speed timer by the time spent evaluating and saving so the
            # next "step/s" report reflects training steps only.
            tic_train += time.time() - eval_started

KeyboardInterrupt: 