In [59]:
import evaluate
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

In [60]:
# Load every split of the peoples_daily_ner dataset through the `datasets` library.
ner_data = load_dataset('peoples_daily_ner')

Found cached dataset peoples_daily_ner (/Users/vincent/.cache/huggingface/datasets/peoples_daily_ner/peoples_daily_ner/1.0.0/594461a1b34f61af9346123a420b9ea40f15c0e835562053bf025cef188477f5)
100%|██████████| 3/3 [00:00<00:00, 968.36it/s]


In [61]:
# Inspect the available splits, their columns and their row counts.
ner_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

In [62]:
# Peek at one raw training example: a list of tokens plus one integer tag id per token.
ner_data['train'][0]

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}

In [63]:
# Show the feature schema; ner_tags is a sequence of ClassLabel ids whose names are the NER tags.
ner_data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}

In [64]:
# Tag names in id order (list index == integer tag id); used later to size the
# classification head and to turn ids back into tag strings for scoring.
label_list = ner_data['train'].features['ner_tags'].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [65]:
# Tokenizer for the same checkpoint that the model cell loads further down.
tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-macbert-base')

In [66]:
# The example is already a list of tokens, so is_split_into_words=True makes the
# tokenizer treat the list as one sentence rather than as a batch of sentences.
tokenizer(ner_data['train'][0]['tokens'], is_split_into_words=True)


{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [67]:
# Tokenize a short English phrase: here a single word is split into several word
# pieces, which makes the word/sub-token mapping inspected next easy to see.
inputs = tokenizer('interesting word')
inputs

{'input_ids': [101, 10673, 12865, 12921, 8181, 8681, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [68]:
# word_ids() gives, for each sub-token, the index of the word it came from;
# the special tokens at both ends map to None.
inputs.word_ids()

[None, 0, 0, 0, 0, 1, None]

In [69]:
# examples is one batch of data
def process_func(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        examples: One batch as passed by ``Dataset.map(..., batched=True)``.
            Reads ``examples['tokens']`` (a list of token lists) and
            ``examples['ner_tags']`` (a list of integer tag lists, one tag
            per token).

    Returns:
        The tokenizer output for the batch with an added ``labels`` entry:
        one list per example, with one label per sub-token. Special tokens
        and every sub-token after the first one of a word are labelled -100;
        the first sub-token of a word carries that word's tag.
    """
    inputs = tokenizer(examples['tokens'], max_length=128, truncation=True, is_split_into_words=True)
    labels = []
    for i, tags in enumerate(examples['ner_tags']):
        aligned = []
        previous_word = None
        for word_id in inputs.word_ids(batch_index=i):
            if word_id is None or word_id == previous_word:
                # Special token, or a continuation piece of the previous word.
                # Copying the word's tag onto continuation pieces would repeat
                # B-* tags and invent extra entity starts, so mask them instead.
                aligned.append(-100)
            else:
                aligned.append(tags[word_id])
            previous_word = word_id
        labels.append(aligned)
    inputs['labels'] = labels
    return inputs

In [70]:
# Number of training examples. Taking len() of the split itself avoids loading
# the entire ner_tags column into memory just to count its rows.
len(ner_data['train'])

20865

In [71]:
# Apply process_func to every split; batched=True hands it a batch of examples per call.
processed_ner_data = ner_data.map(process_func, batched=True)

Loading cached processed dataset at /Users/vincent/.cache/huggingface/datasets/peoples_daily_ner/peoples_daily_ner/1.0.0/594461a1b34f61af9346123a420b9ea40f15c0e835562053bf025cef188477f5/cache-8bc67b0699ceee7f.arrow
Loading cached processed dataset at /Users/vincent/.cache/huggingface/datasets/peoples_daily_ner/peoples_daily_ner/1.0.0/594461a1b34f61af9346123a420b9ea40f15c0e835562053bf025cef188477f5/cache-4acca6074fbb5bed.arrow


In [72]:
# Check one processed example: the original columns are kept, and the new labels
# list has one entry per input id with -100 on the two special tokens.
processed_ner_data['train'][0]

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0],
 'input_ids': [101,
  3862,
  7157,
  3683,
  6612,
  1765,
  4157,
  1762,
  1336,
  7305,
  680,
  7032,
  7305,
  722,
  7313,
  4638,
  3862,
  1818,
  511,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100]}

In [88]:
# Build the token-classification model on top of the pretrained encoder.
# Besides sizing the head, register the tag names in the config so that saved
# checkpoints and predictions report e.g. 'B-PER' instead of generic 'LABEL_1'.
model = AutoModelForTokenClassification.from_pretrained(
    'hfl/chinese-macbert-base',
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)

Some weights of the model checkpoint at hfl/chinese-macbert-base were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized f

In [89]:
# Confirm the number of output labels configured on the model (one per entry of label_list).
model.config.num_labels

7

In [76]:
# Load the seqeval metric; per its usage text it takes predictions and references
# as lists of lists of tag strings.
seqeval = evaluate.load('seqeval')
seqeval

Downloading builder script: 100%|██████████| 6.34k/6.34k [00:00<00:00, 5.32MB/s]


EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

In [114]:
import numpy as np
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
def eval_metric(pred):
    """Compute the overall seqeval F1 for one evaluation pass.

    Args:
        pred: A ``(predictions, labels)`` pair. ``predictions`` holds per-tag
            scores for each position and ``labels`` the gold tag ids with -100
            on positions to skip. The original author's note gives the shapes
            as (2319, 128, 7) and (2319, 128) — TODO confirm.

    Returns:
        ``{'f1': <overall_f1>}`` as reported by seqeval in strict IOB2 mode.
    """
    scores, gold_ids = pred
    pred_ids = np.argmax(scores, axis=-1)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    report = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        mode='strict',
        scheme='IOB2',
    )
    return {'f1': report['overall_f1']}
    

In [115]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best 'f1' (the key returned by eval_metric) when training ends.
# The MPS backend is requested only when this PyTorch build reports it as
# available, so the cell also runs on machines without Apple-silicon GPU support.
args = TrainingArguments(output_dir='models_for_ner',
                         per_device_train_batch_size=64,
                         per_device_eval_batch_size=128,
                         evaluation_strategy='epoch',
                         save_strategy='epoch',
                         metric_for_best_model='f1',
                         load_best_model_at_end=True,
                         logging_steps=50,
                         use_mps_device=torch.backends.mps.is_available())

In [116]:
from transformers import DataCollatorForTokenClassification
# Assemble the Trainer from the model, the training arguments, the processed
# train/validation splits and the F1 metric function. The collator is built from
# the tokenizer (presumably it pads inputs and labels per batch — confirm in the
# transformers documentation).
trainer = Trainer(model=model, 
                  args=args,
                  train_dataset=processed_ner_data['train'],
                  eval_dataset=processed_ner_data['validation'],
                  compute_metrics=eval_metric,
                  data_collator=DataCollatorForTokenClassification(tokenizer))

In [117]:
# Start fine-tuning. NOTE(review): the saved output below ends in a
# KeyboardInterrupt, so the recorded run was stopped manually before finishing.
trainer.train()

  0%|          | 3/981 [05:59<32:33:06, 119.82s/it]


  0%|          | 3/981 [00:24<2:05:26,  7.70s/it]

[A[A                                         
                                                 
  0%|          | 3/981 [00:24<2:05:26,  7.70s/it]

{'loss': 0.0002, 'learning_rate': 4.984709480122324e-05, 'epoch': 0.01}




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A                                         


[A[A[A                                      
                                                 
[A

  0%|          | 3/981 [00:45<2:05:26,  7.70s/it]

[A[A

{'eval_loss': 0.023468000814318657, 'eval_f1': 0.8798155181768855, 'eval_runtime': 21.3946, 'eval_samples_per_second': 108.392, 'eval_steps_per_second': 0.888, 'epoch': 0.01}


  1%|          | 6/981 [01:09<3:00:59, 11.14s/it]

[A[A                                         
                                                 
  1%|          | 6/981 [01:09<3:00:59, 11.14s/it]

{'loss': 0.0026, 'learning_rate': 4.9694189602446484e-05, 'epoch': 0.02}




[A[A

[A[A

KeyboardInterrupt: 