### 0. Install and load library

In [None]:
#Restart kernel after installation: Runtime -> Restart runtime

#!pip install -U transformers sentencepiece datasets seqeval

In [27]:
import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
import numpy as np

In [4]:
task = 'ner'
model_checkpoint = 'distilbert-base-uncased'
batch_size = 8

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [29]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric('seqeval')

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

### 1. Load and preprocess dataset 

In [3]:
dataset = load_dataset("conll2003")

# dataset object itself is DatasetDict, which contains train/val/test set
dataset

Downloading:   0%|          | 0.00/2.60k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 4.63 MiB, generated: 9.78 MiB, post-processed: Unknown size, total: 14.41 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6...


Downloading:   0%|          | 0.00/650k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/146k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6. Subsequent calls will reuse this data.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
# take a quick look on one training sample
dataset['train'][0]

{'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'id': '0',
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.']}

In [12]:
label_list = dataset['train'].features['ner_tags'].feature.names

id2label = {idx:label for idx, label in enumerate(label_list)}
label2id = {label:idx for idx, label in id2label.items()}

print(f"Label list: {label_list}")
print(f"id2label: {id2label}")
print(f"Num of label: {len(label_list)}")

Label list: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
Num of label: 9


In [25]:
example = dataset['train'][420]
print(f"Original sentence token list: {example['tokens']}")

tokenized_input = tokenizer(example['tokens'], is_split_into_words=True)
print(f"tokenized input ids: {tokenized_input['input_ids']}")

tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
print(f"tokenized input: {tokens}")

# to keep track with subtokens from the same word: Huggingface -> Hugg, ##ing, ##face
# in this case: Lathwell -> la, ##th, ##well
print(f"tokenized input word_ids: {tokenized_input.word_ids()}")

# Lathwell is split into 3 subtokens and their word_ids are 4
# We can re-use that word_ids to get the correct ner tag
# in this case the fourth idx of ner_tags ~= 2 
print(f"Orignal ner tag: {example['ner_tags']}")

Original sentence token list: ['Somerset', '236-4', '(', 'M.', 'Lathwell', '85', ')', '.']
tokenized input ids: [101, 9198, 23593, 1011, 1018, 1006, 1049, 1012, 2474, 2705, 4381, 5594, 1007, 1012, 102]
tokenized input: ['[CLS]', 'somerset', '236', '-', '4', '(', 'm', '.', 'la', '##th', '##well', '85', ')', '.', '[SEP]']
tokenized input word_ids: [None, 0, 1, 1, 1, 2, 3, 3, 4, 4, 4, 5, 6, 7, None]
Orignal ner tag: [3, 0, 0, 1, 2, 0, 0, 0]


In [33]:
# if True: label subtokens with the ner tag
# if False: label the first subtoken only and label the rest of subtoken as -100
label_all_token = True

def tokenize_and_align_labels(examples):
    # examples is batch of inputs 
    tokenized_input = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []

    for sent_idx, sent_label in enumerate(examples[f'{task}_tags']):

        word_ids = tokenized_input.word_ids(batch_index=sent_idx)
        previous_word_idx = None

        # aligned label ids for current sent idx
        label_ids = []
        for word_idx in word_ids:
            # for special token: [CLS] [SEP]
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(sent_label[word_idx])
            else:
                label_ids.append(sent_label[word_idx] if label_all_token else -100)

            previous_word_idx = word_idx
        
        labels.append(label_ids)
    
    tokenized_input['labels'] = labels
    return tokenized_input


Then we will need a data collator that will batch our processed examples together while applying padding to make them all the same size (each pad will be padded to the length of its longest example). There is a data collator for this task in the Transformers library, that not only pads the inputs, but also the labels

In [31]:
print(f"example {task} tag: {example[f'{task}_tags']}")

labels = [label_list[i] for i in example[f'{task}_tags']]
print(f"labels: {labels}")

metric.compute(predictions=[labels], references=[labels])

example ner tag: [3, 0, 0, 1, 2, 0, 0, 0]
labels: ['B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O']


{'ORG': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'PER': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

So we will need to do a bit of post-processing on our predictions:
- select the predicted index (with the maximum logit) for each token
- convert it to its string label
- ignore everywhere we set a label of -100

The following function does all this post-processing on the result of `Trainer.evaluate` (which is a namedtuple containing predictions and labels) before applying the metric:

In [32]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

#### Tokenized dataset

In [34]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

### 2. Define TrainingArguments and Trainer

In [35]:
# https://huggingface.co/transformers/main_classes/configuration.html#transformers.PretrainedConfig

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [38]:
args = TrainingArguments(
    f"tesk-{task}",
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    logging_strategy='epoch',
    logging_dir='./logs'
)

In [40]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [41]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, chunk_tags, pos_tags.
***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1442,0.063254,0.92183,0.928739,0.925272,0.98262
2,0.043,0.062838,0.930212,0.937913,0.934046,0.984018
3,0.0237,0.064939,0.932837,0.940038,0.936424,0.984495


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, chunk_tags, pos_tags.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to tesk-ner/checkpoint-1756
Configuration saved in tesk-ner/checkpoint-1756/config.json
Model weights saved in tesk-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in tesk-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in tesk-ner/checkpoint-1756/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, chunk_tags, pos_tags.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to tesk-ner/checkpoint-3512
Configuration saved in tesk-ner/checkpoint-3512/config.json
Model weights saved in tes

TrainOutput(global_step=5268, training_loss=0.07031470061977795, metrics={'train_runtime': 317.9564, 'train_samples_per_second': 132.48, 'train_steps_per_second': 16.568, 'total_flos': 447058853741934.0, 'train_loss': 0.07031470061977795, 'epoch': 3.0})

In [42]:
# predicts on test set

preds = trainer.predict(tokenized_dataset['test'])

preds = (preds[0], preds[1])

results = compute_metrics(preds)
results


The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, chunk_tags, pos_tags.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 8


{'accuracy': 0.9744838063669636,
 'f1': 0.8893614507649595,
 'precision': 0.8853345877925438,
 'recall': 0.8934251127462616}

In [44]:
# differet way to get test results
predictions, labels, _ = trainer.predict(tokenized_dataset['test'])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, chunk_tags, pos_tags.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 8


{'LOC': {'f1': 0.8998395599358241,
  'number': 2124,
  'precision': 0.8767306833407771,
  'recall': 0.9241996233521658},
 'MISC': {'f1': 0.7470376094796497,
  'number': 996,
  'precision': 0.7671957671957672,
  'recall': 0.7279116465863453},
 'ORG': {'f1': 0.8660936007640878,
  'number': 2588,
  'precision': 0.8564412542500944,
  'recall': 0.8759659969088099},
 'PER': {'f1': 0.9547309833024118,
  'number': 2718,
  'precision': 0.9629491017964071,
  'recall': 0.9466519499632082},
 'overall_accuracy': 0.9744838063669636,
 'overall_f1': 0.8893614507649595,
 'overall_precision': 0.8853345877925438,
 'overall_recall': 0.8934251127462616}

In [45]:
# save model
trainer.save_model("./models")

Saving model checkpoint to ./models
Configuration saved in ./models/config.json
Model weights saved in ./models/pytorch_model.bin
tokenizer config file saved in ./models/tokenizer_config.json
Special tokens file saved in ./models/special_tokens_map.json


### 2. Inference using pipeline

In [46]:
!ls models/ 

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
pytorch_model.bin  tokenizer_config.json    training_args.bin


In [47]:
from transformers import pipeline
distilbert_ner = pipeline('ner', model="./models", aggregation_strategy='first')

loading configuration file ./models/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "vocab_size": 30522
}

loading configura

In [48]:
article = """
KUALA LUMPUR - Malaysian Prime Minister Muhyiddin Yassin's party said on Thursday (July 8) that his government would continue to function despite Umno withdrawing its backing. 
Amid uncertainty over whether Tan Sri Muhyiddin continues to command majority support without Umno, the largest party in the Perikatan Nasional (PN) ruling pact, 
his Parti Pribumi Bersatu Malaysia said Umno's decision "had no effect on the workings of government"."""


results = distilbert_ner(article)

In [49]:
print("Predicted:")
for tag in results:
    print(f"{tag['entity_group']:<5} as {tag['word']}")


"""
Predicted:
LOC   as kuala lumpur
MISC  as malaysian
PER   as muhyiddin yassin
ORG   as umno
PER   as tan
ORG   as sri
PER   as muhyiddin
ORG   as umno
ORG   as perikatan nasional
ORG   as pn
ORG   as parti pribumi bersatu malaysia
ORG   as umno
"""

Predicted:
LOC   as kuala lumpur
MISC  as malaysian
PER   as muhyiddin yassin
ORG   as umno
PER   as tan
ORG   as sri
PER   as muhyiddin
ORG   as umno
ORG   as perikatan nasional
ORG   as pn
ORG   as parti pribumi bersatu malaysia
ORG   as umno


'\nOutput\n------\n\nPredicted:\nORG   as KUALA LUMPUR\nMISC  as Malaysian\nPER   as Muhyiddin Yassin\nPER   as Umno\nPER   as Tan Sri Muhyiddin\nPER   as Umno\nORG   as Perikatan Nasional\nORG   as PN\nORG   as Parti Pribumi\nPER   as Bersatu\nORG   as Malaysia\nPER   as Umno\n\n'