In [1]:
import numpy as np
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline

# Fine-Tuning BERT for Named Entity Recognition

This notebook fine-tunes a pretrained BERT model (the `google-bert/bert-base-multilingual-cased` checkpoint) for Named Entity Recognition (NER) on the CoNLL-2003 dataset.

We will use Hugging Face's implementations of BERT and Trainer to fine-tune a model to perform NER. The key steps are:

1. Prepare training data and map labels  
2. Load pretrained BERT model and tokenizer
3. Define training arguments and trainer
4. Fine-tune model on training data 
5. Evaluate on the validation split after each epoch, then on the held-out test split

In [2]:
# Candidate NER corpora: each key looks like a Hugging Face Hub dataset id and
# maps to a list of its configuration names (TODO confirm against the Hub).
# NOTE(review): no cell visible in this notebook reads this mapping.
datasets = {
    "universalner/universal_ner": [
        'ceb_gja', 'zh_gsd', 'zh_gsdsimp', 'zh_pud', 'hr_set', 'da_ddt',
        'en_ewt', 'en_pud', 'de_pud', 'pt_bosque', 'pt_pud', 'ru_pud',
        'sr_set', 'sk_snk', 'sv_pud', 'sv_talbanken', 'tl_trg', 'tl_ugnayan',
    ],
    "DFKI-SLT/cross_ner": [
        'ai', 'conll2003', 'literature', 'music', 'politics', 'science',
    ],
}

In [3]:
# Load CoNLL-2003 with `load_dataset` from the Hugging Face `datasets` library.
dataset = load_dataset('conll2003')

# Last expression of the cell: displays the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
# Class names of the NER tag set, read from the training split's feature schema.
# A name's position in this list is used below as its integer tag id.
label_names = dataset['train'].features['ner_tags'].feature.names

# Lookup from integer tag id to tag name
id2label = dict(enumerate(label_names))

# Inverse lookup from tag name to integer tag id
label2id = {name: idx for idx, name in id2label.items()}

# Show the tag names as the cell output
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
# Checkpoint name used for the tokenizer here and for the model further down.
checkpoint = "google-bert/bert-base-multilingual-cased"

# Load the tokenizer that belongs to the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Collator built around the same tokenizer; it is handed to the Trainer below.
# (presumably pads inputs and labels per batch; see the transformers docs)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [6]:
def align_target(labels, word_ids, begin2inside=None):
    """Expand word-level NER tag ids to one tag id per sub-word token.

    Args:
        labels: Sequence of integer tag ids, one per original word.
        word_ids: For each token, the index of the word it was produced from,
            or ``None`` for tokens that belong to no word.
        begin2inside: Optional mapping from each ``B-`` tag id to the matching
            ``I-`` tag id. When omitted, the CoNLL-2003 layout is used:
            ``['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC',
            'B-MISC', 'I-MISC']``.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` where the
        word id is ``None``, the word's own tag for the first token of a word,
        and for every further token of the same word that tag with ``B-``
        switched to ``I-``.
    """
    if begin2inside is None:
        # Ids follow the CoNLL-2003 label order listed in the docstring.
        begin2inside = {
            1: 2,  # B-PER  -> I-PER
            3: 4,  # B-ORG  -> I-ORG
            5: 6,  # B-LOC  -> I-LOC
            7: 8,  # B-MISC -> I-MISC
        }

    align_labels = []
    last_word = None  # word id of the previous token

    for word in word_ids:
        if word is None:
            # No source word: mark the position with the ignore value.
            label = -100
        else:
            label = labels[word]
            if word == last_word:
                # Continuation piece of the same word: an entity may only
                # "begin" once, so B- becomes I-; other tags stay as they are.
                label = begin2inside.get(label, label)

        align_labels.append(label)
        last_word = word

    return align_labels

In [7]:
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: Mapping with a 'tokens' entry (one word list per example) and an
            'ner_tags' entry (one word-level tag-id list per example).

    Returns:
        The tokenizer output for the batch, extended with a "labels" entry that
        holds, per example, the result of ``align_target`` for its tokens.
    """
    # The inputs are already split into words; truncation is enabled.
    encoding = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True)

    # word_ids(i) gives, for every token of example i, the index of its source
    # word, which is what align_target needs to spread the word tags over tokens.
    encoding["labels"] = [
        align_target(word_tags, encoding.word_ids(example_idx))
        for example_idx, word_tags in enumerate(batch['ner_tags'])
    ]

    return encoding

In [8]:
# Run tokenize_fn over every split in batches and drop the original columns,
# leaving only what tokenize_fn returns (tokenizer outputs plus "labels").
tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [9]:
# Load the seqeval metric via `evaluate.load`; compute_metrics below uses it
# to score the predicted tag sequences against the reference tag sequences.
metric = load("seqeval")

In [10]:
def compute_metrics(logits_and_labels):
    """Turn model logits and reference tag ids into overall seqeval scores.

    Args:
        logits_and_labels: Pair ``(logits, labels)``. The arg-max over the last
            axis of ``logits`` is taken as the predicted tag id; ``labels``
            holds the reference tag ids, with ``-100`` at positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the ``overall_*`` entries of the module-level ``metric`` result.
    """
    logits, labels = logits_and_labels

    # Most likely tag id per token
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, leaving out every position marked -100
    str_labels = []
    for label_row in labels:
        str_labels.append([label_names[t] for t in label_row if t != -100])

    # Predicted tag names at exactly the positions kept in the references
    str_preds = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [label_names[p] for (p, t) in zip(pred_row, label_row) if t != -100]
        str_preds.append(kept)

    results = metric.compute(predictions=str_preds, references=str_labels)

    # Report only the corpus-level numbers
    summary = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    return summary

In [11]:
# Load the pretrained checkpoint with a token-classification head.
# The recorded output below reports that the classifier weights are newly
# initialized, so the model must be fine-tuned before it is used for prediction.

# Initialize model object with pretrained weights
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    # "fine_tuned_model",  (alternative first argument, presumably to reload a saved run; TODO confirm)

    # Label mappings stored with the model so predictions can be shown as tag names
    id2label=id2label,  
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Configure training arguments using the TrainingArguments class

training_args = TrainingArguments(
    # Directory where the Trainer writes its outputs
    output_dir = "fine_tuned_model",

    # Run evaluation at the end of every epoch
    eval_strategy = "epoch",

    # Initial learning rate for the optimizer
    learning_rate = 2e-5, 
  
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    
    # Number of training epochs
    num_train_epochs = 3,

    # Weight decay coefficient
    weight_decay = 0.01
)

In [13]:
# Initialize the Trainer object that runs fine-tuning and evaluation

trainer = Trainer(
    # Model to train
    model=model, 
  
    # Training arguments
    args=training_args,

    # Training and validation splits of the tokenized dataset
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],

    # Tokenizer
    # NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
    # in favor of `processing_class=`; confirm against the installed version.
    tokenizer=tokenizer,

    # Custom metric function (seqeval precision / recall / F1 / accuracy)
    compute_metrics=compute_metrics,

    # Data collator
    data_collator=data_collator 
)

In [14]:
# Sanity check of the aligned labels on the test split (-100 marks tokens without a word id).
# NOTE(review): this prints the label list of every test example; slicing first,
# e.g. tokenized_dataset['test'][:3]['labels'], would keep the output short.
tokenized_dataset['test']['labels']

[[-100,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -100],
 [-100, 1, 2, 2, 2, 2, -100],
 [-100, 5, 6, 6, 6, 0, 5, 6, 6, 0, 0, 0, 0, 0, -100],
 [-100,
  5,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -100],
 [-100,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  -100],
 [-100,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  1,
  2,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -100],
 [-100,
  1,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [15]:
# Fine-tune the model; with eval_strategy="epoch" the validation metrics are
# reported after every epoch (see the table below).
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1822,0.06508,0.918932,0.938573,0.928649,0.983568
2,0.047,0.059015,0.943459,0.949175,0.946309,0.986697
3,0.0242,0.055893,0.947035,0.956917,0.95195,0.988466


TrainOutput(global_step=2634, training_loss=0.0687735889024959, metrics={'train_runtime': 4662.1724, 'train_samples_per_second': 9.035, 'train_steps_per_second': 0.565, 'total_flos': 1070850137309478.0, 'train_loss': 0.0687735889024959, 'epoch': 3.0})

In [16]:
# Optional: uncomment to save the fine-tuned model to 'fine_tuned_model'.
# trainer.save_model('fine_tuned_model')

In [17]:
# Run the fine-tuned model over the held-out test split.
predictions = trainer.predict(tokenized_dataset['test'])
# Score the predicted logits against the aligned test labels with the same
# compute_metrics function that was used for validation during training.
compute_metrics((predictions.predictions, tokenized_dataset['test']['labels']))

{'precision': 0.8816689466484268,
 'recall': 0.9128895184135978,
 'f1': 0.8970076548364648,
 'accuracy': 0.9721286854794088}