In [1]:
import numpy as np
from datasets import load_dataset, DatasetDict, Dataset
from evaluate import load
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer

# Fine-Tuning BERT for Named Entity Recognition

This notebook covers fine-tuning a pretrained AI-Forever RuBERT model for Named Entity Recognition (NER) on the FactRuEval-2016, CoNLL-2003, Collection3 and BC5CDR datasets. 

We will use Hugging Face's implementations of BERT and Trainer to fine-tune a model to perform NER. The key steps are:

1. Prepare training data and map labels  
2. Load pretrained BERT model and tokenizer
3. Define training arguments and trainer
4. Fine-tune the model on the training data and report the F1 score on the test split

In [2]:
def get_dataset_dict(dataset):
    """Rebuild a nested dataset as a flat ``DatasetDict`` with one row per record.

    For each of the 'train', 'validation' and 'test' splits, the records are
    read from ``dataset[split]['data'][0]``. Every record is indexed by the
    keys "id", "tokens", "length" and "ner_tags", and those four fields become
    the columns of the resulting split.

    Args:
        dataset: mapping-like object indexable by split name, laid out as
            described above.

    Returns:
        DatasetDict with 'train', 'validation' and 'test' splits, each holding
        the columns "id", "tokens", "length" and "ner_tags".
    """
    split_names = ('train', 'validation', 'test')
    field_names = ("id", "tokens", "length", "ner_tags")

    # First pass: gather the column lists of every split.
    columns_by_split = {}
    for split in split_names:
        records = dataset[split]['data'][0]
        columns = {field: [] for field in field_names}
        for record in records:
            for field in field_names:
                columns[field].append(record[field])
        columns_by_split[split] = columns

    # Second pass: wrap each split's columns into a Dataset.
    return DatasetDict({
        split: Dataset.from_dict(columns_by_split[split])
        for split in split_names
    })

In [3]:
def get_dataset(dataset_name):
    """Load ``dataset_name`` through ``load_dataset`` and return it.

    The 'gusevski/factrueval2016' dataset is additionally passed through
    ``get_dataset_dict`` to flatten it; every other dataset is returned as
    loaded.

    NOTE(review): ``trust_remote_code=True`` permits the dataset repository's
    own loading code to run locally, so only use it with trusted repositories.
    """
    loaded = load_dataset(dataset_name, trust_remote_code=True)
    needs_flattening = dataset_name == 'gusevski/factrueval2016'
    return get_dataset_dict(loaded) if needs_flattening else loaded
    


In [4]:
def get_label_names(dataset):
    """Return the list of NER tag names for ``dataset``.

    The names are read from
    ``dataset['train'].features['ner_tags'].feature.names``. When that lookup
    is not possible (a missing key or attribute in the chain), a default
    PER/ORG/LOC tag list is returned instead.

    Args:
        dataset: object indexable by split name whose 'train' split exposes
            a ``features`` mapping.

    Returns:
        list[str]: tag names, where the position in the list is the class index.
    """
    try:
        return dataset['train'].features['ner_tags'].feature.names
    except (AttributeError, KeyError, TypeError):
        # Only lookup failures mean "the schema carries no label names".
        # Anything else (KeyboardInterrupt, I/O errors, ...) must propagate
        # rather than silently yield the default tag set.
        return ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']



In [5]:
def align_target(labels, word_ids, begin2inside=None):
    """Expand word-level labels to one label per sub-word token.

    Args:
        labels: sequence of integer class ids, one per word.
        word_ids: for every token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word.
        begin2inside: optional mapping from a B- class id to its I- class id.
            Defaults to ``{1: 2, 3: 4, 5: 6, 7: 8}``, which fits the tag order
            O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC (optionally followed by
            B-MISC, I-MISC). Pass an explicit mapping for tag sets where an
            I- tag does not directly follow its B- tag.

    Returns:
        list[int]: one label per entry of ``word_ids``. Tokens without a word
        get -100 (the value filtered out as "ignored" during evaluation); the
        first token of a word keeps the word's label; later tokens of the same
        word get the I- variant when the word's label is a B- tag.
    """
    if begin2inside is None:
        begin2inside = {
            1: 2,  # B-PER -> I-PER
            3: 4,  # B-ORG -> I-ORG
            5: 6,  # B-LOC -> I-LOC
            7: 8,  # B-MISC -> I-MISC (tag sets with a MISC class, e.g. CoNLL-2003)
        }

    aligned_labels = []
    previous_word = None

    for word in word_ids:
        if word is None:
            label = -100
        else:
            label = labels[word]
            if word == previous_word:
                # Continuation of the same word: a B- tag becomes its I- tag,
                # any other label is kept as is.
                label = begin2inside.get(label, label)

        aligned_labels.append(label)
        previous_word = word

    return aligned_labels

In [6]:
def tokenize_fn(batch, max_length=512):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: mapping with 'tokens' (one list of words per example) and
            'ner_tags' (one list of word-level class ids per example).
        max_length: maximum number of tokens kept per example. Without an
            explicit value the tokenizer warned that it falls back to *no*
            truncation (see the notebook's run output), so ``truncation=True``
            alone had no effect. NOTE(review): 512 presumably matches the
            position-embedding limit of the BERT-base checkpoint used here;
            adjust it for other checkpoints.

    Returns:
        The tokenizer output for the batch, extended with a "labels" entry
        holding one aligned label list per example (see ``align_target``).
    """
    tokenized_inputs = tokenizer(
        batch['tokens'],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    # word_ids(i) maps every token of example i back to its source word,
    # which is what align_target needs to spread word labels over tokens.
    tokenized_inputs["labels"] = [
        align_target(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(batch['ner_tags'])
    ]

    return tokenized_inputs

In [7]:
# Function to compute evaluation metrics from model logits and true labels
def compute_metrics(logits, labels, label_names):
    """Score predicted tag sequences against gold labels with seqeval.

    Args:
        logits: array of class scores; the arg-max over the last axis is
            taken as the predicted class id of each token.
        labels: gold class ids per sequence, with -100 marking positions
            that must be ignored.
        label_names: tag names indexed by class id.

    Returns:
        The "overall_f1" entry of the seqeval result.
    """
    seqeval = load("seqeval")

    predicted_ids = np.argmax(logits, axis=-1)

    # Gold tag strings, ignored positions dropped.
    gold_tags = [
        [label_names[gold_id] for gold_id in gold_row if gold_id != -100]
        for gold_row in labels
    ]

    # Predicted tag strings at the same (non-ignored) positions; zip stops at
    # the end of the gold row, so extra trailing prediction columns are skipped.
    predicted_tags = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        kept = [
            label_names[pred_id]
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        predicted_tags.append(kept)

    return seqeval.compute(predictions=predicted_tags, references=gold_tags)["overall_f1"]

In [8]:
# Define the checkpoint you want to use for the tokenizer.
checkpoint = "ai-forever/ruBert-base"

# Create a tokenizer instance by loading the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Create a DataCollatorForTokenClassification object
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [9]:
# Configure training arguments using the TrainingArguments class.
# The same object is reused by train_model for every dataset.

training_args = TrainingArguments(
    # Location to save the fine-tuned model.
    # NOTE: the path depends only on the checkpoint name, so every dataset
    # trained in this notebook writes to the same directory.
    output_dir = f"fine_tuned_models/{checkpoint}",

    # Evaluate on the validation split at the end of each epoch
    eval_strategy = "epoch",

    # Initial learning rate for the optimizer
    learning_rate = 2e-5, 
  
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    
    # Number of training epochs
    num_train_epochs = 3,

    # Weight decay regularization
    weight_decay = 0.01
)

In [10]:
def train_model(tokenized_dataset, label_names, id2label, label2id):
    """Fine-tune the global ``checkpoint`` on one dataset and score it on test.

    Args:
        tokenized_dataset: mapping with 'train', 'validation' and 'test'
            splits; the 'test' split must expose a 'labels' column.
        label_names: tag names indexed by class id, used to decode predictions.
        id2label: class id -> tag name mapping passed to the model.
        label2id: tag name -> class id mapping passed to the model.

    Returns:
        The value returned by ``compute_metrics`` for the test-split
        predictions (the overall F1 score).
    """
    # Start from the pretrained weights, with the label maps attached.
    ner_model = AutoModelForTokenClassification.from_pretrained(
        checkpoint, id2label=id2label, label2id=label2id
    )

    # Train on 'train', evaluate on 'validation'; arguments, tokenizer and
    # collator come from the notebook-level globals.
    trainer_kwargs = dict(
        model=ner_model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    ner_trainer = Trainer(**trainer_kwargs)
    ner_trainer.train()

    # Score the fine-tuned model on the held-out test split.
    test_split = tokenized_dataset['test']
    test_output = ner_trainer.predict(test_split)
    return compute_metrics(test_output.predictions, test_split['labels'], label_names)
    

In [11]:
# Dataset identifiers passed one by one to get_dataset (and from there to
# load_dataset). 'gusevski/factrueval2016' is the one that get_dataset
# additionally flattens.
datasets = ['gusevski/factrueval2016',
           'RCC-MSU/collection3',
           'conll2003',
           'ghadeermobasher/BC5CDR-Chemical-Disease']

In [12]:
print(checkpoint)
for dataset_name in datasets:
    dataset = get_dataset(dataset_name)

    # Tag names and the two-way mapping between class ids and names.
    label_names = get_label_names(dataset)
    id2label = dict(enumerate(label_names))
    label2id = {name: index for index, name in enumerate(label_names)}

    # Swap the raw columns for tokenizer outputs plus aligned labels.
    tokenized_dataset = dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=dataset['train'].column_names,
    )

    # Fine-tune on this dataset and report the test-split F1.
    f1 = train_model(tokenized_dataset, label_names, id2label, label2id)
    print(f'Dataset: {dataset_name}, f1: {f1}')

ai-forever/ruBert-base


Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/7746 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2582 [00:00<?, ? examples/s]

Map:   0%|          | 0/2582 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,No log,0.061136
2,0.155900,0.050188
3,0.039500,0.054331


Dataset: gusevski/factrueval2016, f1: 0.9463962332488228


Map:   0%|          | 0/9301 [00:00<?, ? examples/s]

Map:   0%|          | 0/2153 [00:00<?, ? examples/s]

Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1773,0.059841
2,0.0521,0.055693
3,0.0314,0.053228


Dataset: RCC-MSU/collection3, f1: 0.9433494483699021


Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3635,0.150066
2,0.1236,0.134317
3,0.0745,0.134992


Dataset: conll2003, f1: 0.7870956551903564


Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/4561 [00:00<?, ? examples/s]

Map:   0%|          | 0/4582 [00:00<?, ? examples/s]

Map:   0%|          | 0/4798 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.186291
2,0.242400,0.16555
3,0.242400,0.174239


Dataset: ghadeermobasher/BC5CDR-Chemical-Disease, f1: 0.7274685465315514
