# Fine-tune BERT-based models from Hugging Face on CoNLL-2002 Spanish NER data

In this notebook, you will fine-tune and evaluate multiple BERT-based models on CoNLL-2002 Spanish NER data.

Code for loading and preprocessing the data is provided. You will provide code for training and evaluation using Hugging Face Trainer or PyTorch.

Please copy this notebook and name it `{pitt email id}_hw4_bert_ner.ipynb`.

**Note**: Please run on GPU by going to Runtime > Change Runtime Type > T4 GPU

This notebook is based on:
* https://github.com/laxmimerit/NLP-Tutorials-with-HuggingFace/blob/main/NLP_with_HuggingFace_Tutorial_2_NER_Training.ipynb  
* https://skimai.com/how-to-fine-tune-bert-for-named-entity-recognition-ner/

# Set up environment, preprocess data

In [1]:
# Download and install needed Hugging Face packages

!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
# Load dataset, which contains splits for training, validation (dev), and test

import pandas as pd
from datasets import load_dataset

# 'conll2002' is the dataset; 'es' selects its Spanish configuration
data = load_dataset(path='conll2002', name='es')
data

Downloading builder script:   0%|          | 0.00/9.23k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/713k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/138k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1916 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1518 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 8324
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 1916
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 1518
    })
})

In [3]:
# Examine the tagset. Note the BIO framework with 4 possible types

# Feature object for the per-token NER tags; `.names` lists the tag strings in id order
tags = data['train'].features['ner_tags'].feature

# Two lookup tables over the same tag list: integer id -> tag string, and back
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}
index2tag

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [4]:
# Put human-readable NER tags in data

def create_tag_names(batch):
  """Build the `ner_tags_str` column: the string form of each id in `ner_tags`.

  Despite the parameter name, `data.map` below is called without `batched=True`,
  so this function is applied to one example at a time.

  Returns a dict with the single key `ner_tags_str`, which `map` adds as a new column.
  """
  readable_tags = list(map(tags.int2str, batch['ner_tags']))
  return {'ner_tags_str': readable_tags}

data = data.map(create_tag_names)

Map:   0%|          | 0/8324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1518 [00:00<?, ? examples/s]

In [5]:
# Take a look at the data: first three training sentences, with tags as ids and as strings
(
    pd.DataFrame(data['train'])
    .loc[:, ['tokens', 'ner_tags', 'ner_tags_str']]
    .head(3)
)

Unnamed: 0,tokens,ner_tags,ner_tags_str
0,"[Melbourne, (, Australia, ), ,, 25, may, (, EF...","[5, 0, 5, 0, 0, 0, 0, 0, 3, 0, 0]","[B-LOC, O, B-LOC, O, O, O, O, O, B-ORG, O, O]"
1,[-],[0],[O]
2,"[El, Abogado, General, del, Estado, ,, Daryl, ...","[0, 1, 2, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, ...","[O, B-PER, I-PER, I-PER, I-PER, O, B-PER, I-PE..."


## Metrics
Load NER-specific evaluation metrics

In [6]:
!pip install seqeval
!pip install evaluate

import evaluate
import numpy as np

# Load the seqeval metric through the `evaluate` library
metric = evaluate.load('seqeval')

# Ordered tag names: position i is the tag string for integer label i
ner_feature = data['train'].features['ner_tags']
label_names = ner_feature.feature.names

# Tag strings of the first training sentence. `compute_metrics` does not read this;
# it binds its own local `labels`.
labels = [label_names[tag_id] for tag_id in data['train'][0]['ner_tags']]

def compute_metrics(eval_preds):
  """Score token-classification predictions with the seqeval `metric`.

  Args:
    eval_preds: pair `(logits, labels)`. The predicted class for each token is the
      argmax over the last axis of `logits`; `labels` holds the gold label ids.

  Returns:
    Dict with keys "precision", "recall", "f1" and "accuracy", taken from the
    corresponding `overall_*` entries of the metric result.

  Positions whose gold label is -100 are dropped from both the references and the
  predictions before scoring.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # Gold tag strings per sentence, ignoring -100 positions
  true_labels = []
  for gold_row in labels:
    true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

  # Predicted tag strings at exactly the positions kept above
  true_predictions = []
  for pred_row, gold_row in zip(predictions, labels):
    kept = [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
    true_predictions.append(kept)

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  reported = (
      ("precision", 'overall_precision'),
      ("recall", 'overall_recall'),
      ("f1", 'overall_f1'),
      ("accuracy", 'overall_accuracy'),
  )
  return {out_key: all_metrics[source_key] for out_key, source_key in reported}

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=6852ecab1aac807ae06621f715b847f8ec6e4f06810f6dc4b1b25dc5a745ca48
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m e

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

# Fine-tune models
This section is where you choose models and fill in parts of the code to do fine-tuning.

You need to fine-tune at least 2 pretrained models from the Hugging Face platform on the preprocessed CoNLL-2002 Spanish data:
* One BERT-based model pretrained with a regular masked language modeling (MLM) objective on a Spanish corpus. Examples: `PlanTL-GOB-ES/roberta-base-bne`, `chriskhanhtran/spanberta`
* One model pretrained to perform NER on another language, such as English. Models pretrained on the CoNLL-2003 dataset often work. Examples: `elastic/distilbert-base-cased-finetuned-conll03-english`, `dbmdz/bert-base-cased-finetuned-conll03-english`

Make sure that whichever pretrained model you choose is cased, since capitalization carries valuable information for NER.

In [16]:
# FILL IN which model you are fine-tuning and assign the name of it to the `pretrained_model` variable.
# The tokenizer and model cells below both load from this name.
# Options used for this assignment:
#   - BERT-based MLM Spanish model: chriskhanhtran/spanberta
#   - English NER model:            elastic/distilbert-base-cased-finetuned-conll03-english
pretrained_model = 'elastic/distilbert-base-cased-finetuned-conll03-english'

In [17]:
# Tokenize the data with the pretrained model's tokenizer
from transformers import AutoTokenizer

# Tokenizer matching the chosen checkpoint. A fast tokenizer is requested because the
# label alignment below relies on `word_ids()`.
# NOTE(review): `add_prefix_space=True` is presumably for RoBERTa-style tokenizers fed
# pre-split words; confirm it is ignored harmlessly by the other tokenizers.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model,
    use_fast=True,
    add_prefix_space=True,
)

def align_labels_with_tokens(labels, word_ids):
  """Expand word-level label ids to one label per token.

  Args:
    labels: label id of each word in the sentence.
    word_ids: for each token, the index of the word it came from, or None.

  Returns:
    A list as long as `word_ids`:
      * -100 for tokens whose word id is None,
      * the word's own label for the first token of a word,
      * for later tokens of the same word, the word's label with odd ids bumped
        by one (in this tagset that turns a B-XXX id into the matching I-XXX id).
  """
  aligned = []
  previous_word = None
  for word_id in word_ids:
    if word_id is None:
      # Token that does not belong to any word
      aligned.append(-100)
    elif word_id != previous_word:
      # First token of a new word keeps the word's label
      aligned.append(labels[word_id])
    else:
      # Continuation token of the current word: B-XXX (odd id) becomes I-XXX
      tag = labels[word_id]
      aligned.append(tag + 1 if tag % 2 == 1 else tag)
    previous_word = word_id

  return aligned

def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Args:
    examples: batch with `tokens` (one list of words per sentence) and
      `ner_tags` (one list of word-level label ids per sentence).

  Returns:
    The tokenizer output for the batch, with an added `labels` entry holding,
    for each sentence, the result of `align_labels_with_tokens`.
  """
  tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

  # word_ids(row) maps each token of sentence `row` back to its source word
  tokenized_inputs['labels'] = [
      align_labels_with_tokens(sentence_tags, tokenized_inputs.word_ids(row))
      for row, sentence_tags in enumerate(examples['ner_tags'])
  ]

  return tokenized_inputs
# Tokenize every split in batches. The original columns are dropped so only the
# tokenizer outputs and the aligned `labels` remain.
tokenized_datasets = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data['train'].column_names,
)
tokenized_datasets

Downloading (…)okenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/954 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/8324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1518 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8324
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1916
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1518
    })
})

In [18]:
# Build a data collator to handle batching

from transformers import DataCollatorForTokenClassification

# Collator built from the same tokenizer used above; it is handed to the Trainer as `data_collator`.
# NOTE(review): per the Hugging Face docs it pads inputs and labels to a common length
# within each batch. Confirm the label padding value is -100 so that padded positions
# are skipped by `compute_metrics`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Train (fine-tune) the model

In [21]:
# Label maps passed to the model below: integer id -> tag string, and back
id2label = dict(enumerate(label_names))
label2id = {label: i for i, label in id2label.items()}
print(id2label)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [22]:
from transformers import AutoModelForTokenClassification

# Load the chosen checkpoint with a token-classification head and attach this
# dataset's label maps to it.
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model,
    id2label=id2label,
    label2id=label2id,
)

## FILL IN code to train
Provide code to train (fine-tune) the pretrained model.
You can use the Hugging Face Trainer class or any other package you want, such as PyTorch.

See the [Hugging Face Trainer user guide](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt) or use any other examples/resources you find online.

In [23]:
# Training code here
from transformers import Trainer
from transformers import TrainingArguments

# One checkpoint directory per fine-tuned model. The directory names match the
# checkpoint paths read by the "Test the model on an example" cell, so that cell
# works after a fresh run. Models not listed here fall back to "test-trainer".
# NOTE(review): the example cell uses absolute /content/... paths, so this assumes
# the notebook's working directory is /content (the Colab default); confirm.
CHECKPOINT_DIRS = {
    'chriskhanhtran/spanberta': 'test-trainer-bert-mlm-spanish',
    'elastic/distilbert-base-cased-finetuned-conll03-english': 'test-trainer-ner-english',
}
output_dir = CHECKPOINT_DIRS.get(pretrained_model, 'test-trainer')

# Evaluate on the validation split at the end of every epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; check this against the installed version.
training_args = TrainingArguments(output_dir=output_dir, evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # adds precision/recall/F1/accuracy to each evaluation
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1031,0.137604,0.690581,0.756434,0.722009,0.960615
2,0.0607,0.132891,0.713867,0.783088,0.746877,0.964555
3,0.0301,0.14302,0.743174,0.800551,0.770796,0.96803


TrainOutput(global_step=3123, training_loss=0.07492402048016297, metrics={'train_runtime': 373.5381, 'train_samples_per_second': 66.853, 'train_steps_per_second': 8.361, 'total_flos': 771434741985264.0, 'train_loss': 0.07492402048016297, 'epoch': 3.0})

# Evaluate the fine-tuned model

## FILL IN code to evaluate performance of the model on the test set
Provide code to evaluate the pretrained model on the `test` portion of the dataset (`tokenized_datasets['test']`)

You'll need the F1 score for your report.
This is calculated automatically if you passed the `compute_metrics` function to the `Trainer` class.

In [None]:
# Testing code here
# Score the fine-tuned model on the held-out test split. `compute_metrics` was passed
# to the Trainer, so the returned dict includes precision, recall, F1 and accuracy.
# The "test" prefix keeps these keys apart from the validation metrics reported
# during training.
test_metrics = trainer.evaluate(
    eval_dataset=tokenized_datasets['test'],
    metric_key_prefix='test',
)
test_metrics

Hooray, you're done evaluating a model!

Feel free to restart the runtime and evaluate another one, or test that model on an example in the section below (which you'll need to do for at least one model).

# Test the model on an example
Code is provided here to test your fine-tuned classifier on an example sentence.

You will need to fill in the path to a checkpoint of your fine-tuned model if it has been saved somewhere. Or feel free to run your model some other way on the example sentence.

You will need the output of running at least one of your models on the example sentence for your report.

In [26]:
# Test performance on an example

from transformers import pipeline

# Saved checkpoints of the two fine-tuned models. These directories must already
# exist: point them at the `output_dir` your training run wrote its checkpoints to.
bert_mlm_spanish_checkpoint = '/content/test-trainer-bert-mlm-spanish/checkpoint-3000' # BERT-based MLM spanish model
ner_english_checkpoint_path = '/content/test-trainer-ner-english/checkpoint-3000' # NER english model
# Choose which of the two checkpoints to run on the example sentence
checkpoint = ner_english_checkpoint_path

# Token-classification pipeline loaded from the checkpoint directory. With
# aggregation_strategy="simple" the results are reported per entity span
# (see `entity_group` in the output) rather than per token.
token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="simple"
)

# Example sentence required for the report
test_sentence = "Mi nombre is Miguel Salgado. Trabajo en la Universidad de Pittsburgh y vivo en Pittsburgh."
token_classifier(test_sentence)

[{'entity_group': 'PER',
  'score': 0.99591243,
  'word': 'Miguel Salgado. Trabajo',
  'start': 13,
  'end': 36},
 {'entity_group': 'ORG',
  'score': 0.8968439,
  'word': 'Universidad de Pittsburgh',
  'start': 43,
  'end': 68},
 {'entity_group': 'LOC',
  'score': 0.9904759,
  'word': 'Pittsburgh',
  'start': 79,
  'end': 89}]