# Installing Required Dependencies

In [None]:
! pip install seqeval

In [79]:
import datasets
from datasets import load_dataset, load_metric

from transformers import (
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)

from seqeval.metrics import classification_report
import numpy as np


# Dataset Preprocessing

In [2]:
# Download / load the CoNLL-2003 corpus
dataset = load_dataset('conll2003', trust_remote_code=True)

# Pull the three predefined splits out of the dataset dictionary
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [51]:
# Tag names of the 'ner_tags' column; the integer ids stored in the data index into this list
ner_feature = dataset["train"].features["ner_tags"].feature
ner_tags = ner_feature.names
print("NER Tags: ", ner_tags)

NER Tags:  ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [23]:
# Inspect the first training example: raw tokens, tag ids, and the decoded tag names
original_sample = dataset['train'][0]
# Translate every integer tag id into its string name
mapped_ner_tags = list(map(ner_tags.__getitem__, original_sample['ner_tags']))

print("Sample Tokens", original_sample['tokens'])
print("Sample Tags Indecies", original_sample['ner_tags'])
print("Sample NER Tags", mapped_ner_tags)

Sample Tokens ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Sample Tags Indecies [3, 0, 7, 0, 0, 0, 7, 0, 0]
Sample NER Tags ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


## NER Tokenization

In [61]:
def tokenize_and_align_labels(examples, tokenizer, label_column='ner_tags'):
    """Tokenize pre-split words and align word-level labels with the produced tokens.

    Args:
        examples: Batch mapping with a 'tokens' entry (one list of words per example)
            and a ``label_column`` entry (one list of per-word label ids per example).
        tokenizer: Callable invoked as
            ``tokenizer(words, truncation=True, is_split_into_words=True)``; its result
            must offer ``word_ids(batch_index=i)`` and support item assignment.
        label_column: Key in ``examples`` that holds the word-level label ids.
            Defaults to 'ner_tags'.

    Returns:
        The tokenizer output with an extra 'labels' entry holding one list of label ids
        per example. Only the first token of each word carries that word's label; special
        tokens and the remaining sub-word tokens are set to -100.
    """
    # Tokenize the input tokens
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    labels = []

    # Loop through each example and its corresponding word-level labels
    for i, word_labels in enumerate(examples[label_column]):
        # word_ids maps every produced token back to the index of its source word
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token (word_idx is None) or continuation of the previous word:
                # mark with -100 so the position is skipped downstream
                label_ids.append(-100)
            else:
                # First token of a new word carries the word's label
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Evaluation Metrics

In [81]:
# Load the seqeval metric object; compute_metrics below calls metric.compute(...) on it.
# NOTE(review): load_metric appears to be deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval", trust_remote_code=True)

def compute_metrics(p):
    """Compute overall seqeval scores from a (predictions, labels) pair.

    The predictions are reduced with an arg-max over their last axis, label ids are
    decoded through the module-level ``id2label`` map, and every position whose gold
    label is -100 is left out of both sequences before scoring.

    Returns:
        Dict with the keys 'precision', 'recall', 'f1' and 'accuracy', read from the
        'overall_*' entries of ``metric.compute``.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=-1)

    def to_tag(label_id):
        # id2label is keyed by the stringified label id
        return id2label[str(label_id)]

    true_labels = [
        [to_tag(gold_id) for gold_id in gold_row if gold_id != -100]
        for gold_row in gold_ids
    ]
    true_predictions = [
        [to_tag(pred_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [76]:
# Label maps passed to the model config and used for easier visualization during evaluation.
# id2label keeps string keys because the evaluation code looks tags up via id2label[str(i)].
id2label = {str(i): label for i, label in enumerate(ner_tags)}
# label2id maps each tag name to its integer class index (not to the stringified key).
label2id = {label: i for i, label in enumerate(ner_tags)}

# BERT

## BERT Tokenization

In [62]:
# Tokenizer matching the DistilBERT checkpoint
bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

# Tokenize each split in batches; the tokenizer is handed to the alignment function via fn_kwargs
train_dataset_bert = train_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs={'tokenizer': bert_tokenizer}
)
val_dataset_bert = val_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs={'tokenizer': bert_tokenizer}
)
test_dataset_bert = test_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs={'tokenizer': bert_tokenizer}
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [63]:
# Display the tokenized training split (its columns and row count)
train_dataset_bert

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14041
})

In [64]:
# Show the sub-word tokens of the first training example next to their aligned label ids
print(
    bert_tokenizer.convert_ids_to_tokens(train_dataset_bert[0]['input_ids']),
    train_dataset_bert[0]['labels'],
    sep='\n',
)

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
[-100, 3, 0, 7, 0, 0, 0, 7, 0, -100, 0, -100]


## BERT Model & Training Arguments

In [77]:
# Load DistilBERT with a token-classification head sized to the number of NER tags
bert_model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=len(ner_tags),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
# Training configuration for the DistilBERT run.
# Early stopping is not configured here; it is attached as a callback when the Trainer is built.
bert_training_args = TrainingArguments(
    output_dir='./bert',
    # schedule and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # regularisation
    weight_decay=0.01,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint when training ends
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [65]:
# Batch collator built from the BERT tokenizer; it is passed to the Trainer as data_collator
bert_data_collator = DataCollatorForTokenClassification(tokenizer=bert_tokenizer)

In [85]:
# Build the Trainer for DistilBERT: validation split for per-epoch evaluation,
# seqeval scores via compute_metrics, and early stopping after 2 non-improving evaluations
bert_early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
bert_trainer = Trainer(
    model=bert_model,
    args=bert_training_args,
    data_collator=bert_data_collator,
    train_dataset=train_dataset_bert,
    eval_dataset=val_dataset_bert,
    compute_metrics=compute_metrics,
    callbacks=[bert_early_stopping],
)

## BERT Training

In [86]:
# Fine-tune DistilBERT; the value returned by train() is shown as the cell output
bert_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.053233,0.902183,0.911141,0.90664,0.984619
2,0.110800,0.043346,0.93945,0.937395,0.938421,0.989233
3,0.026800,0.045173,0.934746,0.942612,0.938663,0.989136
4,0.010700,0.045328,0.933788,0.944631,0.939178,0.989525


TrainOutput(global_step=1756, training_loss=0.04326418232266072, metrics={'train_runtime': 191.9531, 'train_samples_per_second': 365.74, 'train_steps_per_second': 11.435, 'total_flos': 776580879783240.0, 'train_loss': 0.04326418232266072, 'epoch': 4.0})

In [87]:
# Score the fine-tuned DistilBERT model on the held-out test split
results = bert_trainer.evaluate(eval_dataset=test_dataset_bert)
print(f"Test set results: {results}")

Test set results: {'eval_loss': 0.10282840579748154, 'eval_precision': 0.8957816377171216, 'eval_recall': 0.8948300283286119, 'eval_f1': 0.8953055801594332, 'eval_accuracy': 0.9797781845590611, 'eval_runtime': 4.954, 'eval_samples_per_second': 697.014, 'eval_steps_per_second': 21.801, 'epoch': 4.0}


## BERT Evaluation

In [92]:
# Predict on the test split and reduce the model outputs to class ids (arg-max over the last axis)
predictions, labels, _ = bert_trainer.predict(test_dataset_bert)
predictions = np.argmax(predictions, axis=-1)

# Decode ids into tag names, leaving out every position whose gold label is -100
true_labels = [
    [id2label[str(gold_id)] for gold_id in gold_row if gold_id != -100]
    for gold_row in labels
]
true_predictions = [
    [id2label[str(pred_id)] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

# Per-entity precision / recall / F1 table
print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

         LOC       0.93      0.92      0.92      1668
        MISC       0.78      0.78      0.78       702
         ORG       0.86      0.88      0.87      1661
         PER       0.96      0.94      0.95      1617

   micro avg       0.90      0.89      0.90      5648
   macro avg       0.88      0.88      0.88      5648
weighted avg       0.90      0.89      0.90      5648



# RoBERTa

## RoBERTa Tokenization

In [94]:
# Tokenizer matching the DistilRoBERTa checkpoint (add_prefix_space=True is set for pre-split words)
roberta_tokenizer = AutoTokenizer.from_pretrained('distilroberta-base', add_prefix_space=True)

# Tokenize each split in batches; the tokenizer is handed to the alignment function via fn_kwargs
train_dataset_roberta = train_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs={'tokenizer': roberta_tokenizer}
)
val_dataset_roberta = val_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs={'tokenizer': roberta_tokenizer}
)
test_dataset_roberta = test_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs={'tokenizer': roberta_tokenizer}
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

## RoBERTa Model & Training Arguments

In [97]:
# Load DistilRoBERTa with a token-classification head sized to the number of NER tags
roberta_model = AutoModelForTokenClassification.from_pretrained(
    'distilroberta-base',
    num_labels=len(ner_tags),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:  25%|##5       | 83.9M/331M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
# Batch collator built from the RoBERTa tokenizer; it is passed to the Trainer as data_collator
roberta_data_collator = DataCollatorForTokenClassification(tokenizer=roberta_tokenizer)

In [100]:
# Training configuration for the DistilRoBERTa run (same settings as the BERT run, separate output dir).
# Early stopping is not configured here; it is attached as a callback when the Trainer is built.
roberta_training_args = TrainingArguments(
    output_dir='./roberta',
    # schedule and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # regularisation
    weight_decay=0.01,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint when training ends
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [102]:
# Build the Trainer for DistilRoBERTa: validation split for per-epoch evaluation,
# seqeval scores via compute_metrics, and early stopping after 2 non-improving evaluations
roberta_early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
roberta_trainer = Trainer(
    model=roberta_model,
    args=roberta_training_args,
    data_collator=roberta_data_collator,
    train_dataset=train_dataset_roberta,
    eval_dataset=val_dataset_roberta,
    compute_metrics=compute_metrics,
    callbacks=[roberta_early_stopping],
)

## RoBERTa Training

In [103]:
# Fine-tune DistilRoBERTa; the value returned by train() is shown as the cell output
roberta_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.039124,0.933467,0.942107,0.937767,0.989778
2,0.104900,0.034562,0.946005,0.952373,0.949178,0.991394
3,0.026800,0.035713,0.946059,0.953383,0.949707,0.991433
4,0.013900,0.035476,0.948028,0.954729,0.951367,0.991725


TrainOutput(global_step=1756, training_loss=0.04291601740417828, metrics={'train_runtime': 197.4521, 'train_samples_per_second': 355.555, 'train_steps_per_second': 11.117, 'total_flos': 755343235622430.0, 'train_loss': 0.04291601740417828, 'epoch': 4.0})

## RoBERTa Evaluation

In [104]:
# Score the fine-tuned DistilRoBERTa model on the held-out test split
results = roberta_trainer.evaluate(eval_dataset=test_dataset_roberta)
print(f"RoBERTa Test set results: {results}")

RoBERTa Test set results: {'eval_loss': 0.1058579832315445, 'eval_precision': 0.8956491592997053, 'eval_recall': 0.9148371104815864, 'eval_f1': 0.9051414557239205, 'eval_accuracy': 0.9810487778615269, 'eval_runtime': 4.8622, 'eval_samples_per_second': 710.177, 'eval_steps_per_second': 22.212, 'epoch': 4.0}


In [105]:
# Predict on the test split and reduce the model outputs to class ids (arg-max over the last axis)
predictions, labels, _ = roberta_trainer.predict(test_dataset_roberta)
predictions = np.argmax(predictions, axis=-1)

# Decode ids into tag names, leaving out every position whose gold label is -100
true_labels = [
    [id2label[str(gold_id)] for gold_id in gold_row if gold_id != -100]
    for gold_row in labels
]
true_predictions = [
    [id2label[str(pred_id)] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

# Per-entity precision / recall / F1 table
print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

         LOC       0.91      0.94      0.92      1668
        MISC       0.78      0.80      0.79       702
         ORG       0.88      0.90      0.89      1661
         PER       0.95      0.95      0.95      1617

   micro avg       0.90      0.91      0.91      5648
   macro avg       0.88      0.90      0.89      5648
weighted avg       0.90      0.91      0.91      5648

