# Define Training + Evaluation Function

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer,
    DataCollatorForTokenClassification
)
import numpy as np
!pip install evaluate

from evaluate import load
import time

_SEQEVAL_METRIC = None  # seqeval metric object, loaded lazily on first use


def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    Relies on a module-level ``id_to_label`` mapping (label id -> tag string)
    being defined before this function is called.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension on axis 2; ``labels`` holds the
            reference label ids, with ``-100`` marking positions to ignore.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"`` taken from seqeval's ``overall_*`` results.
    """
    global _SEQEVAL_METRIC
    if _SEQEVAL_METRIC is None:
        # Loading the metric is invariant across evaluations, so do it once
        # instead of on every call.
        _SEQEVAL_METRIC = load("seqeval")

    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        # Drop every position whose reference label is the ignore index.
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([id_to_label[pred_id] for pred_id, _ in kept])
        true_labels.append([id_to_label[label_id] for _, label_id in kept])

    results = _SEQEVAL_METRIC.compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

def _align_labels_with_tokens(word_ids, word_labels, label_all_tokens=True):
    """Expand word-level label ids to one label id per tokenizer token.

    Args:
        word_ids: For each token, the index of the word it came from, or
            ``None`` for tokens that belong to no word.
        word_labels: Label id of each word in the example.
        label_all_tokens: When true, every token of a word gets the word's
            label. When false, only the first token of each word is labelled
            and the remaining ones get ``-100``.

    Returns:
        A list of label ids, one per entry of ``word_ids``.
    """
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Tokens without a source word are excluded via the ignore index.
            aligned.append(-100)
        elif label_all_tokens or word_idx != previous_word_idx:
            aligned.append(word_labels[word_idx])
        else:
            aligned.append(-100)
        previous_word_idx = word_idx
    return aligned


def _build_compute_metrics(id_to_label):
    """Return a ``compute_metrics`` callback that decodes ids with ``id_to_label``."""
    # Load the metric once here rather than on every evaluation call.
    seqeval = load("seqeval")

    def compute_metrics(eval_pred):
        scores, label_ids = eval_pred
        predicted_ids = np.argmax(scores, axis=2)

        true_predictions = []
        true_labels = []
        for pred_row, label_row in zip(predicted_ids, label_ids):
            # Keep only positions whose reference label is not the ignore index.
            kept = [
                (pred_id, label_id)
                for pred_id, label_id in zip(pred_row, label_row)
                if label_id != -100
            ]
            true_predictions.append([id_to_label[pred_id] for pred_id, _ in kept])
            true_labels.append([id_to_label[label_id] for _, label_id in kept])

        results = seqeval.compute(
            predictions=true_predictions, references=true_labels
        )
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    return compute_metrics


def train_and_evaluate(model_checkpoint, dataset, label_list, id_to_label,
                       *, label_all_tokens=True):
    """Fine-tune one checkpoint for token classification and evaluate it.

    Args:
        model_checkpoint: Name or path passed to ``from_pretrained`` for both
            the tokenizer and the model.
        dataset: Mapping-like dataset with ``"train"`` and ``"validation"``
            splits whose examples have ``"tokens"`` and ``"ner_tags"`` fields.
        label_list: Tag names; the position of a name is its label id.
        id_to_label: Mapping from label id to tag name, used to decode
            predictions and references for the metric.
        label_all_tokens: Whether every sub-word token of a word carries the
            word's label (the default, matching the previous behaviour) or
            only the first one does.

    Returns:
        A ``(model_checkpoint, eval_results)`` tuple. ``eval_results`` is the
        dict returned by ``trainer.evaluate()`` with an extra
        ``"training_time_sec"`` entry holding the duration of
        ``trainer.train()`` in seconds.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    label_to_id = {label: i for i, label in enumerate(label_list)}
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_list),
        # Store the tag names in the model config so predictions are readable.
        id2label=dict(enumerate(label_list)),
        label2id=label_to_id,
        # Checkpoints that already carry a token-classification head with a
        # different number of labels would otherwise fail to load; with this
        # flag the mismatched head is re-initialised instead.
        ignore_mismatched_sizes=True,
    )

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        tokenized_inputs["labels"] = [
            _align_labels_with_tokens(
                tokenized_inputs.word_ids(batch_index=i),
                word_labels,
                label_all_tokens,
            )
            for i, word_labels in enumerate(examples["ner_tags"])
        ]
        return tokenized_inputs

    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` (and Trainer's `tokenizer` to `processing_class`);
    # confirm against the installed version before upgrading.
    args = TrainingArguments(
        output_dir=f"./results/{model_checkpoint.replace('/', '_')}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        save_total_limit=1,
        save_strategy="no"
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=_build_compute_metrics(id_to_label),
    )

    print(f"Training {model_checkpoint}...")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    trainer.train()
    elapsed = time.perf_counter() - start

    eval_results = trainer.evaluate()
    eval_results["training_time_sec"] = elapsed
    return model_checkpoint, eval_results




# Run Comparisons

In [10]:
from datasets import load_dataset

# Load the "am" configuration of the wikiann dataset and derive the tag
# vocabulary from the feature metadata of its training split.
dataset = load_dataset("wikiann", "am")
label_list = dataset["train"].features["ner_tags"].feature.names
id_to_label = dict(enumerate(label_list))

# Checkpoints to fine-tune and compare on the same data.
models = [
    "xlm-roberta-base",
    "Davlan/distilmBERT-base-multilingual-cased-ner-hrl",
    "bert-base-multilingual-cased",
]

# Collect one (checkpoint name, metrics dict) pair per model, in list order.
results = []
for model_ckpt in models:
    model_name, metrics = train_and_evaluate(
        model_checkpoint=model_ckpt,
        dataset=dataset,
        label_list=label_list,
        id_to_label=id_to_label,
    )
    results.append((model_name, metrics))


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

KeyError: 5

# Compare & Select

In [None]:
import pandas as pd


def _metric(metrics, key):
    """Return an evaluation score stored under ``eval_<key>`` or bare ``<key>``.

    ``Trainer.evaluate()`` prefixes metric names with ``eval_``, so the
    prefixed name is tried first; the bare name is kept as a fallback for
    metric dicts that were built without the prefix.
    """
    prefixed = f"eval_{key}"
    return metrics[prefixed] if prefixed in metrics else metrics[key]


# One summary row per trained model; scores are rounded for display only.
df = pd.DataFrame([
    {
        "Model": name,
        "F1": round(_metric(metrics, "f1"), 3),
        "Precision": round(_metric(metrics, "precision"), 3),
        "Recall": round(_metric(metrics, "recall"), 3),
        "Accuracy": round(_metric(metrics, "accuracy"), 3),
        "Training Time (s)": round(metrics["training_time_sec"], 2)
    }
    for name, metrics in results
])

# Best F1 first; as the last expression of the cell this is what gets shown.
df.sort_values("F1", ascending=False)
