In [None]:
# 📘 Fine-Tune & Compare Multilingual Models for Amharic NER

import time

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# ✅ Define models to compare
# Checkpoint identifiers handed to `from_pretrained` in the training loop,
# fine-tuned one after another in this order.
# NOTE(review): confirm each ID resolves on the Hugging Face Hub before a long
# run — the two "Davlan/..." entries in particular should be double-checked.
model_names = [
    # General-purpose multilingual encoders
    "xlm-roberta-base",
    "bert-base-multilingual-cased",
    "distilbert-base-multilingual-cased",
    # African-language / Amharic-focused checkpoints
    "Davlan/afroxlmr-base",
    "Davlan/bert-base-amharic",
]

# ✅ Load CoNLL-formatted dataset

def load_conll_data(filepath):
    """Read a two-column CoNLL file into parallel token and tag sequences.

    Each non-blank line is expected to hold ``token tag`` separated by
    whitespace; a blank (or whitespace-only) line ends the current sentence.
    Lines that do not split into exactly two fields are skipped.

    Args:
        filepath: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists, where
        ``sentences[i]`` is the token list of sentence *i* and ``labels[i]``
        the matching tag list.
    """
    sentences, labels = [], []
    current_tokens, current_tags = [], []

    def flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal current_tokens, current_tags
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_tags)
            current_tokens, current_tags = [], []

    with open(filepath, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                # Blank separator line: sentence boundary.
                flush()
            elif len(fields) == 2:
                current_tokens.append(fields[0])
                current_tags.append(fields[1])

    # The file may end without a trailing blank line.
    flush()
    return sentences, labels

sentences, tags = load_conll_data("amharic_ner_conll.txt")

# Tag vocabulary: every distinct tag in the corpus, sorted so the integer ids
# are the same on every run.
label_list = sorted({tag for tag_seq in tags for tag in tag_seq})
# Forward (tag -> id) and reverse (id -> tag) lookups over that vocabulary.
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

# ✅ Tokenize and align labels

def tokenize_and_align_labels(tokenizer, sentences, tags):
    """Tokenize pre-split sentences and project word-level tags onto sub-tokens.

    Only the first sub-token of each word receives the word's label id (looked
    up in the module-level ``label2id``). Special tokens, padding and the
    remaining sub-tokens of a word are marked ``-100``, the value that
    ``compute_metrics`` filters out.

    Args:
        tokenizer: Tokenizer that is callable on a batch of word lists and
            whose result exposes ``word_ids(batch_index=...)``.
        sentences: Batch of sentences, each a list of word strings.
        tags: Batch of tag lists, parallel to ``sentences``.

    Returns:
        A plain ``dict`` with the tokenizer outputs plus a ``"labels"`` entry
        holding one aligned label-id list per sentence.
    """
    encoding = tokenizer(sentences, is_split_into_words=True, truncation=True, padding=True)

    def align(row, word_tags):
        # Build the label-id sequence for the sentence at position `row`.
        aligned = []
        last_word = None
        for word in encoding.word_ids(batch_index=row):
            starts_new_word = word is not None and word != last_word
            aligned.append(label2id[word_tags[word]] if starts_new_word else -100)
            last_word = word
        return aligned

    encoding["labels"] = [align(row, word_tags) for row, word_tags in enumerate(tags)]
    return dict(encoding)

# ✅ Convert to Hugging Face Dataset
# One row per sentence: "tokens" holds the word list, "ner_tags" the matching
# string tags.
raw_dataset = Dataset.from_pandas(
    pd.DataFrame(data={"tokens": sentences, "ner_tags": tags})
)

# ✅ Evaluation metric
# Sequence-labelling scorer used by compute_metrics.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Score a batch of token-classification predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            ``argmax`` over axis 2 to get one class id per token; ``labels``
            holds the gold ids, with ``-100`` at positions to ignore.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    scores_per_class, gold_ids = p
    predicted_ids = np.argmax(scores_per_class, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (gold id != -100).
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2label[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# ✅ Loop through models and train/evaluate
# For each checkpoint in `model_names`: tokenize the data with that model's
# tokenizer, fine-tune for token classification, evaluate, and append the
# metrics (plus model name and wall-clock training time) to `results_summary`.
results_summary = []

# Hold out 20% of the sentences once, with a fixed seed, so every model is
# scored on the same unseen data rather than on the sentences it trained on.
if len(raw_dataset) > 1:
    dataset_splits = raw_dataset.train_test_split(test_size=0.2, seed=42)
    train_split, eval_split = dataset_splits["train"], dataset_splits["test"]
else:
    # Too few sentences to split; fall back to a single shared set.
    train_split = eval_split = raw_dataset

for model_name in model_names:
    print(f"\n\n🔄 Fine-tuning model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def encode_batch(batch):
        # Sub-word vocabularies differ per model, so labels are re-aligned
        # with the tokenizer of the model currently being trained.
        return tokenize_and_align_labels(tokenizer, batch["tokens"], batch["ner_tags"])

    tokenized_dataset = train_split.map(encode_batch, batched=True)
    tokenized_eval_dataset = eval_split.map(encode_batch, batched=True)

    if len(tokenized_dataset) == 0 or len(tokenized_eval_dataset) == 0:
        print(f"⚠️ Skipping {model_name} — dataset mapping returned 0 samples.")
        continue

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name.split('/')[-1]}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_strategy="epoch",
        report_to="none"
    )

    # `Dataset.map` pads each mapped batch to its own longest sequence, so rows
    # from different mapped batches can have different lengths. This collator
    # pads inputs AND labels (with -100) to a common length per training batch;
    # the default padding collator leaves "labels" ragged and cannot batch them.
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    metrics = trainer.evaluate()
    metrics["model"] = model_name
    metrics["train_time_sec"] = training_time
    results_summary.append(metrics)

# ✅ Results Summary Table
results_df = pd.DataFrame(results_summary)
print("\n📊 Model Comparison Summary:")
# Trainer.evaluate() reports metrics under an "eval_" prefix, so the F1 score
# is stored as "eval_f1"; plain "f1" is accepted as a fallback. When no model
# produced results the frame has neither column and is printed unsorted
# instead of raising KeyError.
f1_column = next((col for col in ("eval_f1", "f1") if col in results_df.columns), None)
if f1_column is None:
    print(results_df)
else:
    print(results_df.sort_values(f1_column, ascending=False))
