## 1. Load CoNLL‑formatted Dataset

Load from your Drive-mounted `conll.txt`, split into train/val/test (80/10/10).


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.model_selection import train_test_split
import os

def read_conll(path):
    sentences, labels = [], []
    with open(path, 'r', encoding='utf-8') as f:
        tokens, tags = [], []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
            else:
                tok, lbl = line.split()
                tokens.append(tok)
                tags.append(lbl)
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

# Make sure to check the file path and ensure the file exists in your Google Drive.
sents, labs = read_conll('/content/drive/MyDrive/amharic_labeled.conll.txt')
train_s, test_s, train_l, test_l = train_test_split(sents, labs, test_size=0.2, random_state=42)
val_s, test_s, val_l, test_l = train_test_split(test_s, test_l, test_size=0.5, random_state=42)
print(f"Train: {len(train_s)}, Val: {len(val_s)}, Test: {len(test_s)}")

Train: 1887, Val: 236, Test: 236


In [None]:
!wget -O '/content/drive/MyDrive/train_conll.txt' 'https://drive.google.com/uc?export=download&id=1cghzOPC_q5rvfpWfTGUTdTPuFmCxJx_k'

--2025-07-23 08:37:21--  https://drive.google.com/uc?export=download&id=1cghzOPC_q5rvfpWfTGUTdTPuFmCxJx_k
Resolving drive.google.com (drive.google.com)... 142.250.141.101, 142.250.141.100, 142.250.141.102, ...
Connecting to drive.google.com (drive.google.com)|142.250.141.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1cghzOPC_q5rvfpWfTGUTdTPuFmCxJx_k&export=download [following]
--2025-07-23 08:37:21--  https://drive.usercontent.google.com/download?id=1cghzOPC_q5rvfpWfTGUTdTPuFmCxJx_k&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.137.132, 2607:f8b0:4023:c03::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.137.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20208845 (19M) [application/octet-stream]
Saving to: ‘/content/drive/MyDrive/train_conll.txt’


2025-07-23 08:37:26 (47.8 MB/s) - ‘

## 2. Convert to 🤗 Dataset format


In [None]:
from datasets import Dataset, ClassLabel, Sequence

def build_dataset(sentences, labels):
    unique_labels = list({l for sub in labels for l in sub})
    unique_labels.sort()
    label2id = {l:i for i,l in enumerate(unique_labels)}
    features = {
        'tokens': sentences,
        'ner_tags': [[label2id[t] for t in seq] for seq in labels]
    }
    ds = Dataset.from_dict(features)
    ds = ds.cast_column('ner_tags', Sequence(feature=ClassLabel(names=unique_labels)))
    return ds, unique_labels

train_ds, label_list = build_dataset(train_s, train_l)
val_ds, _ = build_dataset(val_s, val_l)
test_ds, _ = build_dataset(test_s, test_l)
num_labels = len(label_list)
id2label = {i: l for i, l in enumerate(label_list)}

Casting the dataset:   0%|          | 0/1887 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/236 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/236 [00:00<?, ? examples/s]

## 3. Define tokenization and alignment function


In [None]:
from transformers import AutoTokenizer

def tokenize_and_align(examples, tokenizer):
    tokenized = tokenizer(examples['tokens'], is_split_into_words=True, truncation=True, padding='max_length')
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized.word_ids(batch_index=i)
        previous_word = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != previous_word:
                label_ids.append(label[word_id])
            else:
                l = label[word_id]
                label_ids.append(l if label_list[l].startswith('I-') else l)
            previous_word = word_id
        labels.append(label_ids)
    tokenized["labels"] = labels
    return tokenized


## 4. Function to train a model with early-stopping and compute metrics


In [None]:
pip install seqeval




In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    true = p.label_ids
    pred_labels, true_labels = [], []
    for pr, tr in zip(preds, true):
        pred_labels.append([label_list[p] for (p, l) in zip(pr, tr) if l != -100])
        true_labels.append([label_list[l] for (p, l) in zip(pr, tr) if l != -100])
    return {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels)
    }

def train_model(model_name, output_path):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_tok = train_ds.map(lambda e: tokenize_and_align(e, tokenizer), batched=True)
    val_tok = val_ds.map(lambda e: tokenize_and_align(e, tokenizer), batched=True)

    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, ignore_mismatched_sizes=True)
    args = TrainingArguments(
        output_dir=output_path,
        eval_strategy="epoch", # Changed from evaluation_strategy to eval_strategy
        save_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=10,
        learning_rate=3e-5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorForTokenClassification(tokenizer)
    )
    trainer.train()
    evaluation_results = trainer.evaluate(test_ds.map(lambda e: tokenize_and_align(e, tokenizer), batched=True))
    print(evaluation_results)
    return evaluation_results

Now, let's train the model using the specified models: XLM-Roberta, bert-tiny-amharic, and afroxmlr. We will train each model and evaluate its performance to determine the best one.

In [None]:
print("Training with XLM-Roberta...")
train_model("xlm-roberta-base", "/content/drive/MyDrive/xlm_roberta_output")

Training with XLM-Roberta...


Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/236 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.00767,0.998006,0.986297,0.996703,0.991473
2,No log,0.005731,0.9979,0.990473,0.994065,0.992266
3,0.056000,0.002042,0.999466,0.997036,0.998022,0.997528
4,0.056000,0.001977,0.999502,0.997695,0.999011,0.998353
5,0.001600,0.000982,0.999608,0.997694,0.998681,0.998188


Map:   0%|          | 0/236 [00:00<?, ? examples/s]

{'eval_loss': 0.0008886168361641467, 'eval_accuracy': 0.9996397104582228, 'eval_precision': 0.9980551053484603, 'eval_recall': 0.9990266060999351, 'eval_f1': 0.998540619425977, 'eval_runtime': 6.8392, 'eval_samples_per_second': 34.507, 'eval_steps_per_second': 4.386, 'epoch': 5.0}


{'eval_loss': 0.0008886168361641467,
 'eval_accuracy': 0.9996397104582228,
 'eval_precision': 0.9980551053484603,
 'eval_recall': 0.9990266060999351,
 'eval_f1': 0.998540619425977,
 'eval_runtime': 6.8392,
 'eval_samples_per_second': 34.507,
 'eval_steps_per_second': 4.386,
 'epoch': 5.0}

In [None]:
print("Training with bert-tiny-amharic...")
# Note: This model might require checking the exact model name on Hugging Face Hub
train_model("rasyosef/bert-tiny-amharic", "/content/drive/MyDrive/bert_tiny_amharic_output")

Training with bert-tiny-amharic...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/236 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.562254,0.887461,0.967122,0.704155,0.81495
2,No log,0.277956,0.943816,0.876176,0.799006,0.835814
3,0.697200,0.190071,0.966187,0.906717,0.862692,0.884157
4,0.697200,0.155973,0.972716,0.919234,0.889341,0.90404
5,0.221400,0.145428,0.974125,0.921751,0.893857,0.90759


Map:   0%|          | 0/236 [00:00<?, ? examples/s]

{'eval_loss': 0.1387680321931839, 'eval_accuracy': 0.9760205074956194, 'eval_precision': 0.9309099119440054, 'eval_recall': 0.9039684279763209, 'eval_f1': 0.9172413793103448, 'eval_runtime': 0.7209, 'eval_samples_per_second': 327.352, 'eval_steps_per_second': 41.613, 'epoch': 5.0}


{'eval_loss': 0.1387680321931839,
 'eval_accuracy': 0.9760205074956194,
 'eval_precision': 0.9309099119440054,
 'eval_recall': 0.9039684279763209,
 'eval_f1': 0.9172413793103448,
 'eval_runtime': 0.7209,
 'eval_samples_per_second': 327.352,
 'eval_steps_per_second': 41.613,
 'epoch': 5.0}

In [None]:
print("Training with afroxmlr...")
# Note: This model might require checking the exact model name on Hugging Face Hub
train_model("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0", "/content/drive/MyDrive/afroxmlr_output")

Training with afroxmlr...


Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/236 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([8, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.007251,0.997188,0.986872,0.991428,0.989145
2,No log,0.006261,0.997971,0.988216,0.995384,0.991787
3,0.033400,0.001214,0.999573,0.998022,0.998351,0.998187
4,0.033400,0.000715,0.999786,0.998682,0.999011,0.998846
5,0.001400,0.001158,0.999644,0.998022,0.998351,0.998187
6,0.001400,0.001889,0.999608,0.997694,0.998351,0.998022
7,0.000300,0.001085,0.999751,0.998682,0.999011,0.998846
8,0.000300,0.001365,0.999751,0.998682,0.999011,0.998846
9,0.000100,0.001507,0.99968,0.998352,0.998681,0.998517
10,0.000100,0.001573,0.999751,0.998682,0.999011,0.998846


RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 3336436224 vs 3336436116

In [2]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Replace "path/to/your/saved/model" with the actual path where you saved your best model
# For example, if you trained XLM-Roberta and it performed best, use "/content/drive/MyDrive/xlm_roberta_output"
model_path = "/content/drive/MyDrive/xlm_roberta_output/checkpoint-1180"

# Print the model path to confirm
print(f"Attempting to load model from: {model_path}")

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create a NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text to test (Amharic e-commerce text)
text = "ይህን ምርት በ1500 ብር መግዛት እችላለሁ።" # "I can buy this product for 1500 Birr."

# Get predictions
predictions = ner_pipeline(text)

# Print the predictions
print(predictions)

Attempting to load model from: /content/drive/MyDrive/xlm_roberta_output/checkpoint-1180


Device set to use cpu


[]


In [41]:
!ls "/content/drive/MyDrive/xlm_roberta_output"


checkpoint-1180  checkpoint-472  checkpoint-944
checkpoint-236	 checkpoint-708  runs
