In [2]:
!pip install transformers datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=bd9727483c82856df77752a3150e5e168b026659af84d011a4f4754000d956f8
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report

In [8]:
def read_conll(filepath):
    """Parse a CoNLL-style file into parallel token and tag lists.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the last field as its tag. Blank (or whitespace-only) lines
    end the current sentence. Consecutive blank lines do not create empty
    sentences.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists, where
        ``sentences[i]`` holds the tokens of sentence ``i`` and ``labels[i]``
        the matching tags.
    """
    sentences, labels = [], []
    pending = []  # (token, tag) pairs of the sentence currently being read

    with open(filepath, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if fields:
                pending.append((fields[0], fields[-1]))
                continue
            # Blank line: close the sentence, if one is open.
            if pending:
                sentences.append([token for token, _ in pending])
                labels.append([tag for _, tag in pending])
                pending = []

    # The file may end without a trailing blank line.
    if pending:
        sentences.append([token for token, _ in pending])
        labels.append([tag for _, tag in pending])

    return sentences, labels

# Parse the annotated file (path is relative to the working directory) into
# parallel per-sentence lists of tokens and tags.
sentences, ner_tags = read_conll('conll_final.txt')

In [9]:
# Wrap the parsed sentences in a Dataset and hold out 10% of them as the
# "test" split (used for evaluation below); the fixed seed keeps the split
# the same on every run.
data = {'tokens': sentences, 'ner_tags': ner_tags}
dataset = Dataset.from_dict(data).train_test_split(test_size=0.1, seed=42)

In [10]:
# Checkpoint name shared by the tokenizer here and the model loaded later;
# the trailing comment lists alternative checkpoints that can be swapped in.
model_checkpoint = "xlm-roberta-base"  # or "Davlan/bert-tiny-amharic" or "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# Collect every distinct tag in the corpus and give each one a stable integer
# id (alphabetical order), plus the reverse lookup used to decode predictions.
unique_tags = {tag for sentence_tags in ner_tags for tag in sentence_tags}
label_list = sorted(unique_tags)
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = dict(enumerate(label_list))

In [12]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Uses the notebook-level ``tokenizer`` and ``label2id``. For each sentence,
    only the first sub-token of every word receives that word's label id;
    special tokens and continuation sub-tokens receive ``-100``.

    Args:
        examples: A batch with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of tag-string lists, parallel to the words).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one list of label ids per sentence.
    """
    encodings = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_seen = None
        for wid in encodings.word_ids(batch_index=row):
            # A real label is emitted only where a new word begins.
            starts_new_word = wid is not None and wid != last_seen
            row_labels.append(label2id[word_tags[wid]] if starts_new_word else -100)
            last_seen = wid
        aligned.append(row_labels)

    encodings["labels"] = aligned
    return encodings

# Tokenize and align labels for every split. With batched=True the function
# receives a batch per call, which is why it indexes sentences by batch_index.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [9]:
# Load the pretrained checkpoint with a token-classification head sized to the
# label set. The id/label mappings are passed along so predictions can be
# decoded to tag names. The classifier head is newly initialized (see the
# load-time warning), so the model must be fine-tuned before it is useful.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training configuration: checkpoints/logs go to ./ner-amharic, three passes
# over the training split, light weight decay, and no external experiment
# tracker. Everything else (learning rate, batch size, ...) is left at the
# library defaults.
# NOTE(review): the recorded run finished after 18 optimizer steps and the
# evaluation reported 0.0 F1 for every entity type — this budget is probably
# too small for the dataset; confirm and consider more epochs and/or data.
args = TrainingArguments(
    "ner-amharic",
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)

In [14]:
# Batch collator for token classification, built from the current tokenizer
# so that variable-length examples can be padded into a batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [17]:
def compute_metrics(p):
    """Turn raw model outputs into a seqeval classification report.

    Uses the notebook-level ``id2label`` mapping to decode ids to tag names.

    Args:
        p: A ``(predictions, labels)`` pair; ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` the
            gold label ids, where ``-100`` marks positions to ignore.

    Returns:
        ``{"classification_report": <report dict>}`` as produced by
        ``classification_report(..., output_dict=True)``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Gold tags: drop every ignored (-100) position.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[gold] for gold in gold_row if gold != -100])

    # Predicted tags: keep only positions whose gold label is not ignored,
    # so both sequences stay aligned.
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [id2label[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append(kept)

    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {"classification_report": report}

In [15]:
# Wire the model, training arguments, data splits, collator and metric
# function into a Trainer. The tokenizer is passed as `processing_class`:
# the older `tokenizer=` keyword is deprecated in the installed transformers
# release and emits a FutureWarning at this call.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # held-out 10% split
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [16]:
# Fine-tune on the training split; the returned TrainOutput (step count,
# average loss, runtime stats) is displayed as the cell result.
trainer.train()

Step,Training Loss
10,1.2646


TrainOutput(global_step=18, training_loss=1.0527410242292616, metrics={'train_runtime': 1148.5209, 'train_samples_per_second': 0.118, 'train_steps_per_second': 0.016, 'total_flos': 9960347703768.0, 'train_loss': 1.0527410242292616, 'epoch': 3.0})

In [17]:
# Evaluate on the eval_dataset given to the Trainer. The returned dict holds
# the loss and runtime stats plus whatever compute_metrics returned, with
# keys prefixed by "eval_".
results = trainer.evaluate()
print(results)

{'eval_loss': 0.698441743850708, 'eval_classification_report': {'LOC': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}, 'micro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}}, 'eval_runtime': 4.8141, 'eval_samples_per_second': 1.039, 'eval_steps_per_second': 0.208, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Persist the fine-tuned model and the tokenizer files into the same
# directory so the pair can be reloaded together from that path.
trainer.save_model("amharic-ner-model")
tokenizer.save_pretrained("amharic-ner-model")

('amharic-ner-model/tokenizer_config.json',
 'amharic-ner-model/special_tokens_map.json',
 'amharic-ner-model/sentencepiece.bpe.model',
 'amharic-ner-model/added_tokens.json',
 'amharic-ner-model/tokenizer.json')

## Task 4: Compare pretrained checkpoints for Amharic NER

In [35]:
# Checkpoints to fine-tune and evaluate in turn; each one goes through the
# same data split and training settings in the loop below.
model_names = [
    "xlm-roberta-base",
    "bert-base-multilingual-cased",
    "Davlan/afro-xlmr-base"
]

# Define compute_metrics function
def compute_metrics(p, id2label=None):
    """Turn raw model outputs into a seqeval classification report.

    This redefinition replaces the earlier one-argument ``compute_metrics``.
    ``id2label`` is therefore optional: when omitted, the notebook-level
    ``id2label`` mapping is used, so code written against the one-argument
    form (e.g. ``Trainer(..., compute_metrics=compute_metrics)``) keeps
    working after this cell has run.

    Args:
        p: A ``(predictions, labels)`` pair; ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` the
            gold label ids, where ``-100`` marks positions to ignore.
        id2label: Mapping from label id to tag name. Defaults to the
            notebook-level ``id2label``.

    Returns:
        ``{"classification_report": <report dict>}`` as produced by
        ``classification_report(..., output_dict=True)``.
    """
    if id2label is None:
        # The parameter shadows the global of the same name, so look the
        # notebook-level mapping up explicitly.
        id2label = globals()["id2label"]

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Drop ignored (-100) positions from the gold tags ...
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    # ... and the same positions from the predictions, keeping them aligned.
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {
        "classification_report": classification_report(true_labels, true_predictions, output_dict=True)
    }

# Fine-tune every candidate checkpoint on the same split with the same
# settings and collect its evaluation metrics, keyed by checkpoint name.
results = {}
for model_checkpoint in model_names:
    print(f"Processing model: {model_checkpoint}")
    # NOTE: `tokenizer` is rebound at notebook scope on purpose —
    # tokenize_and_align_labels reads that global, so the map() call below
    # re-tokenizes the data with the current checkpoint's tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Tokenize and align labels for the current model's tokenizer
    tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

    # Fresh model per checkpoint; the label set is shared across all of them.
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    args = TrainingArguments(
        f"ner-amharic-{model_checkpoint.split('/')[-1]}", # Unique output directory
        num_train_epochs=3,
        weight_decay=0.01,
        report_to="none"
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    # `processing_class` replaces the deprecated `tokenizer=` keyword, which
    # emits a FutureWarning on every Trainer construction.
    # Pass id2label to compute_metrics
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        processing_class=tokenizer,
        compute_metrics=lambda p: compute_metrics(p, id2label)
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[model_checkpoint] = eval_result

# Summarize: one raw metrics dict per checkpoint, in the order they were run.
print("\nEvaluation Results for all models:")
for model_name, result in results.items():
    print(f"\nModel: {model_name}")
    print(result)

Processing model: xlm-roberta-base


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))


Processing model: bert-base-multilingual-cased


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Processing model: Davlan/afro-xlmr-base


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



Evaluation Results for all models:

Model: xlm-roberta-base
{'eval_loss': 0.6671525835990906, 'eval_classification_report': {'LOC': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}, 'micro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}}, 'eval_runtime': 1.8377, 'eval_samples_per_second': 2.721, 'eval_steps_per_second': 0.544, 'epoch': 3.0}

Model: bert-base-multilingual-cased
{'eval_loss': 0.6452264189720154, 'eval_classification_report': {'LOC': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1-sc

  _warn_prf(average, modifier, msg_start, len(result))
