In [1]:
!pip install transformers datasets seqeval accelerate


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-non

Upload and Parse the CoNLL File

In [2]:
from datasets import Dataset
import pandas as pd

def read_conll(file_path):
    sentences = []
    tokens, labels = [], []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == "":
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": labels})
                    tokens, labels = [], []
            else:
                splits = line.split('\t')
                if len(splits) == 2:
                    token, label = splits
                    tokens.append(token)
                    labels.append(label)

    return sentences

data = read_conll("ner_amharic_sample.conll")
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2)
dataset


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 40
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 10
    })
})

Encode Labels and Define Tag Mapping



In [3]:
label_list = sorted(set(tag for row in data for tag in row["ner_tags"]))
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {v: k for k, v in label2id.items()}

print(label2id)


{'O': 0}


 Load Model and Tokenizer (Choose One)

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "Davlan/afro-xlmr-base"  # or "Davlan/bert-tiny-amharic" or "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize Dataset with Aligned Labels

In [6]:
from datasets import ClassLabel

def align_labels_with_tokens(tokens, labels, tokenizer):
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()

    aligned_labels = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None:
            aligned_labels.append(-100)
        elif word_id != previous_word_id:
            aligned_labels.append(label2id[labels[word_id]])
        else:
            aligned_labels.append(label2id[labels[word_id]] if labels[word_id].startswith("I-") else -100)
        previous_word_id = word_id

    return tokenized_inputs, aligned_labels

def tokenize_and_align_labels(example):
    tokens = example["tokens"]
    labels = example["ner_tags"]
    tokenized_inputs, aligned_labels = align_labels_with_tokens(tokens, labels, tokenizer)
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Set Up Training Arguments

In [8]:
from transformers import TrainingArguments

args = TrainingArguments(
    "ner-amharic-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_strategy="epoch"
)

Train the Model

In [11]:
from transformers import Trainer, DataCollatorForTokenClassification
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels, true_preds = [], []
    for pred, label in zip(predictions, labels):
        true_label = [id2label[l] for (l, p) in zip(label, pred) if l != -100]
        true_pred = [id2label[p] for (l, p) in zip(label, pred) if l != -100]
        true_labels.append(true_label)
        true_preds.append(true_pred)

    return seqeval.compute(predictions=true_preds, references=true_labels)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhana2729tm[0m ([33mhana2729tm-hana[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


TrainOutput(global_step=25, training_loss=0.0, metrics={'train_runtime': 1113.6303, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.022, 'total_flos': 3535639773792.0, 'train_loss': 0.0, 'epoch': 5.0})

In [10]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4


 Save the Model

In [12]:
trainer.save_model("ner_amharic_model")
tokenizer.save_pretrained("ner_amharic_model")


('ner_amharic_model/tokenizer_config.json',
 'ner_amharic_model/special_tokens_map.json',
 'ner_amharic_model/sentencepiece.bpe.model',
 'ner_amharic_model/added_tokens.json',
 'ner_amharic_model/tokenizer.json')

Evaluate on Test Set



In [13]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.0, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 1.0, 'eval_runtime': 2.3896, 'eval_samples_per_second': 4.185, 'eval_steps_per_second': 0.837, 'epoch': 5.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
