# Fine-Tune a Named Entity Recognition Model

* Fine-tune a Named Entity Recognition (NER) model to extract key entities (e.g., products, prices, and locations) from Amharic Telegram messages.

In [1]:
# Import necessary libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report
import numpy as np
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import sys

# Make the parent directory importable so `scripts.*` modules can be loaded
# from this notebook. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from scripts.XLM_Roberta_fine_tune import load_conll_data, prepare_dataset, tokenize_and_align_labels, train_and_evaluate

## Load Data

In [4]:
# Location of the labeled data file, relative to this notebook's directory.
file_path = "../data/labeled_telegram_data.conll"
# `load_conll_data` comes from scripts.XLM_Roberta_fine_tune and returns two
# values, unpacked here as `sentences` and `labels`.
# NOTE(review): presumably parallel lists (tokens per sentence / tags per
# sentence) — confirm against the helper's implementation.
sentences, labels = load_conll_data(file_path)

In [5]:
# Build the dataset object from the loaded sentences and labels using the
# project helper `prepare_dataset`.
dataset = prepare_dataset(sentences, labels)
# Bare last expression so the notebook displays the dataset summary
# (its features and row count).
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 6169
})

## Fine Tune
**Note:** The plan is to fine-tune three models; this notebook covers the following one:
* xlm-roberta-base 


In [7]:
model_name = "xlm-roberta-base"
SEED = 42  # fixed so the train/validation split is identical on every run

# Build the label vocabulary.
# Sorting makes the label -> id mapping deterministic. Iterating a bare set of
# strings can produce a different order after each kernel restart (string hash
# randomization), which would silently change which class index stands for
# which entity tag between runs.
# NOTE(review): the evaluation report from this run lists both LOC and LOCATION
# as entity types — this looks like an annotation inconsistency in the labeled
# data; confirm and normalize upstream if so.
label_list = sorted({label for sentence_labels in labels for label in sentence_labels})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# Tokenize and align labels.
# Results are bound to new names instead of rebinding `dataset`, so this cell
# can be re-run safely: the original rebinding left `dataset` holding the
# split result, and calling train_test_split on that a second time fails.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_dataset = dataset.map(
    lambda example: tokenize_and_align_labels(example, tokenizer, label_to_id),
    batched=False,
)
dataset_splits = tokenized_dataset.train_test_split(test_size=0.2, seed=SEED)

# 80% of the examples are used for training, 20% are held out for validation.
train_dataset = dataset_splits['train']
val_dataset = dataset_splits['test']

Map: 100%|██████████| 6169/6169 [00:02<00:00, 2716.95 examples/s]


## Model Training and evaluation

In [10]:
# Build the log line from `model_name` rather than a hard-coded string, so the
# message cannot go stale when a different checkpoint is selected above.
print(f"Training and evaluating: {model_name}")
# `train_and_evaluate` is the project helper from scripts.XLM_Roberta_fine_tune;
# the returned trainer is kept so the fine-tuned model can be saved afterwards.
trainer = train_and_evaluate(model_name, train_dataset, val_dataset, id_to_label)

Training and evaluating: xlm-roberta-base


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.009675
2,0.049200,0.0046
3,0.049200,0.003023


Evaluation for xlm-roberta-base
              precision    recall  f1-score   support

         LOC       0.99      1.00      0.99       337
    LOCATION       0.98      1.00      0.99       105
       PRICE       1.00      1.00      1.00      2335
     PRODUCT       0.99      0.99      0.99      1187

   micro avg       1.00      1.00      1.00      3964
   macro avg       0.99      1.00      0.99      3964
weighted avg       1.00      1.00      1.00      3964



In [12]:
# Save the fine-tuned model in a directory named after the checkpoint
# (for "xlm-roberta-base" this is the same path as before).
output_dir = f"../data/{model_name}-model"
trainer.save_model(output_dir)
# Save the tokenizer alongside the weights so the directory can be loaded on
# its own for inference.
# NOTE(review): if `train_and_evaluate` already hands the tokenizer to the
# Trainer, save_model writes it too and this call is redundant but harmless.
tokenizer.save_pretrained(output_dir);  # trailing ';' hides the returned file list