# Fine-Tune a Named Entity Recognition Model

In [1]:
# Import necessary libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report
import numpy as np
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
import sys, os

# Put the parent of the current working directory on the import path so that
# project-local imports (e.g. `scripts.fine_tune`, assuming `scripts/` lives
# there) can be resolved.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from scripts.fine_tune import load_conll_data, prepare_dataset, tokenize_and_align_labels, train_and_evaluate

## Load Data

In [6]:
# Location of the labeled CoNLL file, relative to the notebook's working directory.
file_path = "../data/labeled_data.conll"

# The loader's result is unpacked into two names: `sentences` and `labels`.
conll_data = load_conll_data(file_path)
sentences, labels = conll_data

In [9]:
# Build the dataset object from the loaded sentences and labels.
# The bare `dataset` expression on the last line makes the notebook render
# its summary (feature names and row count) as the cell output.
dataset = prepare_dataset(sentences, labels)
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 5691
})

## Fine Tune
**Note:** We fine-tune three pretrained models:
* `xlm-roberta-base`
* `Davlan/bert-tiny-amharic`
* `Davlan/afroxlmr-large`

In [10]:
# Short display name -> checkpoint identifier for each model to fine-tune.
models = {
    "xlm-roberta-base": "xlm-roberta-base",
    "bert-tiny-amharic": "Davlan/bert-tiny-amharic",
    "afroxmlr": "Davlan/afroxlmr-large"
}

# Fixed seed so the train/validation split is identical on every run.
SEED = 42

# Build the label vocabulary.
# The labels are sorted because iterating a set of strings has no stable order
# across interpreter sessions (string hashing is randomized), which would give
# the same tag a different id from one run to the next.
label_list = sorted({l for sublist in labels for l in sublist})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}


# Tokenize and align labels.
# NOTE(review): the data is tokenized once with the xlm-roberta-base tokenizer,
# yet the resulting splits are later used for every checkpoint in `models`.
# Unless `train_and_evaluate` re-tokenizes internally, the other checkpoints
# would receive token ids from a vocabulary that is not their own -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(models["xlm-roberta-base"])

# Results are bound to new names instead of overwriting `dataset`, so the raw
# dataset stays available and this cell can be re-run without failing.
tokenized_dataset = dataset.map(lambda e: tokenize_and_align_labels(e, tokenizer, label_to_id), batched=False)
dataset_splits = tokenized_dataset.train_test_split(test_size=0.2, seed=SEED)

train_dataset = dataset_splits['train']
val_dataset = dataset_splits['test']

Map: 100%|██████████| 5691/5691 [00:01<00:00, 2981.90 examples/s]


In [11]:
# Every checkpoint is trained and evaluated with the same splits and the same
# id -> label mapping, so those shared arguments are grouped once up front.
shared_args = (train_dataset, val_dataset, id_to_label)

# Run the fine-tuning routine for each checkpoint in turn, announcing which
# one is being processed before it starts.
for model_name in models.values():
    print(f"Training and evaluating: {model_name}")
    train_and_evaluate(model_name, *shared_args)



Training and evaluating: xlm-roberta-base


ImportError: 
AutoModelForTokenClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
