# Fine-Tuning an NER Model

## Importing necessary libraries


In [None]:
import numpy as np
import torch
from datasets import load_metric
from datasets import Dataset, Features, Value, ClassLabel, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

## Loading the dataset


In [None]:
# Schema of a single example: two parallel lists, one of token strings and one
# of NER tags. The order of the tag names is significant because it is the
# order in which the ClassLabel enumerates them.
token_column = [Value(dtype='string')]
tag_column = [
    ClassLabel(
        names=['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']
    )
]
features = Features({"tokens": token_column, "ner_tags": tag_column})

def parse_conll_file(file_path, label_names=None):
    """Parse a whitespace-separated CoNLL file into a list of sentence examples.

    Every non-blank line contributes one token (first column) and one NER tag
    (last column). One or more blank lines end the current sentence.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file to read.
        label_names: Iterable of accepted tag names. When omitted, the names of
            the ``ClassLabel`` declared in the module-level ``features`` are
            used.

    Returns:
        A list of ``{"tokens": [...], "ner_tags": [...]}`` dicts, one per
        sentence. Both lists always have the same length and the tags are
        strings taken from ``label_names``.
    """
    if label_names is None:
        label_names = features["ner_tags"][0].names
    # A set gives O(1) membership tests instead of scanning a list per token.
    valid_tags = frozenset(label_names)

    data = []
    tokens = []
    ner_tags = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: close the sentence in progress, if any.
                if tokens:
                    data.append({"tokens": tokens, "ner_tags": ner_tags})
                    tokens = []
                    ner_tags = []
                continue

            tokens.append(parts[0])
            # A line with a single column has no tag at all; without this guard
            # the token itself would be read as its own tag.
            tag = parts[-1] if len(parts) > 1 else 'O'
            # Unknown tags are folded into 'O' so the later ClassLabel cast
            # cannot fail on an unexpected name.
            ner_tags.append(tag if tag in valid_tags else 'O')

    # Keep the last sentence when the file does not end with a blank line.
    # Tokens and tags are appended in pairs, so their lengths always match and
    # no separate consistency check is needed.
    if tokens:
        data.append({"tokens": tokens, "ner_tags": ner_tags})
    return data

# Read the annotated CoNLL file into a list of sentence examples.
all_data = parse_conll_file("/content/telegram_data_conll.txt")

# Convert the list of examples into a column-oriented Dataset
full_dataset = Dataset.from_dict({
    "tokens": [example["tokens"] for example in all_data],
    "ner_tags": [example["ner_tags"] for example in all_data]
})

# Cast the columns to the defined features.
full_dataset = full_dataset.cast(features)

# Hold out 30% of the examples for validation. The seed is fixed so that the
# split (and therefore every reported score) is reproducible across runs.
raw_datasets = full_dataset.train_test_split(test_size=0.3, seed=42)

# train_test_split names the held-out part "test"; expose it as "validation".
# The "train" split already has the right name and is left as is.
raw_datasets["validation"] = raw_datasets.pop("test")


# Display the dataset structure to verify
print(raw_datasets)

Casting the dataset:   0%|          | 0/23592 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 16514
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 7078
    })
})


## Preparing the data


In [None]:
# Tokenizer of the "Davlan/afro-xlmr-mini" checkpoint, the same checkpoint the
# token-classification model is loaded from later in the notebook.
checkpoint_name = "Davlan/afro-xlmr-mini"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build per-subtoken labels.

    ``examples`` is a batch with parallel ``"tokens"`` and ``"ner_tags"``
    columns. Only the first sub-token of every word receives that word's tag;
    special tokens, padding and the remaining sub-tokens of a word are set to
    -100 so that the loss function ignores them.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
    )

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word_id in encoded.word_ids(batch_index=batch_idx):
            # None marks a special/padding token; a repeated id marks a
            # continuation sub-token of the word that was just labelled.
            if word_id is None or word_id == last_word:
                row.append(-100)
            else:
                row.append(word_tags[word_id])
            last_word = word_id
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Tokenize every split in batches and drop the raw text/tag columns in the same
# pass, leaving only the columns produced by tokenize_and_align_labels.
raw_columns = ["tokens", "ner_tags"]
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_columns,
)

print(tokenized_datasets)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/16514 [00:00<?, ? examples/s]

Map:   0%|          | 0/7078 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16514
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7078
    })
})


## Loading the pre-trained model

In [None]:
# Recover the tag names from the dataset schema; their order defines the ids.
ner_feature = raw_datasets["train"].features["ner_tags"][0]
label_names = ner_feature.names
num_labels = len(label_names)

# Store the id <-> tag-name mappings in the model config so that the saved
# checkpoint reports real tag names (e.g. "B-LOC") at inference time rather
# than auto-generated placeholder labels.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load the pre-trained encoder with a fresh token-classification head
model = AutoModelForTokenClassification.from_pretrained(
    "Davlan/afro-xlmr-mini",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

print(model)

config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/472M [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 384, padding_idx=1)
      (position_embeddings): Embedding(514, 384, padding_idx=1)
      (token_type_embeddings): Embedding(1, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=384, out_features=384

## Training arguments

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for checkpoints and predictions
    eval_strategy="epoch",  # Evaluate every epoch
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,  # Reduced batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Strength of weight decay
    report_to="none",  # Disable reporting to external services
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
)

## Fine-tuning the model

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the validation set for evaluation
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggered the warning shown in this cell's recorded output.
    processing_class=tokenizer,
)

# Start training
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.2103,0.155968
2,0.075,0.062818
3,0.048,0.043796


TrainOutput(global_step=6195, training_loss=0.23747226847478514, metrics={'train_runtime': 1244.1176, 'train_samples_per_second': 39.821, 'train_steps_per_second': 4.979, 'total_flos': 3241259550111744.0, 'train_loss': 0.23747226847478514, 'epoch': 3.0})

## Evaluating the model

In [None]:
# Load the seqeval metric; compute_metrics below calls `metric.compute` on it.
# NOTE(review): the recorded output of this cell shows a warning raised at this
# call. `load_metric` appears to be deprecated in `datasets` in favour of the
# separate `evaluate` package (`evaluate.load("seqeval")`) — confirm against
# the installed version before upgrading `datasets`.
metric = load_metric("seqeval")

# Function to compute metrics
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    ``p`` unpacks into ``(predictions, labels)``. The predicted class is the
    argmax over axis 2 of ``predictions``. Positions whose label is -100 are
    dropped, the remaining ids are mapped to tag names through the
    module-level ``label_names``, and the module-level ``metric`` scores the
    resulting tag sequences.

    Returns a dict with the keys ``precision``, ``recall``, ``f1`` and
    ``accuracy``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [
            (pred, gold)
            for pred, gold in zip(pred_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_names[pred] for pred, _ in kept])
        true_labels.append([label_names[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# Attach compute_metrics to the already-trained Trainer. It was not set when
# the Trainer was created, which is why the per-epoch table recorded during
# training lists only the training and validation loss.
trainer.compute_metrics = compute_metrics

# Evaluate the model on the validation set (the eval_dataset given to the
# Trainer); the returned dict holds the loss plus the compute_metrics scores.
evaluation_results = trainer.evaluate()

print(evaluation_results)

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

{'eval_loss': 0.04379603639245033, 'eval_precision': 0.9935806855343373, 'eval_recall': 0.99368098035653, 'eval_f1': 0.9936308304145511, 'eval_accuracy': 0.9986433363244028, 'eval_runtime': 37.6334, 'eval_samples_per_second': 188.078, 'eval_steps_per_second': 23.516, 'epoch': 3.0}


## Saving the trained model

In [8]:
# Define the directory to save the model
save_directory = "./fine_tuned_amharic_ner_model"

# Save the fine-tuned model through the Trainer.
# NOTE(review): the tokenizer is expected to be written alongside the model
# because it was handed to the Trainer — confirm the directory contains the
# tokenizer files before loading the model from it elsewhere.
trainer.save_model(save_directory)

print(f"Model saved to {save_directory}")

Model saved to ./fine_tuned_amharic_ner_model
