<a href="https://colab.research.google.com/github/Kalze1/Amharic_Named_Entity_Recognition/blob/task-4/notebook/Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset path and the model save
# paths used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# IPython shell magic (notebook-only syntax): install the libraries imported below.
!pip install transformers datasets seqeval

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import torch

# Function to read CoNLL formatted data
def read_conll(file_path):
    """Read a CoNLL-style file into parallel token and tag sequences.

    Each non-blank line holds whitespace-separated columns. The first column
    is taken as the token and the last column as its tag, so both the
    two-column "token tag" layout and wider layouts with extra columns in
    between are accepted. One or more blank lines end the current sentence.

    Args:
        file_path: Path of the file to read. It is decoded as UTF-8; a
            leading byte-order mark, if present, is discarded so it does not
            become part of the first token.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is a list of tag lists; ``sentences[i]`` and
        ``labels[i]`` always have the same length.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The
            message names the file and the 1-based line number.
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    # "utf-8-sig" reads plain UTF-8 unchanged and strips a BOM when there is one.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank line: close the sentence in progress. Consecutive
                # blank lines do not create empty sentences.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue

            columns = line.split()
            if len(columns) < 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'token ... tag', got {line!r}"
                )
            sentence.append(columns[0])
            label.append(columns[-1])

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Load your labeled data (replace with your actual file path)
sentences, labels = read_conll("/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/labeled_cleaned_tokenized_dataset.conll")

# Create a DataFrame with one row per sentence: a list of tokens and the
# parallel list of string tags.
data = {'tokens': sentences, 'ner_tags': labels}
df = pd.DataFrame(data)

# Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Build the label vocabulary from the tags that actually occur in the data.
# Sorting makes the tag -> id assignment deterministic across runs.
all_labels = [label for sublist in labels for label in sublist]
unique_labels = sorted(set(all_labels))
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
num_labels = len(unique_labels)

print("Unique labels in the dataset:", unique_labels)
print("Label to ID Mapping:", label_to_id)

# Load pre-trained model and tokenizer
model_name = "bert-base-multilingual-cased"  # Use mBERT (you can adjust this to any model)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pass the tag mappings so they are stored in the model config; a checkpoint
# saved from this model then reports real tag names instead of LABEL_0..LABEL_n.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

# Tokenize the dataset and align labels
def tokenize_and_align_labels(examples, tok=None, label_map=None):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Designed for ``Dataset.map(..., batched=True)``. Only the first sub-token
    of each word receives the word's label id; special tokens and the
    remaining sub-tokens of a word receive -100.

    Sequences are truncated but not padded here. Padding is left to the data
    collator so each batch is padded only to its own longest sequence rather
    than every example being padded to the model maximum.

    Args:
        examples: Batch with "tokens" (list of word lists) and "ner_tags"
            (list of string-tag lists, parallel to "tokens").
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
            Pass a model's own tokenizer (for example through
            ``dataset.map(..., fn_kwargs={"tok": ...})``) when preparing data
            for a model with a different vocabulary.
        label_map: Mapping from string tag to integer id. Defaults to the
            notebook-level ``label_to_id``.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example, the same length as that example's token
        sequence.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if tok is None:
        tok = tokenizer
    if label_map is None:
        label_map = label_to_id

    tokenized_inputs = tok(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation sub-token of the same word.
                label_ids.append(-100)
            else:
                # First sub-token of a new word carries the word's label.
                label_ids.append(label_map[word_labels[word_idx]])
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# Apply the tokenizer and label alignment.
# The mapping function is called with only the batch, so it uses the
# module-level `tokenizer` (loaded from `model_name`) and `label_to_id`.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split the dataset into train and validation sets.
# The fixed seed keeps the split reproducible; the 'test' part of the split
# is used as the validation set.
train_test_split_ratio = 0.8
split_dataset = tokenized_dataset.train_test_split(test_size=1 - train_test_split_ratio, seed=42)
train_dataset = split_dataset['train']
validation_dataset = split_dataset['test']

# Define the data collator (handles padding for token classification).
# It is built from the module-level `tokenizer` and is the collator passed to
# every Trainer created in this notebook.
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

# Set up training arguments (also reused by every Trainer in this notebook,
# including the evaluation-only one).
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# # Define the Trainer with the updated datasets and data collator
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=validation_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator
# )

# # Train the model
# trainer.train()

# # Save the fine-tuned model and tokenizer for mBERT
# save_path_bert = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-bert"
# model.save_pretrained(save_path_bert)
# tokenizer.save_pretrained(save_path_bert)

# Example of loading and fine-tuning DistilBERT or XLM-Roberta similarly

# Load DistilBERT for token classification.
# Pass the tag mappings so they are stored in the model config; the checkpoint
# saved below then reports real tag names instead of LABEL_0..LABEL_n.
model_name_distilbert = "distilbert-base-multilingual-cased"
tokenizer_distilbert = AutoTokenizer.from_pretrained(model_name_distilbert)
model_distilbert = AutoModelForTokenClassification.from_pretrained(
    model_name_distilbert,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

# Train the model on DistilBERT.
# NOTE(review): train_dataset / validation_dataset were tokenized with the
# module-level `tokenizer`, not `tokenizer_distilbert` - confirm the two
# checkpoints share a vocabulary before relying on these results.
trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer_distilbert,
    data_collator=data_collator
)

trainer_distilbert.train()

# Save the fine-tuned DistilBERT model and tokenizer
save_path_distilbert = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert"
model_distilbert.save_pretrained(save_path_distilbert)
tokenizer_distilbert.save_pretrained(save_path_distilbert)

# # Load XLM-Roberta for token classification
# model_name_xlmroberta = "xlm-roberta-base"
# tokenizer_xlmroberta = AutoTokenizer.from_pretrained(model_name_xlmroberta)
# model_xlmroberta = AutoModelForTokenClassification.from_pretrained(model_name_xlmroberta, num_labels=num_labels)

# # Train the model on XLM-Roberta
# trainer_xlmroberta = Trainer(
#     model=model_xlmroberta,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=validation_dataset,
#     tokenizer=tokenizer_xlmroberta,
#     data_collator=data_collator
# )

# trainer_xlmroberta.train()

# # Save the fine-tuned XLM-Roberta model and tokenizer
# save_path_xlmroberta = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-xlm-roberta"
# model_xlmroberta.save_pretrained(save_path_xlmroberta)
# tokenizer_xlmroberta.save_pretrained(save_path_xlmroberta)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Unique labels in the dataset: ['B-LOC', 'B-PRICE', 'B-PRODUCT', 'I-LOC', 'I-PRICE', 'O']
Label to ID Mapping: {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2, 'I-LOC': 3, 'I-PRICE': 4, 'O': 5}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/974 [00:00<?, ? examples/s]



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3809,0.279945
2,0.2274,0.186985
3,0.1653,0.164263


('/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert/tokenizer_config.json',
 '/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert/special_tokens_map.json',
 '/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert/vocab.txt',
 '/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert/added_tokens.json',
 '/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert/tokenizer.json')

In [None]:
import numpy as np
from seqeval.metrics import classification_report

# Function to evaluate the model and print classification report
def evaluate_model(model, tokenizer, validation_dataset):
    """Predict on a validation set and print a seqeval classification report.

    A fresh Trainer is built from the notebook-level `training_args` and
    `data_collator`, and is used only for inference.

    Args:
        model: Token-classification model to evaluate.
        tokenizer: Tokenizer handed to the Trainer alongside the model.
        validation_dataset: Tokenized dataset with a "labels" column.

    Returns:
        The report produced by `classification_report` (it is also printed).
    """
    eval_trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # predict() yields (logits, gold label ids, metrics); metrics are unused.
    logits, gold_ids, _ = eval_trainer.predict(validation_dataset)

    # Highest-scoring class per token position.
    pred_ids = np.argmax(logits, axis=2)

    def _decode(id_row, gold_row):
        # Keep only positions whose gold label is real (-100 marks positions
        # to ignore) and translate the ids back to tag strings.
        return [
            id_to_label[tag_id]
            for tag_id, gold_id in zip(id_row, gold_row)
            if gold_id != -100
        ]

    true_labels = [_decode(gold_row, gold_row) for gold_row in gold_ids]
    predicted_labels = [
        _decode(pred_row, gold_row)
        for pred_row, gold_row in zip(pred_ids, gold_ids)
    ]

    # Entity-level precision / recall / F1.
    report = classification_report(true_labels, predicted_labels)
    print(report)

    return report

# Fine-tune the models as done before and then evaluate

# Example of evaluating the DistilBERT model
print("Evaluating DistilBERT:")
distilbert_report = evaluate_model(model_distilbert, tokenizer_distilbert, validation_dataset)

# Example of evaluating the XLM-Roberta model.
# The code that defines model_xlmroberta / tokenizer_xlmroberta is commented
# out in the training cell, so on a fresh run these names do not exist. Skip
# the evaluation in that case instead of failing with a NameError.
# NOTE(review): validation_dataset was tokenized with the mBERT tokenizer;
# XLM-Roberta presumably needs the data re-tokenized with its own tokenizer
# for this report to be meaningful - confirm.
print("Evaluating XLM-Roberta:")
if "model_xlmroberta" in globals() and "tokenizer_xlmroberta" in globals():
    xlmroberta_report = evaluate_model(model_xlmroberta, tokenizer_xlmroberta, validation_dataset)
else:
    xlmroberta_report = None
    print("Skipped: model_xlmroberta / tokenizer_xlmroberta are not defined; "
          "run the XLM-Roberta training code first.")

# Example of evaluating the mBERT model.
# NOTE(review): the mBERT Trainer and train() call are commented out in the
# training cell; unless they were run, `model` still has a newly initialized
# classifier head and this report does not reflect a fine-tuned model.
print("Evaluating mBERT:")
mbert_report = evaluate_model(model, tokenizer, validation_dataset)


Evaluating DistilBERT:


              precision    recall  f1-score   support

         LOC       0.80      0.83      0.82       185
       PRICE       0.60      0.47      0.52       234
     PRODUCT       0.00      0.00      0.00        38

   micro avg       0.70      0.58      0.63       457
   macro avg       0.47      0.43      0.45       457
weighted avg       0.63      0.58      0.60       457

Evaluating XLM-Roberta:


              precision    recall  f1-score   support

         LOC       0.75      0.79      0.77       185
       PRICE       0.02      0.01      0.02       234
     PRODUCT       0.00      0.00      0.00        38

   micro avg       0.45      0.33      0.38       457
   macro avg       0.26      0.27      0.26       457
weighted avg       0.32      0.33      0.32       457

Evaluating mBERT:


              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00       185
       PRICE       0.00      0.00      0.00       234
     PRODUCT       0.00      0.00      0.00        38

   micro avg       0.00      0.00      0.00       457
   macro avg       0.00      0.00      0.00       457
weighted avg       0.00      0.00      0.00       457

