<a href="https://colab.research.google.com/github/Kalze1/Amharic_Named_Entity_Recognition/blob/main/notebooks/Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive; the dataset and the saved models used below live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Notebook shell escape: install the third-party packages imported below.
!pip install transformers datasets seqeval

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import torch

# Function to read CoNLL formatted data
def read_conll(file_path):
    """Read a CoNLL-style file into parallel token and tag sequences.

    Each non-blank line holds whitespace-separated columns: the first column
    is the token and the last column is its tag (any columns in between are
    ignored). Blank lines separate sentences; consecutive blank lines do not
    produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of
        tokens of sentence ``i`` and ``labels[i]`` the matching list of tags.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The
            message names the file and the 1-based line number.
    """
    sentences = []
    labels = []
    tokens = []
    tags = []
    # "utf-8-sig" drops an optional byte-order mark so it cannot end up glued
    # to the first token; files without a BOM are read exactly as before.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line_no, raw_line in enumerate(file, start=1):
            fields = raw_line.split()
            if not fields:  # Blank line: sentence boundary
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{file_path}, line {line_no}: expected '<token> <tag>' "
                    f"but got {raw_line.strip()!r}"
                )
            tokens.append(fields[0])
            tags.append(fields[-1])
    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)
    return sentences, labels

# Load your labeled data (replace with your actual file path)
sentences, labels = read_conll("/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/labeled_cleaned_tokenized_dataset.conll")

# Create a DataFrame with one row per sentence: the token list and its tag list
data = {'tokens': sentences, 'ner_tags': labels}
df = pd.DataFrame(data)

# Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Set the label list and mapping. Sorting makes the tag -> id assignment
# deterministic for a given set of tags.
all_labels = [label for sublist in labels for label in sublist]
unique_labels = sorted(set(all_labels))
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
num_labels = len(unique_labels)

print("Unique labels in the dataset:", unique_labels)
print("Label to ID Mapping:", label_to_id)

# Load pre-trained model and tokenizer
model_name = "bert-base-multilingual-cased"  # Use mBERT (you can adjust this to any model)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Store the tag names in the model config so the saved checkpoint (and any
# pipeline built from it) reports the dataset's tags instead of generic
# placeholder label names.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

# Tokenize the dataset and align labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build per-token label ids.

    Uses the module-level ``tokenizer`` and ``label_to_id``. The first
    sub-token of every word receives that word's label id; special tokens,
    padding and the remaining sub-tokens of a word receive -100.

    Args:
        examples: Batch with a "tokens" column (lists of words) and a
            "ner_tags" column (lists of tag strings, one per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Ensure all sequences are padded to max length
        is_split_into_words=True
    )
    aligned = []
    for batch_idx, tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=batch_idx):
            # Only a position that maps to a word and differs from the
            # previous position's word index starts a new word.
            starts_word = word_idx is not None and word_idx != last_word
            row.append(label_to_id[tags[word_idx]] if starts_word else -100)
            last_word = word_idx
        aligned.append(row)
    encoded["labels"] = aligned
    return encoded

# Apply the tokenizer and label alignment.
# NOTE(review): tokenize_and_align_labels reads the module-level `tokenizer`
# (mBERT), so every split derived from `tokenized_dataset` carries input ids
# produced by the mBERT tokenizer.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split the dataset into train and validation sets (fixed seed, so the split
# is reproducible for the same dataset)
train_test_split_ratio = 0.8
split_dataset = tokenized_dataset.train_test_split(test_size=1 - train_test_split_ratio, seed=42)
train_dataset = split_dataset['train']
validation_dataset = split_dataset['test']

# Define the data collator (handles padding for token classification).
# It is built from the mBERT `tokenizer`.
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

# Set up training arguments.
# NOTE(review): this single TrainingArguments object (output_dir="./results")
# is passed to every Trainer in this notebook, so all runs write their
# checkpoints under the same directory - confirm that is intended.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Define the Trainer with the updated datasets and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer for mBERT to Google Drive so a later
# session can reload them from `save_path_bert`
save_path_bert = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-bert"
model.save_pretrained(save_path_bert)
tokenizer.save_pretrained(save_path_bert)

# Example of loading and fine-tuning DistilBERT or XLM-Roberta similarly

# Load DistilBERT for token classification.
# id2label/label2id store the dataset's tag names in the model config, so the
# saved checkpoint and the NER pipeline built from this model report real tags
# instead of generic placeholder label names.
model_name_distilbert = "distilbert-base-multilingual-cased"
tokenizer_distilbert = AutoTokenizer.from_pretrained(model_name_distilbert)
model_distilbert = AutoModelForTokenClassification.from_pretrained(
    model_name_distilbert,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

# Train the model on DistilBERT.
# NOTE(review): train_dataset, validation_dataset and data_collator were all
# built with the mBERT `tokenizer`, not tokenizer_distilbert - confirm the two
# vocabularies match, or re-tokenize the raw `dataset` with
# tokenizer_distilbert before training.
trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer_distilbert,
    data_collator=data_collator
)

trainer_distilbert.train()

# Save the fine-tuned DistilBERT model and tokenizer
save_path_distilbert = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert"
model_distilbert.save_pretrained(save_path_distilbert)
tokenizer_distilbert.save_pretrained(save_path_distilbert)

# Load XLM-Roberta for token classification.
# id2label/label2id store the dataset's tag names in the model config, so the
# saved checkpoint reports real tags instead of generic placeholder names.
model_name_xlmroberta = "xlm-roberta-base"
tokenizer_xlmroberta = AutoTokenizer.from_pretrained(model_name_xlmroberta)
model_xlmroberta = AutoModelForTokenClassification.from_pretrained(
    model_name_xlmroberta,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

# Train the model on XLM-Roberta.
# NOTE(review): train_dataset, validation_dataset and data_collator were all
# built with the mBERT `tokenizer`. xlm-roberta-base presumably uses a
# different vocabulary, in which case these input ids do not mean what the
# model expects - confirm, and re-tokenize the raw `dataset` with
# tokenizer_xlmroberta (for training AND evaluation) before relying on results.
trainer_xlmroberta = Trainer(
    model=model_xlmroberta,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer_xlmroberta,
    data_collator=data_collator
)

trainer_xlmroberta.train()

# Save the fine-tuned XLM-Roberta model and tokenizer
save_path_xlmroberta = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-xlm-roberta"
model_xlmroberta.save_pretrained(save_path_xlmroberta)
tokenizer_xlmroberta.save_pretrained(save_path_xlmroberta)


KeyboardInterrupt: 

In [4]:
# Reload the fine-tuned checkpoints from Google Drive. Each directory must
# already exist, i.e. the corresponding training + save step must have
# completed; otherwise from_pretrained cannot resolve the path.
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the fine-tuned mBERT model and tokenizer.
# NOTE(review): these are bound to `model_bert` / `tokenizer_bert`, while the
# evaluation code in this notebook refers to `model` / `tokenizer` - confirm
# which names the later cells should use.
save_path_bert = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-bert"
tokenizer_bert = AutoTokenizer.from_pretrained(save_path_bert)
model_bert = AutoModelForTokenClassification.from_pretrained(save_path_bert)

# Load the fine-tuned DistilBERT model and tokenizer
save_path_distilbert = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-distilbert"
tokenizer_distilbert = AutoTokenizer.from_pretrained(save_path_distilbert)
model_distilbert = AutoModelForTokenClassification.from_pretrained(save_path_distilbert)

# Load the fine-tuned XLM-Roberta model and tokenizer
save_path_xlmroberta = "/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-xlm-roberta"
tokenizer_xlmroberta = AutoTokenizer.from_pretrained(save_path_xlmroberta)
model_xlmroberta = AutoModelForTokenClassification.from_pretrained(save_path_xlmroberta)



OSError: Incorrect path_or_model_id: '/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/fine-tuned-model-bert'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [2]:
import numpy as np
from seqeval.metrics import classification_report

# Function to evaluate the model and print classification report
def evaluate_model(model, tokenizer, validation_dataset):
    """Predict on a validation set and print a seqeval classification report.

    Relies on the module-level ``training_args``, ``data_collator`` and
    ``id_to_label``.

    Args:
        model: Token-classification model to evaluate.
        tokenizer: Tokenizer handed to the Trainer.
        validation_dataset: Tokenized dataset whose label ids use -100 for
            positions that must be ignored.

    Returns:
        The report produced by ``classification_report`` (also printed).
    """
    # A dedicated Trainer is created only to run prediction
    eval_trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    logits, gold_ids, _ = eval_trainer.predict(validation_dataset)

    # Highest-scoring class per token position
    pred_ids = np.argmax(logits, axis=2)

    # Keep only positions with a real label (-100 marks ignored positions),
    # then translate ids into tag names for both sequences
    true_labels = []
    predicted_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_labels.append([id_to_label[g] for _, g in kept])
        predicted_labels.append([id_to_label[p] for p, _ in kept])

    report = classification_report(true_labels, predicted_labels)
    print(report)

    return report

# Fine-tune the models as done before and then evaluate.
# Every name used below (the three models, their tokenizers and
# validation_dataset) must already exist in the session; a model whose
# training/loading cell did not finish is undefined here.
# NOTE(review): validation_dataset was tokenized with the mBERT `tokenizer`,
# so the same mBERT input ids are fed to all three models - confirm this is
# valid for each of them.

# Example of evaluating the DistilBERT model
print("Evaluating DistilBERT:")
distilbert_report = evaluate_model(model_distilbert, tokenizer_distilbert, validation_dataset)

# Example of evaluating the XLM-Roberta model
print("Evaluating XLM-Roberta:")
xlmroberta_report = evaluate_model(model_xlmroberta, tokenizer_xlmroberta, validation_dataset)

# Example of evaluating the mBERT model (`model` / `tokenizer` are the mBERT
# objects created when the pre-trained model was first loaded)
print("Evaluating mBERT:")
mbert_report = evaluate_model(model, tokenizer, validation_dataset)


Evaluating DistilBERT:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC       0.82      0.86      0.84       185
       PRICE       0.65      0.53      0.58       234
     PRODUCT       0.00      0.00      0.00        38

   micro avg       0.74      0.62      0.67       457
   macro avg       0.49      0.46      0.47       457
weighted avg       0.66      0.62      0.64       457

Evaluating XLM-Roberta:


NameError: name 'model_xlmroberta' is not defined

In [1]:
# Install the required libraries for SHAP and LIME
!pip install shap lime

import shap
import lime
from lime.lime_text import LimeTextExplainer
from transformers import pipeline

# Step 1: SHAP Interpretability for Token Classification

# Create a pipeline for NER using the trained DistilBERT model.
# `ner_pipeline` is a module-level object: analyze_difficult_cases and
# generate_report below call it directly instead of using their own
# model/tokenizer arguments.
ner_pipeline = pipeline('ner', model=model_distilbert, tokenizer=tokenizer_distilbert)

# Helper that tokenizes raw text with the DistilBERT tokenizer
def tokenize_for_shap(text):
    """Return the DistilBERT tokenizer output for ``text`` as PyTorch tensors.

    Uses the module-level ``tokenizer_distilbert``.
    """
    return tokenizer_distilbert(text, return_tensors="pt")

# SHAP explainer for deep learning models.
# NOTE(review): the raw model object is passed as the function to explain and
# the tokenizer as the second argument. The explainer is later called with raw
# strings; presumably the model cannot consume strings directly - confirm
# against the SHAP text examples (which wrap a transformers pipeline or a
# function returning numeric scores).
explainer_shap = shap.Explainer(model_distilbert, tokenizer_distilbert)

# Example usage with SHAP on a few examples (one English, one Amharic sentence)
example_sentences = ["This is an example sentence with Amharic and English entities.",
                     "የአማርኛ ስም እና የእንግሊዝኛ ስም በማብቂያ እንዲታይ።"]

# Compute SHAP values for the example sentences
shap_values = explainer_shap(example_sentences)

# Visualize the SHAP values for the first sentence
shap.plots.text(shap_values[0])

# Step 2: LIME for Local Interpretability

# Create a LIME explainer for token classification.
# The class names are the sorted unique tags found in the dataset.
class_names = unique_labels  # Use your unique NER labels
explainer_lime = LimeTextExplainer(class_names=class_names)

# Explain a difficult case using LIME
difficult_sentence = "The company was founded in 1998 in Ethiopia."

# Use the explainer to get local explanations for a difficult sentence.
# NOTE(review): LIME documents the second argument (classifier_fn) as a
# function that takes a list of strings and returns an array of per-class
# probabilities with one row per string. `ner_pipeline` returns per-token
# prediction records instead - confirm, and wrap it in a function that
# produces that probability array if needed.
exp = explainer_lime.explain_instance(difficult_sentence, ner_pipeline)

# Display the LIME explanation
exp.show_in_notebook()

# Step 3: Analyze difficult cases and overlapping entities

# Function to identify ambiguous or difficult examples
def analyze_difficult_cases(dataset, model, tokenizer):
    """Collect the examples whose prediction count differs from their tag count.

    Each example's tokens are joined with single spaces and passed to the
    module-level ``ner_pipeline``; the example is kept when the number of
    items the pipeline returns differs from the number of entries in its
    "ner_tags".

    Args:
        dataset: Iterable of examples exposing "tokens" and "ner_tags".
        model: Not used by this function.
        tokenizer: Not used by this function.

    Returns:
        List of the examples flagged by the heuristic, in dataset order.
    """
    def _counts_differ(example):
        words = example["tokens"]
        gold_tags = example["ner_tags"]
        predicted = ner_pipeline(' '.join(words))
        return len(predicted) != len(gold_tags)

    return [example for example in dataset if _counts_differ(example)]

# Analyze ambiguous cases in your validation dataset.
# The model/tokenizer arguments are accepted but predictions come from the
# module-level `ner_pipeline`.
difficult_cases = analyze_difficult_cases(validation_dataset, model_distilbert, tokenizer_distilbert)

# Step 4: Generate interpretability reports

def generate_report(difficult_cases, model, tokenizer, label_map):
    """Print tokens, gold tags and pipeline predictions for each difficult case.

    Predictions come from the module-level ``ner_pipeline``.

    Args:
        difficult_cases: Iterable of examples exposing "tokens" and "ner_tags".
        model: Unused; kept so existing callers keep working.
        tokenizer: Unused; kept so existing callers keep working.
        label_map: Mapping from label id to tag name. A tag that is not a key
            of the mapping (for example a tag already stored as a string) is
            printed unchanged.

    Returns:
        None. The report is written to standard output.
    """
    print("Report on Difficult Cases:")
    for case in difficult_cases:
        tokens = case["tokens"]
        text = ' '.join(tokens)
        print(f"Tokens: {text}")
        # "ner_tags" may hold tag strings rather than integer ids; indexing an
        # id -> name mapping with a string raises KeyError, so fall back to
        # the tag itself when it is not a known id.
        true_labels = [label_map.get(tag, tag) for tag in case["ner_tags"]]
        pred = ner_pipeline(text)

        print(f"True labels: {true_labels}")
        print(f"Predicted labels: {pred}")
        print("-" * 80)

# Generate the report based on difficult cases found.
# `id_to_label` (integer id -> tag name) is passed as the label map.
generate_report(difficult_cases, model_distilbert, tokenizer_distilbert, id_to_label)


Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=4

NameError: name 'model_distilbert' is not defined