##### Load the cleaned and merged multi-channel CoNLL data file and fine-tune the three models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
def load_conll_format(file_path):
    """Read a two-column CoNLL file into a DataFrame with one sentence per row.

    Each non-blank line is expected to hold exactly two whitespace-separated
    fields, ``token label``. Blank lines separate sentences. Lines with any
    other number of fields are reported on stdout and ignored.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A pandas DataFrame with the columns ``tokens`` and ``labels``; each
        cell holds the list of tokens / labels of one sentence.
    """
    rows = []
    words, tags = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()

            if not fields:
                # Blank line: close the current sentence, if one was started.
                if words:
                    rows.append((words, tags))
                    words, tags = [], []
                continue

            if len(fields) != 2:
                print(f"Skipping line: {raw_line.strip()}")  # Optional: print to debug which lines are problematic
                continue

            word, tag = fields
            words.append(word)
            tags.append(tag)

    # The file may end without a trailing blank line; keep the last sentence.
    if words:
        rows.append((words, tags))

    return pd.DataFrame(rows, columns=['tokens', 'labels'])

# Parse the cleaned CoNLL file into a DataFrame with one sentence per row
# (columns: 'tokens' and 'labels').
df = load_conll_format("cleaned_file.conll")

In [2]:
# Display the loaded sentences; the notebook shows only the first and last rows.
df

Unnamed: 0,tokens,labels
0,"[ዋጋ, 3500, ብር, ለሱቅና, ብዛት, ተረካቢወች, ባስተያየት, እናስረ...","[B-PRICE, I-PRICE, I-PRICE, O, O, O, O, O, O, ..."
1,"[ይሄንን, ተጭነው, ያድርጉ፣, ቤተሰብ, ይሁኑ]","[O, O, O, O, O]"
2,"[የመኪና, መዓዛ, በፀሃይ, ብርሃን, ስለሚሰራ, ቻርጅ, ማድረግ, አይፈል...","[I-PRODUCT, B-PRODUCT, O, O, O, O, O, O, I-PRO..."
3,"[የፀጉር, ማድረቂያ, ፎን, 6000watt, በጣም, ፈጣን, ማበጠሪያ, ያ...","[I-PRODUCT, I-PRODUCT, B-PRODUCT, O, O, O, O, ..."
4,"[2, 1, የሰላጣ, እና, የቺፕስ, ማቅረቢያ, 2400, ብር, ሱቃችን, ...","[O, O, O, I-PRODUCT, I-PRODUCT, B-PRODUCT, B-P..."
...,...,...
22187,"[ብር, ራት, ዋስትና, ቅናሽ, አራሻ, ቁር, መናኛ, ዘመሽ, ራን, ሞ, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
22188,"[ዋ, ብር, ውስን, ሬ, ነው, የቀረው, ራት, ዋስትና, ቅናሽ, አራሻ, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
22189,"[ዋ, ብር, ውስን, ሬ, ነው, የቀረው, ራት, ዋስትና, ቅናሽ, አራሻ, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
22190,"[ዋ, ብር, ውስን, ሬ, ነው, የቀረው, ራት, ዋስትና, ቅናሽ, አራሻ, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


#### Select roughly 50% of the dataset to reduce training time

In [3]:
# Draw a fixed-size random subset of 11,000 rows to shorten training time.
sample_df = df.sample(n=11000, random_state=42)  # random_state for reproducibility


##### Assign an integer ID to each label

In [5]:
# Map every BIO tag to the integer class id used as the training target.
# A tag's position in the list below is its id, so append new tags at the end
# to keep the existing ids unchanged.
label_to_id = {
    tag: index
    for index, tag in enumerate(
        [
            "O",
            "B-PRICE",
            "I-PRICE",
            "B-LOC",
            "I-LOC",
            "B-PRODUCT",
            "I-PRODUCT",
            # Add other labels as needed
        ]
    )
}

In [4]:
# From here on `df` refers to the 11,000-row sample instead of the full data;
# the tokenization cell further down reads `df`.
df =sample_df
df

Unnamed: 0,tokens,labels
221,"[አንደኛ, ደረጃ, የጎማ, ላስትሮ, ማንኛውንም, ነገር, በከፍተኛ, ደረጃ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
15180,"[አስ, ሞ, ሱሪዎች, ብር, ሲና, ኪት, ራት, የበቁ, የች, እቃ, አስመ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1698,"[ልዩ, የበዓል, ቅናሽ, የቡና, ረከቦት, በአልኮል/በሻማ, ቡና, ማሞቂያ...","[O, O, O, O, O, O, O, O, I-PRODUCT, I-PRODUCT,..."
3115,"[ማስታወሻ፦, ዛሬ, ሱቃችን, ክፍት, ስለሆነ, ይጎብኙን]","[O, O, O, O, O, O]"
1328,"[የውሃ, ማቅረቢያ, ጆግ, የማይሰበር, የማይጫጫር, 1.2, ሊትር, ውሃ,...","[I-PRODUCT, B-PRODUCT, O, O, O, O, O, O, O, O,..."
...,...,...
6531,"[የመረያ, እና, መወወያ, መስቀያ, ባ, አምስት, መያዣ, ያው, በሪ, የ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
15898,"[አራሻ, ሚና, ኛ, ቅ, አንኛን, ስክ, የቤት, ቁር, እና]","[O, O, O, O, O, O, O, O, O]"
7236,"[ሱቃችን, ነ, እ, ከ, ዋቱ, ሰዓት, እስከ, ቀኑ, ረስ, ክት, መኑን,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
21948,"[ሳቺ, ባ, ት, የስ, መ, እና, የቡና, መ, ሳቺ, ው, ማያ, ውስን, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hugging Face checkpoints to fine-tune. The short key on the left is used to
# index every per-model dictionary in this notebook (tokenizers, models, ...).
model_names = {
    "xlm-roberta": "xlm-roberta-base",
    "distilbert": "distilbert-base-multilingual-cased",
    "mbert": "bert-base-multilingual-cased"
}

tokenizers = {}
models = {}

# Inverse of label_to_id. Passing both mappings to the model stores the tag
# names in its config, so a saved checkpoint reports "B-PRICE" etc. instead of
# generic LABEL_<n> names when it is reloaded.
id_to_label = {label_id: label for label, label_id in label_to_id.items()}

# Load tokenizers and models; the classification head is sized to the tag set.
for model_key, model_name in model_names.items():
    tokenizers[model_key] = AutoTokenizer.from_pretrained(model_name)
    models[model_key] = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_to_id),
        id2label=id_to_label,
        label2id=label_to_id,
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predict

In [7]:
from datasets import Dataset

# Define function to tokenize and align labels
def tokenize_and_align_labels(dataframe, tokenizer):
    """Tokenize pre-split sentences and give every sub-token a label id.

    Args:
        dataframe: DataFrame with a 'tokens' column (list of words per row)
            and a 'labels' column (list of tag strings per row).
        tokenizer: Tokenizer called with ``is_split_into_words=True``; the
            returned encoding must provide ``word_ids(batch_index=...)``.

    Returns:
        The tokenizer output (padded, truncated, PyTorch tensors) with an
        extra "labels" entry: one list of ids per sentence. Every sub-token
        receives the id of the word it came from (looked up in the
        module-level ``label_to_id``); positions without a source word keep
        the value -100.
    """
    encoded = tokenizer(
        list(dataframe['tokens']),
        is_split_into_words=True,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    aligned = []
    for row_number, word_tags in enumerate(dataframe['labels']):
        # Start with -100 everywhere, then overwrite positions that map to a word.
        row_labels = [-100] * len(encoded['input_ids'][row_number])

        for position, source_word in enumerate(encoded.word_ids(batch_index=row_number)):
            if source_word is None:
                continue
            row_labels[position] = label_to_id[word_tags[source_word]]

        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Tokenize data for each model
# Every tokenizer produces its own encoding of the same DataFrame `df`;
# the result is stored under the model's short key.
tokenized_data = {}
for model_key, tokenizer in tokenizers.items():
    tokenized_data[model_key] = tokenize_and_align_labels(df, tokenizer)


In [8]:
# Convert the tokenized data into Hugging Face Datasets for training and validation.
# A fixed seed makes the split reproducible. Because every model's dataset has
# the same rows in the same order, the same seed also gives every model the
# same train/validation partition, so their scores are comparable.
SPLIT_SEED = 42

datasets = {}
for model_key, data in tokenized_data.items():
    dataset = Dataset.from_dict({
        'input_ids': data['input_ids'],
        'attention_mask': data['attention_mask'],
        'labels': data['labels']
    })

    # Split the dataset into training and validation sets (80/20 split)
    datasets[model_key] = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import Trainer, TrainingArguments
import warnings
import os

warnings.filterwarnings("ignore")

def compute_metrics(pred):
    """Compute token-level accuracy and macro precision / recall / F1.

    Args:
        pred: Object exposing ``label_ids`` (gold label ids per sequence) and
            ``predictions`` (scores whose last axis is the label axis).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
        Positions whose gold label is -100 are excluded from every metric.
    """
    gold_rows = pred.label_ids
    guess_rows = pred.predictions.argmax(-1)

    # Collect gold/predicted pairs in one pass, dropping ignored positions (-100).
    kept_gold = []
    kept_guess = []
    for gold_row, guess_row in zip(gold_rows, guess_rows):
        for gold, guess in zip(gold_row, guess_row):
            if gold == -100:
                continue
            kept_gold.append(gold)
            kept_guess.append(guess)

    scores = {"accuracy": accuracy_score(kept_gold, kept_guess)}
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in macro_scorers:
        scores[metric_name] = scorer(kept_gold, kept_guess, average="macro")

    return scores

# `models`, `tokenizers` and `datasets` are the per-model dictionaries built in
# the earlier cells; all three are indexed by the same model keys.

trainers = {}


def build_training_args(model_key):
    """Return the TrainingArguments for one model.

    Every model gets its own output and logging folder. With a single shared
    folder the checkpoints written while training one model would be
    overwritten by the next model trained.

    Args:
        model_key: Short model name used as the sub-folder name.

    Returns:
        A TrainingArguments instance with the shared hyper-parameters.
    """
    return TrainingArguments(
        output_dir=f"./results/{model_key}",  # Directory to save checkpoints and results
        # NOTE(review): newer transformers releases call this `eval_strategy` — confirm
        # against the installed version.
        evaluation_strategy="epoch",  # Evaluate at the end of each epoch
        logging_dir=f"./logs/{model_key}",  # Directory for storing logs
        logging_steps=10,  # Number of steps between logging updates
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,  # Example: Train for 3 epochs
        learning_rate=2e-5,
        logging_first_step=True,
        report_to='none'
    )


# Create trainers for each model
for model_key, model in models.items():
    # Retrieve the corresponding tokenizer
    tokenizer = tokenizers[model_key]
    training_args = build_training_args(model_key)

    trainers[model_key] = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets[model_key]['train'],
        eval_dataset=datasets[model_key]['test'],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,  # Pass the tokenizer to the Trainer
    )

# Train, evaluate, and save each model
results = {}
for model_key, trainer in trainers.items():
    print(f"Training and evaluating model: {model_key}")

    # Train the model
    trainer.train()

    # Evaluate the model on its validation split
    eval_result = trainer.evaluate()

    # Save the model to a specific directory for each model
    model_save_path = f"./saved_models/{model_key}"
    os.makedirs(model_save_path, exist_ok=True)  # Create directory if it doesn't exist
    trainer.save_model(model_save_path)  # Save the model

    # Save the tokenizer next to the model so the folder can be reloaded on its
    # own. This is the same tokenizer object that was passed to the Trainer.
    tokenizers[model_key].save_pretrained(model_save_path)

    # Save the results
    results[model_key] = eval_result
    print(f"Results for {model_key}: {eval_result}")

# Optionally, you can access the evaluation results per epoch from the Trainer object directly.


Training and evaluating model: xlm-roberta


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0494,0.055149,0.981691,0.817373,0.858708,0.831693
2,0.0307,0.035472,0.989987,0.919733,0.878854,0.897486
3,0.0157,0.032799,0.991174,0.919893,0.905673,0.911834


Results for xlm-roberta: {'eval_loss': 0.03279938921332359, 'eval_accuracy': 0.9911743426321226, 'eval_precision': 0.9198928449667713, 'eval_recall': 0.905673364015339, 'eval_f1': 0.9118335249464545, 'eval_runtime': 64.5414, 'eval_samples_per_second': 34.087, 'eval_steps_per_second': 2.138, 'epoch': 3.0}
Training and evaluating model: distilbert


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0858,0.080948,0.976905,0.85722,0.577204,0.610676
2,0.0758,0.064311,0.981258,0.858502,0.638549,0.680034
3,0.0492,0.061613,0.98232,0.863062,0.667563,0.71198


Results for distilbert: {'eval_loss': 0.061613138765096664, 'eval_accuracy': 0.9823204037091354, 'eval_precision': 0.8630623989842897, 'eval_recall': 0.6675634020561317, 'eval_f1': 0.7119796337088509, 'eval_runtime': 35.8111, 'eval_samples_per_second': 61.434, 'eval_steps_per_second': 3.854, 'epoch': 3.0}
Training and evaluating model: mbert


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0685,0.069234,0.980205,0.695936,0.622897,0.63866
2,0.0844,0.057881,0.98294,0.842734,0.681745,0.70846
3,0.0397,0.05539,0.984248,0.858444,0.69979,0.732102


Results for mbert: {'eval_loss': 0.0553898811340332, 'eval_accuracy': 0.9842482881114305, 'eval_precision': 0.8584440357060645, 'eval_recall': 0.69978980785627, 'eval_f1': 0.7321018486535106, 'eval_runtime': 62.084, 'eval_samples_per_second': 35.436, 'eval_steps_per_second': 2.223, 'epoch': 3.0}


#### Test the fine-tuned model on sample data

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Step 1: Load the saved model and tokenizer for XLM-Roberta
model_name = "xlm-roberta"
model_path = f"./saved_models/{model_name}"  # Adjust the path if needed
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Map predicted label ids back to their label names (inverse of label_to_id).
# Defined once at cell level; the LIME cell also reads this mapping.
label_mapping = {
    0: 'O',
    1: 'B-PRICE',
    2: 'I-PRICE',
    3: 'B-LOC',
    4: 'I-LOC',
    5: 'B-PRODUCT',
    6: 'I-PRODUCT'
}

# Sample input data (space-separated text)
sample_input_data = [
    "ጀሞ 1 ከለላ ህንፃ ግራውንድ ለይ G07 ስቶቭ"
]

# Tokenization and prediction for XLM-Roberta
print(f"\nTesting model: {model_name}")

for sample_text in sample_input_data:
    # Split on whitespace; split() without an argument never yields empty words.
    tokens = sample_text.split()

    # Step 2: Encode the words the same way as during training. Calling the
    # tokenizer (instead of convert_tokens_to_ids on whole words) applies the
    # sub-word segmentation and adds the special tokens the model was trained with.
    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        return_tensors="pt",
    )

    # Step 3: Make predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Step 4: Process the model's output
    logits = outputs.logits
    # Index the single batch entry explicitly so one-token inputs stay a list.
    predicted_label_ids = torch.argmax(logits, dim=-1)[0].tolist()

    # Keep the prediction of the first sub-token of every word.
    word_labels = {}
    for position, word_index in enumerate(inputs.word_ids(batch_index=0)):
        if word_index is not None and word_index not in word_labels:
            word_labels[word_index] = label_mapping[predicted_label_ids[position]]

    # Get predicted labels; a word without any sub-token (e.g. cut off by
    # truncation) falls back to 'O'.
    predicted_labels = [word_labels.get(word_index, 'O') for word_index in range(len(tokens))]

    # Display the tokens and corresponding predicted labels
    for token, label in zip(tokens, predicted_labels):
        print(f"{token}: {label}")

#### SHAP model interpretation

In [None]:

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import shap
from lime.lime_text import LimeTextExplainer

# Load the saved model and tokenizer once for the interpretation cells.
# The relative path is the folder the training cell saves to, so the notebook
# does not depend on a Colab-specific absolute location.
model_path = "./saved_models/xlm-roberta"  # Adjust path as needed
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Determine if a GPU is available, otherwise use CPU, and move the model there
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the prediction function for SHAP
def model_predict(input_texts):
    """Score each text with one value per label, for use as the SHAP model function.

    The token-classification model returns logits per token, and the number of
    tokens differs from batch to batch. The explainer needs one fixed-size
    output row per text, so the token logits are averaged over the real
    (non-padding) tokens of each text.

    Args:
        input_texts: A single string or an iterable of strings. A NumPy array
            of strings is accepted as well.

    Returns:
        NumPy array of shape (number of texts, number of labels) holding the
        mean logit of every label.
    """
    # The tokenizer only accepts a str or a list of str, so normalise the input.
    if isinstance(input_texts, str):
        input_texts = [input_texts]
    batch_texts = [str(text) for text in input_texts]

    inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, is_split_into_words=False)

    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make predictions without computing gradients
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits

    # Average over tokens, ignoring padding positions via the attention mask.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(logits.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    pooled_logits = (logits * token_mask).sum(dim=1) / token_counts

    return pooled_logits.cpu().numpy()

# Create a SHAP masker for text
masker = shap.maskers.Text(tokenizer)

# Name the explained outputs after the labels stored in the model config so
# the plot shows label names instead of bare output indices.
output_names = [model.config.id2label[index] for index in range(model.config.num_labels)]

# Create SHAP explainer with the masker
explainer = shap.Explainer(model_predict, masker, output_names=output_names)

# Define the sample texts for SHAP to explain (pre-split into words)
texts = [
    ["አዲስ", "አበባ", "ስልክ", "በ", "3000", "ብር"],
    ["በ", "አንድ", "ቀን", "በ", "5000", "ብር", "ሽርሽር"]
]

# The explainer is given plain strings, so join every word list with spaces
texts_as_strings = [' '.join(text) for text in texts]

# Debugging: Check the formatted texts
print(f"Formatted texts for SHAP: {texts_as_strings}")

# Get SHAP values for the sample texts
shap_values = explainer(texts_as_strings)

# Visualize SHAP values
shap.plots.text(shap_values)

# Optional: Save the SHAP text plot to a file. shap.save_html only accepts
# force-plot visualizers, so the text plot is rendered to an HTML string
# (display=False) and written out directly.
shap_html = shap.plots.text(shap_values, display=False)
with open("shap_values_plot.html", "w", encoding="utf-8") as html_file:
    html_file.write(shap_html)


##### LIME model interpretation

In [None]:
def lime_predict(texts, batch_size=64):
    """Return one row of label probabilities per text, as LIME's classifier function.

    LIME needs a 2-D array of shape (number of texts, number of classes). The
    model predicts per token, so the token probabilities of each text are
    averaged over its real (non-padding) tokens; every row still sums to 1.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Number of texts sent through the model at once. LIME
            passes all perturbed samples in one call, so they are processed in
            chunks to bound memory use.

    Returns:
        NumPy array of shape (number of texts, number of labels).
    """
    if isinstance(texts, str):
        texts = [texts]
    all_texts = [str(text) for text in texts]

    pooled_chunks = []
    for start in range(0, len(all_texts), batch_size):
        chunk = all_texts[start:start + batch_size]
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, is_split_into_words=False)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)  # Get probabilities

        # Average over tokens, ignoring padding positions via the attention mask.
        token_mask = inputs["attention_mask"].unsqueeze(-1).to(probs.dtype)
        token_counts = token_mask.sum(dim=1).clamp(min=1)
        pooled_chunks.append(((probs * token_mask).sum(dim=1) / token_counts).cpu())

    if not pooled_chunks:
        # No input texts: return an empty array with the right number of columns.
        return torch.empty((0, model.config.num_labels)).numpy()

    return torch.cat(pooled_chunks, dim=0).numpy()

# Create LIME explainer for the saved model. The class names come from
# `label_mapping`, which is defined in the sample-inference cell above.
# Note: this rebinds `explainer` (previously the SHAP explainer).
explainer = LimeTextExplainer(class_names=list(label_mapping.values()))

# Example sentences to explain: the same two sentences used in the SHAP cell,
# written as plain strings. This rebinds `texts` as well.
texts = ["አዲስ አበባ ስልክ በ 3000 ብር", "በ አንድ ቀን በ 5000 ብር ሽርሽር"]

# Explain each sentence separately and render the explanation in the notebook
for text in texts:
    exp = explainer.explain_instance(text, lime_predict, num_features=6)
    exp.show_in_notebook()
