In [1]:
# from importlib import reload
%load_ext autoreload

In [2]:
import os
import sys
sys.path.append('..')

import torch
import evaluate
from datasets import ClassLabel, Dataset, load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

from scripts.data_utils.loaders import *
from scripts.modeling.tokenizer import *
from scripts.modeling.training import train_ner_model

# get_peft_model: wraps a base model with LoRA adapter modules.
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Fine-tune one NER model with explicit hyper-parameters.
#
# The import comes first so the call below is guaranteed to use the keyword-based
# train_ner_model from scripts.modeling.train_ner_model. The cell used to start with a
# positional call, train_ner_model("resources/data", "resources/models/checkpoints"),
# that ran *before* this import and therefore hit whichever train_ner_model was bound
# earlier, launching a second, differently-configured training run. That leftover call
# has been dropped so the cell performs exactly one run.
# NOTE(review): restore it under a distinct name if the older entry point is still needed.
from scripts.modeling.train_ner_model import train_ner_model

model_name = "xlm-roberta-base"  # Or "AfroXLMR"
# NOTE(review): these paths are relative to the kernel's working directory, whereas another
# cell builds its paths from os.path.join('..', 'resources', 'data') — confirm the root.
dataset_dir = "resources/data"
output_dir = "resources/models/ner_model"

train_ner_model(
    model_name=model_name,
    dataset_dir=dataset_dir,
    output_dir=output_dir,
    learning_rate=5e-5,
    epochs=5,
    batch_size=8,
)


In [None]:
# NOTE(review): `data_dir` is not assigned in any visible cell (the previous cell defines
# `dataset_dir`), and this single positional argument does not match the keyword call
# used there. Confirm the intended arguments, or delete this cell if it is a leftover.
train_ner_model(data_dir)

In [18]:
# Root of the project data folder, one level above the notebook's working directory.
DATA_PATH = os.path.join('..', 'resources', 'data')
# Folder and file holding the tokenized, label-aligned messages in CoNLL format.
tokenized_aligned_dir = os.path.join(DATA_PATH, 'tokenized_aligned')
filename = os.path.join(DATA_PATH, 'tokenized_aligned', 'tokenized_aligned_messages.conll')

['ZemenExpress',
 'nevacomputer',
 'meneshayeofficial',
 'ethio_brand_collection',
 'Leyueqa',
 'sinayelj',
 'Shewabrand',
 'helloomarketethiopia',
 'modernshoppingcenter',
 'qnashcom',
 'Fashiontera',
 'kuruwear',
 'gebeyaadama',
 'MerttEka',
 'forfreemarket',
 'classybrands',
 'marakibrand',
 'aradabrand2',
 'marakisat2',
 'belaclassic',
 'AwasMart',
 'qnashcom']

In [None]:

# Prepare the NER train/validation datasets and load the token-classification model.

# NOTE(review): these paths are relative to the kernel's working directory, whereas another
# cell builds its paths from os.path.join('..', 'resources', 'data') — confirm the root.
TRAIN_DATA_PATH = "resources/data/ner_train.conll"
VAL_DATA_PATH = "resources/data/ner_val.conll"

# Load the train and validation data.
# load_conll_data is not defined in this notebook; presumably it comes from the star
# imports and returns a pandas DataFrame with a "labels" column holding one list of
# string tags per sentence — TODO confirm.
train_data = load_conll_data(TRAIN_DATA_PATH)
val_data = load_conll_data(VAL_DATA_PATH)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Tag inventory: the position of each name is the integer class id used for training.
label_mapping = ClassLabel(names=["O", "B-Product", "I-Product", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"])


def encode_label_batch(batch):
    """Convert a batch of string tag sequences into integer class ids.

    Intended for ``Dataset.map(..., batched=True)``, where ``batch["labels"]`` is a
    list containing one tag sequence per example.

    Args:
        batch: Mapping with a "labels" entry (list of lists of tag strings).

    Returns:
        Dict with a "labels" entry of the same nesting, tags replaced by the ids
        given by ``label_mapping.str2int``.
    """
    return {"labels": [[label_mapping.str2int(tag) for tag in seq] for seq in batch["labels"]]}


# Load Model
# NOTE(review): MODEL_NAME is not assigned in any visible cell (an earlier cell defines a
# lowercase `model_name`); presumably it is provided by the star imports — confirm.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(label_mapping.names))

# batched=True is required here: the encoder iterates over a list of tag sequences.
# Without it, map() passes one example at a time and the inner loop would walk over the
# characters of each tag string instead of over tags.
train_dataset = train_dataset.map(encode_label_batch, batched=True)
val_dataset = val_dataset.map(encode_label_batch, batched=True)

# tokenize_and_align_labels is not defined in this notebook (presumably star-imported).
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)




In [None]:
# Training Arguments
# Evaluation and checkpointing both happen once per epoch so that
# load_best_model_at_end can pick the checkpoint with the best "f1".
# NOTE(review): metric_for_best_model="f1" needs compute_metrics to return an "f1" entry.
# The only compute_metrics defined in this notebook (in a later cell) reports accuracy
# only — confirm that a NER-specific compute_metrics is in scope when this cell runs.
training_args = TrainingArguments(
    output_dir="./models/ner_finetuned",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# Token classification needs a collator that pads the label sequences together with the
# inputs; the default collator only pads the tokenizer's own fields, so batches with
# sequences of different lengths cannot be stacked into tensors.
# NOTE(review): `tokenizer` is not created in any visible cell — presumably star-imported.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train Model
trainer.train()

In [None]:
# Rename the "label" column to "labels" (the name the Hugging Face Trainer expects), but
# only while the old column is still present, so re-running this cell does not raise.
# The "train" split is inspected because later cells index tokenized_dataset by split name.
# NOTE(review): tokenized_dataset is not created in any visible cell — confirm where it is built.
if "label" in tokenized_dataset["train"].column_names:
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")


In [None]:

# Base sequence-classification model with a two-class head (positive / negative).
# NOTE(review): model_checkpoint is not assigned in any visible cell — confirm its source.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,  # name or path of the pre-trained model (e.g., "distilbert-base-uncased")
    num_labels=2,
)

# LoRA adapter settings
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # sequence classification
    r=8,                         # rank of the low-rank update matrices
    lora_alpha=32,               # scaling applied to the LoRA update
    lora_dropout=0.1,            # dropout inside the LoRA modules
    bias="none",                 # no bias parameters are trained
    # Modules that receive adapters; presumably the attention projections of a
    # DistilBERT-style checkpoint — confirm against model_checkpoint.
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],
)

# Wrap the base model; peft_model is what gets trained from here on.
peft_model = get_peft_model(base_model, lora_config)

# Tally parameter counts to confirm how much of the model is trainable
trainable_params = all_params = 0
for _, weight in peft_model.named_parameters():
    count = weight.numel()
    all_params += count
    if weight.requires_grad:
        trainable_params += count

print(f"Trainable params: {trainable_params} | All params: {all_params} | Trainable%: {100 * trainable_params/all_params:.2f}%")


In [None]:

def _take_shuffled(split, limit, seed=42):
    """Return up to ``limit`` rows of ``split`` after a seeded shuffle.

    The row count is clamped to the size of the split so that a split smaller than
    ``limit`` yields all of its rows instead of raising an index error.
    """
    shuffled = split.shuffle(seed=seed)
    return shuffled.select(range(min(limit, len(shuffled))))


# For demonstration, we'll make a small training set
# NOTE(review): tokenized_dataset is not created in any visible cell — confirm its source.
small_train_dataset = _take_shuffled(tokenized_dataset["train"], 2000)
small_eval_dataset = _take_shuffled(tokenized_dataset["test"], 1000)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./lora-distilbert-imdb",
    evaluation_strategy="epoch",    # Evaluates the model after each epoch.
    save_strategy="epoch",          # Saves a checkpoint at the end of every epoch
    logging_strategy="steps",       # Logs training metrics every few steps
    logging_steps=50,
    per_device_train_batch_size=8,  # Batch size per device
    per_device_eval_batch_size=8,
    num_train_epochs=2,  # Number of full passes through the training data.
    weight_decay=0.01,   # A regularization hyperparameter to help prevent overfitting.
    learning_rate=1e-4,  # Step size for the optimizer.
    push_to_hub=False,   # Disable pushing to Hugging Face Hub
    report_to="none"     # Disable logging to W&B
    )


In [None]:

# Define a simple accuracy metric
# The metric object is loaded by name through the `evaluate` library; compute_metrics
# (defined right after) calls its .compute(predictions=..., references=...).

metric_accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy metric.

    Args:
        eval_pred: Pair ``(logits, labels)`` as passed in by the Trainer.

    Returns:
        Whatever ``metric_accuracy.compute`` returns for the arg-max class of each
        row of ``logits`` compared against ``labels``.
    """
    raw_scores, gold = eval_pred
    # The highest-scoring class along the last axis is the predicted label.
    predicted_classes = torch.tensor(raw_scores).argmax(dim=-1)
    return metric_accuracy.compute(predictions=predicted_classes, references=gold)

# Build the Trainer around the LoRA-wrapped model and the reduced datasets
# NOTE(review): `tokenizer` is not created in any visible cell — confirm its source.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Run fine-tuning
trainer.train()

# Score the model on the small evaluation set and report the metrics
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


In [None]:


# Inference in this cell runs on the CPU: the model is moved there explicitly so the
# tensors produced by the tokenizer (created on CPU) can be fed to it directly.
device = torch.device("cpu")
peft_model = peft_model.to(device)

def predict_sentiment(review_text):
    """Classify a single review as "Positive" or "Negative" with the LoRA model.

    Uses the module-level ``tokenizer`` and ``peft_model``.

    Args:
        review_text: Raw text of one review.

    Returns:
        "Positive" when the arg-max class id is 1, otherwise "Negative".
    """
    # Switch to eval mode explicitly: dropout must be off for inference, and the result
    # should not depend on an earlier cell having left the model in eval mode.
    peft_model.eval()
    # Feed the inputs on whichever device currently holds the model weights.
    model_device = next(peft_model.parameters()).device

    encoding = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True)
    model_inputs = {key: tensor.to(model_device) for key, tensor in encoding.items()}
    with torch.no_grad():
        logits = peft_model(**model_inputs).logits
    # .item() assumes a single input row, i.e. one review per call.
    predicted_label = torch.argmax(logits, dim=-1).item()
    return "Positive" if predicted_label == 1 else "Negative"

# Two hand-written reviews (one favourable, one unfavourable) used as a smoke test
test_reviews = [
    "I absolutely loved this movie! The acting was great and the story kept me on the edge of my seat.",
    "The plot was dull and I nearly fell asleep. Would not recommend."
]

# Print each review followed by the model's verdict
for review in test_reviews:
    verdict = predict_sentiment(review)
    print(f"Review: {review}\nPrediction: {verdict}\n")



# 4. Model comparison

In [None]:
from scripts.modeling.model_comparison import fine_tune_multiple_models

# Checkpoints to fine-tune and compare, given as Hugging Face Hub ids.
models = [
    "xlm-roberta-base",
    "Davlan/afro-xlmr-large",  # the Hub id is hyphenated ("afro-xlmr"), not "afroxlmr"
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased",
]
# NOTE(review): these paths are relative to the kernel's working directory, whereas another
# cell builds its paths from os.path.join('..', 'resources', 'data') — confirm the root.
dataset_dir = "resources/data"
base_output_dir = "resources/models/comparison"

# Hyper-parameters shared by every model in the comparison
params = {
    "learning_rate": 5e-5,
    "epochs": 5,
    "batch_size": 8,
}

# Fine-tune and compare models
results_df = fine_tune_multiple_models(models, dataset_dir, base_output_dir, params)
# Leave the result as the cell's last expression so the notebook renders it richly.
results_df


# 5. Model interpretability

In [None]:
from scripts.modeling.model_interpretability import explain_predictions

# Class-id -> tag lookup, built from the tag names listed in id order.
# NOTE(review): this order (PRICE, LOC, PRODUCT) and casing ("B-PRODUCT") differ from the
# label list used in the inline NER training cell ("B-Product" first, "B-PRICE" last) —
# confirm it matches the ids the loaded checkpoint was trained with.
label_map = dict(enumerate(
    ["O", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC", "B-PRODUCT", "I-PRODUCT"]
))

# Fine-tuned checkpoint, validation file, and the position of the example to analyze
model_name = "resources/models/comparison/xlm-roberta-base"
dataset_path = "resources/data/validation_data.txt"
example_idx = 5

# Generate the explanation for the selected example
explain_predictions(model_name, dataset_path, example_idx, label_map)
