# LoRA with Hugging Face Models Approach

## Libraries

In [None]:
COLAB =True # IF YOU USE GOOGLE COLAB -> COLAB = True
PIP = True # IF YOU NEED INSTALL LIBRARIES -> PIP = True

if PIP:
    !pip install transformers --upgrade
    !pip install datasets accelerate
    !pip install evaluate
    !pip install -U PyEvALL

!pip install torch
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install -U optuna

In [None]:
# Standard libraries
import os
import sys
import tempfile
import time
import ast
import json
import random

# Data manipulation
import numpy as np
import pandas as pd

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# PEFT (Parameter-Efficient Fine-Tuning)
from peft import LoraConfig, get_peft_model, TaskType

# Evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Optuna for hyperparameter tuning
import optuna

# PyEvALL for evaluation
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.reports.reports import PyEvALLReport
from pyevall.utils.utils import PyEvALLUtils


## Drive and Dataset

In [None]:
from pathlib import Path

if COLAB is not True:
    # Local run: the project root is the parent of the working directory.
    base_path = Path.cwd().parent
    library_path = base_path / "Functions"
else:
    # Colab run: mount Google Drive and point at the shared project folder.
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)
    base_path = "/content/drive/MyDrive/EXISTS2025_TweetBusters"
    library_path = f"{base_path}/Functions"

# Make the project's helper modules importable before importing the reader.
sys.path.insert(0, str(library_path))
from readerEXIST2025_2 import EXISTReader

In [None]:
# Location of the dataset; adapt this path to wherever you keep your copy.
dataset_path = os.path.join(base_path, "Dataset/EXIST_2025_Dataset_V0.3/")

# One JSON file per split.
file_train, file_dev, file_test = (
    os.path.join(dataset_path, split_file)
    for split_file in (
        "EXIST2025_training.json",
        "EXIST2025_dev.json",
        "EXIST2025_test_clean.json",
    )
)

# One reader per split, created in train / dev / test order.
reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)
reader_test = EXISTReader(file_test)


def _splits_for(lang, subtask):
    """Return the (train, dev, test) reader outputs for one language and subtask."""
    return tuple(
        reader.get(lang=lang, subtask=subtask)
        for reader in (reader_train, reader_dev, reader_test)
    )


# English splits for subtasks 1-3.
EnTrainTask1, EnDevTask1, EnTestTask1 = _splits_for("EN", "1")
EnTrainTask2, EnDevTask2, EnTestTask2 = _splits_for("EN", "2")
EnTrainTask3, EnDevTask3, EnTestTask3 = _splits_for("EN", "3")

# Spanish splits for subtasks 1-3.
SpTrainTask1, SpDevTask1, SpTestTask1 = _splits_for("ES", "1")
SpTrainTask2, SpDevTask2, SpTestTask2 = _splits_for("ES", "2")
SpTrainTask3, SpDevTask3, SpTestTask3 = _splits_for("ES", "3")


## Import Code Functions

In [None]:
import os
import importlib.util
import sys
import inspect

functions_path = library_path

for filename in os.listdir(functions_path):
    # Only plain Python sources; skip files whose name starts with "__".
    if not filename.endswith(".py") or filename.startswith("__"):
        continue

    module_name = filename[:-3]
    file_path = os.path.join(functions_path, filename)

    # Load the file as a module and execute it.
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Publish every function found in the module into the global namespace.
    globals().update(inspect.getmembers(module, inspect.isfunction))

## Seeding

In [None]:
def set_seed(seed=1234):
    """
    Seed every random number generator used here, for reproducible experiments.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    configures cuDNN for deterministic behaviour, and sets PYTHONHASHSEED.

    Parameters:
    seed: the number to set the seed to
    Return: None
    """
    # Python's built-in RNG
    random.seed(seed)
    # NumPy's global RNG
    np.random.seed(seed)
    # PyTorch RNGs; the CUDA calls are silently ignored when CUDA is unavailable
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels. benchmark must be False: with benchmark
    # enabled cuDNN auto-tunes and may pick different algorithms between runs,
    # which defeats the determinism requested on the line above.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Recorded for child processes; it does not change hashing in the
    # already-running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)


## Exploratory Model Selection

### Task 1

In [None]:
import numpy as np
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder

# Candidate Hugging Face checkpoints to compare on Task 1.
model_names = [
    "cardiffnlp/twitter-roberta-base-2022-154m",
    "cardiffnlp/twitter-roberta-large-2022-154m",
    "cardiffnlp/twitter-xlm-roberta-base",
    "cardiffnlp/twitter-roberta-base",
    "sdadas/xlm-roberta-large-twitter",
    "g8a9/distilroberta-base-twitter-16M_aug-oct22",
    "andrea-t94/roberta-fine-tuned-twitter",
    "bdotloh/twitter-roberta-base-finetuned-twitter-user-desc",
]

# Training settings shared by every candidate.
params_twitter_roberta = dict(
    num_train_epochs=100,
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
)

# Requires EnTrainTask1 and EnDevTask1 to be loaded already; no test split is passed.
df_metrics = run_lora_experiments(
    task_num=1,
    model_names=model_names,
    params=params_twitter_roberta,
    trainInfo=EnTrainTask1,
    devInfo=EnDevTask1,
    testInfo=None,
)

print(df_metrics)

# Keep the winning checkpoint name for the following cells.
best_model_name = select_best_model(df_metrics, 1)



In [None]:
# Tune the LoRA hyperparameters for the selected Task 1 model so that
# `optimized` exists before it is unpacked below; this mirrors the Task 2 and
# Task 3 cells, which run the same search before their final training.
# NOTE(review): assumes optimize_lora_hyperparams returns a kwargs dict the
# pipeline accepts, as the Task 3 cell uses it — confirm.
optimized = optimize_lora_hyperparams(
    task_num=1,
    best_model_name=best_model_name,
    params_base=params_twitter_roberta,
    trainInfo=EnTrainTask1,
    devInfo=EnDevTask1,
    n_trials=20
)

print(optimized)

# Train the final Task 1 model with the tuned hyperparameters; no test split is passed.
final_model, final_metrics = sexism_classification_pipeline_task1_LoRA(
    EnTrainTask1, EnDevTask1, None,
    best_model_name, 2, "single_label_classification",
    **optimized
)

### Task 2

In [None]:
import numpy as np
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder

# Candidate Hugging Face checkpoints to compare on Task 2.
model_names = [
    "cardiffnlp/twitter-roberta-base-2022-154m",
    "cardiffnlp/twitter-roberta-large-2022-154m",
    "cardiffnlp/twitter-xlm-roberta-base",
    "cardiffnlp/twitter-roberta-base",
    "sdadas/xlm-roberta-large-twitter",
    "g8a9/distilroberta-base-twitter-16M_aug-oct22",
    "andrea-t94/roberta-fine-tuned-twitter",
    "bdotloh/twitter-roberta-base-finetuned-twitter-user-desc",
]

# Training settings shared by every candidate.
params_twitter_roberta = dict(
    num_train_epochs=100,
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
)

# Requires EnTrainTask2 and EnDevTask2 to be loaded already; no test split is passed.
df_metrics = run_lora_experiments(
    task_num=2,
    model_names=model_names,
    params=params_twitter_roberta,
    trainInfo=EnTrainTask2,
    devInfo=EnDevTask2,
    testInfo=None,
)

print(df_metrics)

# Keep the winning checkpoint name and report it.
best_model_name = select_best_model(df_metrics, 2)
print(f"→ Modelo ganador: {best_model_name}")



In [None]:
import numpy as np
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder

# Base training settings handed to the hyperparameter search as `params_base`.
params_twitter_roberta = {
    "num_train_epochs": 100,
    "learning_rate": 0.001,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "logging_steps": 100,
}

# NOTE(review): this hard-codes the checkpoint and replaces whatever
# select_best_model returned in the previous cell — confirm this is intended.
best_model_name = "cardiffnlp/twitter-roberta-base"

# Run a 20-trial hyperparameter search for Task 2 on the English train/dev splits.
optimized_2 = optimize_lora_hyperparams(
    task_num=2,
    best_model_name=best_model_name,
    params_base=params_twitter_roberta,
    trainInfo=EnTrainTask2,
    devInfo=EnDevTask2,
    n_trials=20
)


print(optimized_2)
# Log lines pasted from two earlier runs of the search:
# [I 2025-05-13 18:08:16,826] Trial 4 finished with value: 0.5675971449024997 and parameters: {'learning_rate': 0.0003822078857884255, 'r': 11, 'lora_alpha': 29}. Best is trial 4 with value: 0.5675971449024997.
#[I 2025-05-13 20:24:40,021] Trial 4 finished with value: 0.5597586424127885 and parameters: {'learning_rate': 9.1657096479339e-05, 'r': 8, 'lora_alpha': 59}. Best is trial 4 with value: 0.5597586424127885.


# prompt: #[I 2025-05-13 20:24:40,021] Trial 4 finished with value: 0.5597586424127885 and parameters: {'learning_rate': 9.1657096479339e-05, 'r': 8, 'lora_alpha': 59}. Best is trial 4 with value: 0.5597586424127885.
# convert it, together with the Twitter params, into a dictionary called optimized_2; the Twitter params must be included as well

# NOTE(review): this assignment overwrites the search result above with values
# copied by hand from the second logged trial (learning_rate, r, lora_alpha)
# merged with the base training settings — confirm the override is intended.
optimized_2 = {
    'num_train_epochs': 100,
    'learning_rate': 9.1657096479339e-05,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'logging_steps': 100,
    'r': 8,
    'lora_alpha': 59
}


In [None]:
# Train the final Task 2 model on the English train/dev splits (no test split
# is passed) using the checkpoint in best_model_name and the settings in
# optimized_2.
# NOTE(review): the positional 4 and "single_label_classification" are
# presumably the number of labels and the problem type — confirm against the
# pipeline's signature.
final_model_2, final_metrics_2 = sexism_classification_pipeline_task2_LoRA(
    EnTrainTask2, EnDevTask2, None,
    best_model_name, 4, "single_label_classification",
    **optimized_2
)


In [None]:
# prompt: build a pipeline to select the best model for task 3 by trying different Hugging Face transformers repositories

from functools import partial
from sklearn.preprocessing import MultiLabelBinarizer

class SexismDatasetMulti(Dataset):
    """Torch dataset that tokenizes one text per item and returns float label vectors.

    Parameters:
    texts: sequence of texts (pandas Series, NumPy array, or plain list)
    labels: per-example label vectors, aligned positionally with `texts`
    ids: per-example integer identifiers, aligned positionally with `texts`
    tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer)
    max_len: value passed to the tokenizer as `max_length`
    pad: value passed to the tokenizer as `padding`
    trunc: value passed to the tokenizer as `truncation`
    rt: value passed to the tokenizer as `return_tensors`

    Each item is a dict with 'input_ids', 'attention_mask', 'labels'
    (float tensor) and 'id' (long tensor).
    """

    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt'):
        # Accept pandas/NumPy containers (via .tolist()) as well as plain sequences.
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt

    @staticmethod
    def _at(seq, idx):
        """Return the element at position `idx`.

        pandas objects are indexed by label with `[]`, which misaligns with the
        positional `texts` list (or raises KeyError) when the index is not
        0..n-1, so `.iloc` is used whenever it is available.
        """
        return seq.iloc[idx] if hasattr(seq, "iloc") else seq[idx]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        # Call the tokenizer directly; `encode_plus` is the deprecated spelling.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt,
        )

        return {
            # The tokenizer output carries a batch dimension; flatten drops it.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self._at(self.labels, idx), dtype=torch.float),
            'id': torch.tensor(self._at(self.ids, idx), dtype=torch.long),
        }
# Example usage for Task 3.

# Candidate Hugging Face checkpoints to compare on Task 3.
model_names = [
    "cardiffnlp/twitter-roberta-base-2022-154m",
    "cardiffnlp/twitter-roberta-large-2022-154m",
    "cardiffnlp/twitter-xlm-roberta-base",
    "cardiffnlp/twitter-roberta-base",
    "sdadas/xlm-roberta-large-twitter",
    "g8a9/distilroberta-base-twitter-16M_aug-oct22",
    "andrea-t94/roberta-fine-tuned-twitter",
    "bdotloh/twitter-roberta-base-finetuned-twitter-user-desc",
]

# Training settings for Task 3; adjust epochs and learning rate as needed.
params_task3 = dict(
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_steps=10,
    early_stopping_patience=3,
)

# Pass a test split through testInfo if one is available.
df_metrics_task3 = run_lora_experiments(
    task_num=3,
    model_names=model_names,
    params=params_task3,
    trainInfo=EnTrainTask3,
    devInfo=EnDevTask3,
    testInfo=None,
)

print(df_metrics_task3)

# Keep the winning checkpoint name and report it.
best_model_task3 = select_best_model(df_metrics_task3, 3)
print(f"→ Best model for task 3: {best_model_task3}")


In [None]:

# Run a short hyperparameter search for the selected Task 3 checkpoint on the
# English train/dev splits.
optimized_task3 = optimize_lora_hyperparams(
    task_num=3,
    best_model_name=best_model_task3,
    params_base=params_task3,
    trainInfo=EnTrainTask3,
    devInfo=EnDevTask3,
    n_trials=5 # Reduced number of trials for quick testing
)

print(optimized_task3)

# Train the final Task 3 model, unpacking the search result as keyword
# arguments; no test split is passed.
# NOTE(review): the positional 5 and "multi_label_classification" are
# presumably the number of labels and the problem type — confirm against the
# pipeline's signature.
final_model_task3, final_metrics_task3 = sexism_classification_pipeline_task3_LoRA(
    EnTrainTask3, EnDevTask3, None,
    best_model_task3, 5, "multi_label_classification",
    **optimized_task3
)