# Project EXIST 2025

## Libraries

In [None]:
COLAB =True # IF YOU USE GOOGLE COLAB -> COLAB = True
PIP = True # IF YOU NEED INSTALL LIBRARIES -> PIP = True

if PIP:
    !pip install transformers --upgrade
    !pip install datasets accelerate
    !pip install evaluate
    !pip install -U PyEvALL

!pip install torch
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install -U optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
# Standard libraries
import os
import sys
import tempfile
import time
import ast
import json
import random

# Data manipulation
import numpy as np
import pandas as pd

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# PEFT (Parameter-Efficient Fine-Tuning)
from peft import LoraConfig, get_peft_model, TaskType

# Evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Optuna for hyperparameter tuning
import optuna

# PyEvALL for evaluation
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.reports.reports import PyEvALLReport
from pyevall.utils.utils import PyEvALLUtils


## Drive and Preloaded functions



In [None]:
# Pick the project root: Google Drive when running on Colab, the parent
# directory otherwise.
if COLAB is not True:
    base_path = ".."
else:
    from google.colab import drive

    drive.mount('/content/drive')
    base_path = "/content/drive/MyDrive"

# Put the project root on the import path before importing the dataset reader
# (presumably readerEXIST2025_2.py lives there -- verify for your setup).
library_path = base_path
sys.path.append(library_path)
from readerEXIST2025_2 import EXISTReader

Mounted at /content/drive


In [None]:
# Path to the dataset; adapt it to wherever the dataset is stored.
dataset_path = os.path.join(base_path, "EXIST_2025_Dataset_V0.3/")

file_train = os.path.join(dataset_path, "EXIST2025_training.json")
file_dev = os.path.join(dataset_path, "EXIST2025_dev.json")
file_test = os.path.join(dataset_path, "EXIST2025_test_clean.json")

reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)
reader_test = EXISTReader(file_test)


def _load_splits(lang, subtask):
    """Return the (train, dev, test) data of one language and subtask."""
    return tuple(
        reader.get(lang=lang, subtask=subtask)
        for reader in (reader_train, reader_dev, reader_test)
    )


# English splits, one triple per subtask
EnTrainTask1, EnDevTask1, EnTestTask1 = _load_splits("EN", "1")
EnTrainTask2, EnDevTask2, EnTestTask2 = _load_splits("EN", "2")
EnTrainTask3, EnDevTask3, EnTestTask3 = _load_splits("EN", "3")

# Spanish splits, one triple per subtask
SpTrainTask1, SpDevTask1, SpTestTask1 = _load_splits("ES", "1")
SpTrainTask2, SpDevTask2, SpTestTask2 = _load_splits("ES", "2")
SpTrainTask3, SpDevTask3, SpTestTask3 = _load_splits("ES", "3")

# List the dataset directory that was actually used above (the previous
# hard-coded "../../corpora/..." path did not match dataset_path).
print(*sorted(os.listdir(dataset_path)), sep="\n")

ls: cannot access '../../corpora/EXIST_2025_Dataset_V0.3/': No such file or directory


## Seeding

In [None]:
def set_seed(seed=1234):
    """
    Sets the seed to make everything deterministic, for reproducibility of experiments.

    Seeds Python's `random`, NumPy and PyTorch (CPU and every CUDA device) and
    configures cuDNN for deterministic behaviour.

    Parameters:
    seed: the number to set the seed to
    Return: None
    """
    # Random seed
    random.seed(seed)
    # Numpy seed
    np.random.seed(seed)
    # Torch seed (CPU, then all CUDA devices rather than only the current one)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune its algorithms, which can differ
    # between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False
    # os seed: hash randomisation is fixed at interpreter start-up, so this
    # only influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)


## Classes and Functions

In [None]:
class SexismDataset(Dataset):
    """
    Torch dataset that tokenizes one text per item for single-label classification.

    Parameters:
    texts: the input texts; a pandas Series, NumPy array or any plain sequence
    labels: integer class labels, indexable by position
    ids: integer sample identifiers, indexable by position
    tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer)
    max_len: maximum sequence length passed to the tokenizer
    pad: padding strategy passed to the tokenizer
    trunc: truncation setting passed to the tokenizer
    rt: tensor type returned by the tokenizer ('pt' for PyTorch tensors)
    """

    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True, rt='pt'):
        # Accept plain lists/tuples as well as objects exposing .tolist()
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text, label and id of the sample at position idx."""
        text = str(self.texts[idx])
        # Calling the tokenizer directly replaces the deprecated encode_plus
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt
        )

        return {
            # The tokenizer output carries a leading batch axis of size 1; drop it
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            'id': torch.tensor(self.ids[idx], dtype=torch.long)
        }

### Evaluation Metrics

In [None]:
def compute_metrics_1(pred):
    """
    Metrics for the binary task: accuracy plus precision/recall/F1 computed
    with average='binary'.

    Parameters:
    pred: prediction object exposing `label_ids` and `predictions` (logits)
    Return: dict with 'accuracy', 'f1', 'precision' and 'recall'
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest logit
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1]
    }

def compute_metrics_2(pred):
    """
    Metrics for the multi-class task: accuracy plus macro-averaged
    precision, recall and F1.

    Parameters:
    pred: prediction object exposing `label_ids` and `predictions` (logits)
    Return: dict with 'accuracy', 'f1', 'precision' and 'recall'
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)  # most likely class per sample
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, guessed, average='macro', zero_division=0
    )
    results = dict(accuracy=accuracy_score(gold, guessed))
    results['f1'] = macro_f1
    results['precision'] = macro_precision
    results['recall'] = macro_recall
    return results

def compute_metrics_3(pred, lencoder):
    """
    Metrics for the multi-label task.

    Logits are passed through a sigmoid and thresholded at 0.5 to obtain a
    binary indicator matrix, which is compared against the gold labels.

    Parameters:
    pred: prediction object exposing `label_ids` and `predictions` (logits)
    lencoder: fitted label encoder whose `inverse_transform` maps indicator
              rows back to label names (needed by the ICM computation)
    Return: dict with 'precision_macro', 'recall_macro', 'f1_macro' and 'ICM',
            plus 'macro_f1', 'micro_f1' and 'subset_accuracy', which are the
            names select_best_model reads (with the 'eval_' prefix) for task 3
    """
    labels = pred.label_ids
    preds = torch.sigmoid(torch.tensor(pred.predictions)).numpy()
    preds_binary = (preds >= 0.5).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds_binary, average=None, zero_division=0
    )
    # On indicator matrices accuracy_score is the exact-match (subset) accuracy
    subset_accuracy = accuracy_score(labels, preds_binary)
    micro_f1 = f1_score(labels, preds_binary, average='micro', zero_division=0)
    icm = ICMWrapper(lencoder.inverse_transform(preds_binary), lencoder.inverse_transform(labels), multi=True)
    # Macro averages over the per-label scores
    precision_macro = np.mean(precision)
    recall_macro = np.mean(recall)
    f1_macro = np.mean(f1)
    return {
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'ICM': icm,
        'macro_f1': f1_macro,
        'micro_f1': micro_f1,
        'subset_accuracy': subset_accuracy
    }

def ICMWrapper(pred, labels, multi=False, ids=None):
    """
    Compute the ICM metric with PyEvALL for a set of predictions.

    Predictions and gold labels are written to temporary JSON files (the
    evaluator is called with file paths), evaluated, and the files are removed
    afterwards even if the evaluation fails.

    Parameters:
    pred: predicted labels, one entry per sample (a collection of labels per
          sample when multi is True)
    labels: gold labels, in the same format and order as pred
    multi: True for the multi-label setting, which uses the "embedded" report
           and the label hierarchy defined below; False uses the "simple" report
    ids: optional sample ids; defaults to 0..len(labels)-1
    Return: the ICM 'average_per_test_case' value as a float, or None when the
            report does not contain it
    """
    test = PyEvALLEvaluation()
    metrics = [MetricFactory.ICM.value]
    params = dict()
    if multi:
        params[PyEvALLUtils.PARAM_REPORT] = "embedded"
        hierarchy = {"True": ['IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE'],
                     "False": []}
        params[PyEvALLUtils.PARAM_HIERARCHY] = hierarchy
        # A sample without any label is represented by the "False" node
        fillLabel = lambda x: ["False"] if len(x) == 0 else x
    else:
        params[PyEvALLUtils.PARAM_REPORT] = "simple"
        fillLabel = lambda x: str(x)

    if ids is None:
        ids = list(range(len(labels)))

    tmp_paths = []

    def _dump(values):
        """Write one set of labels to a temporary JSON file and return its path."""
        frame = pd.DataFrame({'test_case': ['EXIST2025'] * len(values),
                              'id': [str(x) for x in ids],
                              'value': [fillLabel(x) for x in values]})
        if multi:
            frame = frame.astype('object')
        with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as handle:
            # Register the path first so it is cleaned up even if writing fails
            tmp_paths.append(handle.name)
            handle.write(frame.to_json(orient="records"))
        return handle.name

    try:
        truth_name = _dump(labels)
        predict_name = _dump(pred)
        report = test.evaluate(predict_name, truth_name, metrics, **params)
    finally:
        for path in tmp_paths:
            os.unlink(path)

    icm = None
    if 'metrics' in report.report:
        if 'ICM' in report.report["metrics"]:
            icm = float(report.report["metrics"]['ICM']["results"]["average_per_test_case"])
    return icm


In [None]:
######################################CHANGE###############################################
from peft import LoraConfig, get_peft_model, TaskType
###########################################################################################

def sexism_classification_pipeline_task1_LoRA(trainInfo, devInfo, testInfo=None, model_name='roberta-base', nlabels=2, ptype="single_label_classification", **args):
    """
    Fine-tune a sequence classifier with LoRA for TASK 1 (binary classification).

    Parameters:
      - trainInfo, devInfo: indexable as [0] ids, [1] texts, [2] labels.
      - testInfo: optional, indexable as [0] ids, [1] texts; when given, test
        predictions are written to a CSV file.
      - model_name: name (or path) of the base model.
      - nlabels: number of classes.
      - ptype: problem type passed to the model.
      - **args: optional overrides for the LoRA configuration ("task_type",
        "target_modules", "rank", "lora_alpha", "lora_dropout", "bias"), the
        training arguments (same names as in TrainingArguments),
        "early_stopping_patience", and the output locations "adapter_dir",
        "merged_dir" and "predictions_file".

    Returns:
      - (mixModel, submission_df) when testInfo is given, otherwise
        (mixModel, eval_results); mixModel is the base model with the LoRA
        weights merged in.
    """
    # Model and Tokenizer
    labelEnc = LabelEncoder()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype
    )

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=args.get("task_type", TaskType.SEQ_CLS),
        target_modules=args.get("target_modules", ["query", "value"]),
        r=args.get("rank", 64),  # Rank of LoRA adaptation
        lora_alpha=args.get("lora_alpha", 32),  # Scaling factor
        lora_dropout=args.get("lora_dropout", 0.1),
        bias=args.get("bias", "none")
    )

    # Wrap the base model with the LoRA adapters
    peft_model = get_peft_model(model, lora_config)

    # Prepare datasets (the label encoder is fitted on the training labels only)
    train_dataset = SexismDataset(trainInfo[1], labelEnc.fit_transform(trainInfo[2]), [int(x) for x in trainInfo[0]], tokenizer)
    val_dataset = SexismDataset(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none",  # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir=args.get('output_dir', './results_task1_LoRA0'),
        num_train_epochs=args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay', 0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy', 'epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model', "f1"),
        # Stated explicitly because the Trainer warns that it cannot infer the
        # label names of a PEFT-wrapped model
        label_names=args.get('label_names', ["labels"])
    )

    # Initialize Trainer with the LoRA model
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_1,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience", 3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    # Save the adapter checkpoint (LoRA matrices only, not the full base model).
    # NOTE(review): with TaskType.SEQ_CLS, PEFT is expected to store the
    # classification head in the adapter checkpoint as well -- confirm against
    # the installed PEFT version.
    trainer.save_model(args.get('adapter_dir', './final_best_model_LoRA'))

    # Merge the LoRA matrices into the base weights and save the full model
    mixModel = peft_model.merge_and_unload()
    mixModel.save_pretrained(args.get('merged_dir', "./final_best_model_mixpeft"))

    if testInfo is not None:
        # Prepare test dataset for prediction (dummy labels, only predictions are used)
        test_dataset = SexismDataset(testInfo[1], [0] * len(testInfo[1]), [int(x) for x in testInfo[0]], tokenizer)

        # Predict test set labels
        predictions = trainer.predict(test_dataset)
        predicted_labels = np.argmax(predictions.predictions, axis=1)

        # Create submission DataFrame
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            "test_case": ["EXIST2025"] * len(predicted_labels)
        })
        predictions_file = args.get('predictions_file', 'sexism_predictions_task1.csv')
        submission_df.to_csv(predictions_file, index=False)
        print(f"Prediction for TASK 1 completed. Results saved to {predictions_file}")
        return mixModel, submission_df
    return mixModel, eval_results



def sexism_classification_pipeline_task2_LoRA(trainInfo, devInfo, testInfo=None, model_name='roberta-base', nlabels=4, ptype="single_label_classification", **args):
    """
    Fine-tune a sequence classifier with LoRA for TASK 2 (multi-class classification).

    Parameters:
      - trainInfo, devInfo: indexable as [0] ids, [1] texts, [2] labels.
      - testInfo: optional, indexable as [0] ids, [1] texts; when given, test
        predictions are written to a CSV file.
      - model_name: name (or path) of the base model.
      - nlabels: number of classes.
      - ptype: problem type passed to the model.
      - **args: optional overrides for the LoRA configuration ("task_type",
        "target_modules", "rank", "lora_alpha", "lora_dropout", "bias"), the
        training arguments (same names as in TrainingArguments),
        "early_stopping_patience", and the output locations "adapter_dir",
        "merged_dir" and "predictions_file".

    Returns:
      - (mixModel, submission_df) when testInfo is given, otherwise
        (mixModel, eval_results); mixModel is the base model with the LoRA
        weights merged in.
    """
    # Model and Tokenizer
    labelEnc = LabelEncoder()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype
    )

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=args.get("task_type", TaskType.SEQ_CLS),
        target_modules=args.get("target_modules", ["query", "value"]),
        r=args.get("rank", 64),  # Rank of LoRA adaptation
        lora_alpha=args.get("lora_alpha", 32),  # Scaling factor
        lora_dropout=args.get("lora_dropout", 0.1),
        bias=args.get("bias", "none"),
    )

    # Wrap the base model with the LoRA adapters
    peft_model = get_peft_model(model, lora_config)

    # Prepare datasets (the label encoder is fitted on the training labels only)
    train_dataset = SexismDataset(trainInfo[1], labelEnc.fit_transform(trainInfo[2]), [int(x) for x in trainInfo[0]], tokenizer)
    val_dataset = SexismDataset(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none",  # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir=args.get('output_dir', './results_task2_LoRA0'),
        num_train_epochs=args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay', 0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy', 'epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model', "f1"),  # F1 for the competition
        # Stated explicitly because the Trainer warns that it cannot infer the
        # label names of a PEFT-wrapped model
        label_names=args.get('label_names', ["labels"])
    )

    # Initialize Trainer with the LoRA model
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_2,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience", 3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    # Save the adapter checkpoint (LoRA matrices only, not the full base model).
    # NOTE(review): with TaskType.SEQ_CLS, PEFT is expected to store the
    # classification head in the adapter checkpoint as well -- confirm against
    # the installed PEFT version.
    trainer.save_model(args.get('adapter_dir', './final_best_model_LoRA_2'))

    # Merge the LoRA matrices into the base weights and save the full model
    mixModel = peft_model.merge_and_unload()
    mixModel.save_pretrained(args.get('merged_dir', "./final_best_model_mixpeft_2"))

    if testInfo is not None:
        # Prepare test dataset for prediction (dummy labels, only predictions are used)
        test_dataset = SexismDataset(testInfo[1], [0] * len(testInfo[1]), [int(x) for x in testInfo[0]], tokenizer)

        # Predict test set labels
        predictions = trainer.predict(test_dataset)
        predicted_labels = np.argmax(predictions.predictions, axis=1)

        # Create submission DataFrame
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            "test_case": ["EXIST2025"] * len(predicted_labels)
        })
        predictions_file = args.get('predictions_file', 'sexism_predictions_task2.csv')
        submission_df.to_csv(predictions_file, index=False)
        # The message previously named the task 1 file although task 2's was written
        print(f"Prediction for TASK 2 completed. Results saved to {predictions_file}")
        return mixModel, submission_df
    return mixModel, eval_results


def sexism_classification_pipeline_task3_LoRA(trainInfo, devInfo, testInfo=None,
                                              model_name='roberta-base',
                                              nlabels=6,
                                              ptype="multi_label_classification",
                                              **args):
    """
    Pipeline for multi-label classification (TASK 3) with LoRA adaptation.

    Parameters:
      - trainInfo, devInfo, testInfo: structures expected to look like:
          trainInfo = (ids, texts, labels)
          devInfo   = (ids, texts, labels)
          testInfo  = (ids, texts, _dummy_)  [dummy labels are generated for test]
      - model_name: name (or path) of the base model.
      - nlabels: number of labels.
      - ptype: problem type for the model ('multi_label_classification').
      - **args: optional overrides for hyperparameters (e.g. learning_rate,
        num_train_epochs), the LoRA configuration, and the output locations
        "adapter_dir", "merged_dir" and "predictions_file".

    Returns:
      - mixModel: final model (LoRA merged into the base weights).
      - submission_df or eval_results: depending on whether testInfo is given.
    """
    # Imported locally: neither name is imported at the top of the notebook
    from functools import partial
    from sklearn.preprocessing import MultiLabelBinarizer

    # Label encoder for multi-label targets
    labelEnc = MultiLabelBinarizer()

    # Model and Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype
    )

    # LoRA configuration
    lora_config = LoraConfig(
        task_type=args.get("task_type", TaskType.SEQ_CLS),
        target_modules=args.get("target_modules", ["query", "value"]),
        r=args.get("rank", 64),                 # Rank of LoRA adaptation
        lora_alpha=args.get("lora_alpha", 32),  # LoRA scaling factor
        lora_dropout=args.get("lora_dropout", 0.1),
        bias=args.get("bias", "none")
    )
    # Apply LoRA to the base model
    peft_model = get_peft_model(base_model, lora_config)

    # Datasets for multi-label classification.
    # NOTE(review): SexismDatasetMulti is not defined in this part of the
    # notebook -- make sure it is defined before this function is called.
    train_dataset = SexismDatasetMulti(
        texts=trainInfo[1],
        labels=labelEnc.fit_transform(trainInfo[2]),
        ids=[int(x) for x in trainInfo[0]],
        tokenizer=tokenizer
    )
    val_dataset = SexismDatasetMulti(
        texts=devInfo[1],
        labels=labelEnc.transform(devInfo[2]),
        ids=[int(x) for x in devInfo[0]],
        tokenizer=tokenizer
    )

    # Training arguments
    training_args = TrainingArguments(
        report_to="none",  # Alternatives: "wandb", "tensorboard", etc.
        output_dir=args.get('output_dir', './results_task3_LoRA'),
        num_train_epochs=args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay', 0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy', 'epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model', "ICM"),
        # Stated explicitly because the Trainer warns that it cannot infer the
        # label names of a PEFT-wrapped model
        label_names=args.get('label_names', ["labels"])
    )

    # Trainer with the multi-label metric function (compute_metrics_3), which
    # needs the fitted encoder to map indicator rows back to label names
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=partial(compute_metrics_3, lencoder=labelEnc),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience", 3))]
    )

    # Training
    trainer.train()

    # Evaluation on the validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    # Save the adapter checkpoint. The default directory now carries the "_3"
    # suffix so it no longer overwrites the task 1 adapter directory.
    trainer.save_model(args.get('adapter_dir', './final_best_model_LoRA_3'))
    # Merge the LoRA matrices into the base weights to obtain the full model
    mixModel = peft_model.merge_and_unload()
    mixModel.save_pretrained(args.get('merged_dir', "./final_best_model_mixpeft_3"))

    if testInfo is not None:
        # Test dataset with dummy labels (only predictions are made on test)
        test_dataset = SexismDatasetMulti(
            texts=testInfo[1],
            labels=[[0] * nlabels for _ in range(len(testInfo[1]))],
            ids=[int(x) for x in testInfo[0]],
            tokenizer=tokenizer
        )

        # Predict
        predictions = trainer.predict(test_dataset)
        # Apply a sigmoid and a 0.5 threshold to obtain the final labels
        predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
        predicted_labels = (predicted_probs >= 0.5).astype(int)

        # Build the submission DataFrame
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            'test_case': ["EXIST2025"] * len(predicted_labels)
        })
        predictions_file = args.get('predictions_file', 'sexism_predictions_task3_LoRA.csv')
        submission_df.to_csv(predictions_file, index=False)
        print(f"Prediction for TASK 3 (LoRA) completed. Results saved to {predictions_file}")
        return mixModel, submission_df

    return mixModel, eval_results

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder

def run_lora_experiments(
    task_num: int,
    model_names: list,
    params: dict,
    trainInfo,
    devInfo,
    testInfo=None
) -> pd.DataFrame:
    """
    Runs a batch of LoRA fine-tuning experiments for task 1, 2 or 3.

    Args:
        task_num:       1, 2 or 3 – which sexism_classification_pipeline_taskX_LoRA to call.
        model_names:    List of HF model names (strings).
        params:         Dict of hyperparameters to pass as **kwargs.
        trainInfo:      Your train tuple (ids, texts, labels).
        devInfo:        Your dev tuple (ids, texts, labels).
        testInfo:       Optional test tuple (ids, texts) for predictions. When
                        given, the pipelines return a submission DataFrame
                        instead of validation metrics, so the rows of the
                        result only contain the model name.

    Returns:
        DataFrame with one row per model, columns = ['model', *eval_metrics..., 'epoch'].
        An empty `model_names` yields an empty DataFrame with a 'model' column.

    Raises:
        ValueError: if task_num is not 1, 2 or 3.
    """
    # map task number → (pipeline_fn, nlabels, problem_type)
    pipeline_map = {
        1: (sexism_classification_pipeline_task1_LoRA, 2, "single_label_classification"),
        2: (sexism_classification_pipeline_task2_LoRA, 4, "single_label_classification"),
        3: (sexism_classification_pipeline_task3_LoRA, 6, "multi_label_classification"),
    }
    if task_num not in pipeline_map:
        raise ValueError(f"Unsupported task {task_num}. Choose 1, 2 or 3.")

    pipeline_fn, nlabels, ptype = pipeline_map[task_num]
    metrics_list = []

    for model_name in model_names:
        print(f"→ Running task{task_num} LoRA on {model_name!r} …")
        # pipeline returns (model, eval_results) if testInfo is None,
        # (model, submission_df) otherwise
        _, outcome = pipeline_fn(
            trainInfo,
            devInfo,
            testInfo,
            model_name,
            nlabels,
            ptype,
            **params
        )
        row = {"model": model_name}
        if isinstance(outcome, dict):
            # eval_results is a dict like {'eval_accuracy':…, 'eval_f1':…, …, 'epoch':…}
            row.update(outcome)
        else:
            # A submission DataFrame carries no metrics; merging it into the
            # row would fill the table with prediction columns.
            print(f"  (no validation metrics recorded for {model_name!r}: "
                  "the pipeline returned test predictions)")
        metrics_list.append(row)

    if not metrics_list:
        return pd.DataFrame(columns=["model"])

    df = pd.DataFrame(metrics_list)
    # reorder cols so 'model' comes first
    cols = ["model"] + [c for c in df.columns if c != "model"]
    return df[cols]


def select_best_model(df: pd.DataFrame, task_num: int) -> str:
    """
    Rank the models in a metrics DataFrame (as returned by run_lora_experiments)
    by a task-specific weighted sum of metrics and return the top model's name.

    The weighted sum is stored in a new 'score' column on `df` itself, i.e.
    the caller's DataFrame is modified in place.

    Raises:
        ValueError: if task_num is not 1, 2 or 3.
    """
    # (weight, column) pairs that make up the composite score of each task
    if task_num == 1:
        weighted_columns = ((0.6, "eval_f1"), (0.4, "eval_accuracy"))
    elif task_num == 2:
        weighted_columns = (
            (0.5, "eval_f1"),
            (0.3, "eval_accuracy"),
            (0.2, "eval_precision"),
        )
    elif task_num == 3:
        weighted_columns = (
            (0.4, "eval_micro_f1"),
            (0.4, "eval_macro_f1"),
            (0.2, "eval_subset_accuracy"),
        )
    else:
        raise ValueError(f"Unknown task {task_num}")

    # Accumulate the terms left to right, starting from the first one
    (lead_weight, lead_column), *other_terms = weighted_columns
    composite = lead_weight * df[lead_column]
    for weight, column in other_terms:
        composite = composite + weight * df[column]
    df["score"] = composite

    winner = df.sort_values("score", ascending=False).iloc[0]
    winner_name = winner["model"]
    winner_score = winner["score"]
    print(f"→ Winning model for task{task_num}: {winner_name} (score={winner_score:.4f})")
    return winner_name


def optimize_lora_hyperparams(
    task_num: int,
    best_model_name: str,
    params_base: dict,
    trainInfo,
    devInfo,
    n_trials: int = 20
) -> dict:
    """
    Runs an Optuna search to tune LoRA hyperparameters for the given task
    and model checkpoint. Returns the optimized params dict.

    Each trial trains a fresh LoRA-wrapped model for 10 epochs and is scored
    on the validation set: F1 for tasks 1 and 2, ICM for task 3.

    Args:
        task_num:         1, 2 or 3 – selects number of labels & compute_metrics.
        best_model_name:  HF model checkpoint chosen after initial comparison.
        params_base:      Base params dict (num_train_epochs, batch_sizes, etc.).
        trainInfo:        Tuple (ids, texts, labels) for training.
        devInfo:          Tuple (ids, texts, labels) for validation.
        n_trials:         Number of Optuna trials to run.

    Returns:
        A copy of params_base updated with the best "learning_rate", "r" and
        "lora_alpha". The best rank is also stored under "rank", which is the
        key the sexism_classification_pipeline_taskX_LoRA functions read.

    Raises:
        ValueError: if task_num is not 1, 2 or 3.
    """
    # Imported locally: neither name is imported at the top of the notebook
    from functools import partial
    from sklearn.preprocessing import MultiLabelBinarizer

    # 1) Map task → (nlabels, problem_type, metric key to maximize)
    task_setup = {
        1: (2, "single_label_classification", "eval_f1"),
        2: (4, "single_label_classification", "eval_f1"),
        3: (6, "multi_label_classification", "eval_ICM"),
    }
    if task_num not in task_setup:
        raise ValueError("task_num must be 1, 2, or 3")
    nlabels, ptype, objective_key = task_setup[task_num]

    # 2) Prepare tokenizer, label encoder, metric function and PyTorch datasets
    tokenizer = AutoTokenizer.from_pretrained(best_model_name)
    if task_num == 3:
        # Multi-label: same components as the task 3 pipeline.
        # NOTE(review): SexismDatasetMulti is not defined in this part of the
        # notebook -- make sure it is defined before tuning task 3.
        label_enc = MultiLabelBinarizer()
        dataset_cls = SexismDatasetMulti
        # compute_metrics_3 needs the fitted encoder as a second argument
        compute_metrics = partial(compute_metrics_3, lencoder=label_enc)
    else:
        label_enc = LabelEncoder()
        dataset_cls = SexismDataset
        compute_metrics = compute_metrics_1 if task_num == 1 else compute_metrics_2

    y_train = label_enc.fit_transform(trainInfo[2])
    y_dev = label_enc.transform(devInfo[2])

    train_dataset = dataset_cls(
        texts=trainInfo[1],
        labels=y_train,
        ids=[int(x) for x in trainInfo[0]],
        tokenizer=tokenizer
    )
    eval_dataset = dataset_cls(
        texts=devInfo[1],
        labels=y_dev,
        ids=[int(x) for x in devInfo[0]],
        tokenizer=tokenizer
    )

    # 3) Define Optuna objective
    def objective(trial):
        # 3.1) Suggest hyperparameters
        lr = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
        r = trial.suggest_int("r", 8, 128, log=True)
        lora_alpha = trial.suggest_int("lora_alpha", 8, 64, log=True)

        # 3.2) Load a fresh base model & apply LoRA
        base_model = AutoModelForSequenceClassification.from_pretrained(
            best_model_name, num_labels=nlabels, problem_type=ptype
        )
        lora_cfg = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            target_modules=["query", "value"],
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=0.1,
            bias="none"
        )
        peft_model = get_peft_model(base_model, lora_cfg)

        # 3.3) Training arguments; batch sizes fall back to the pipeline defaults
        args = TrainingArguments(
            output_dir=f"optuna_trial_{trial.number}",
            num_train_epochs=10,
            per_device_train_batch_size=params_base.get("per_device_train_batch_size", 16),
            per_device_eval_batch_size=params_base.get("per_device_eval_batch_size", 64),
            learning_rate=lr,
            logging_steps=params_base.get("logging_steps", 100),
            eval_strategy="epoch",
            save_strategy="no",
            load_best_model_at_end=False,
            report_to="none",
            # Stated explicitly because the Trainer warns that it cannot infer
            # the label names of a PEFT-wrapped model
            label_names=["labels"]
        )

        # 3.4) Trainer
        trainer = Trainer(
            model=peft_model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics
        )

        # 3.5) Train & evaluate
        trainer.train()
        res = trainer.evaluate()
        return res[objective_key]

    # 4) Run Optuna study
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    best_params = study.best_params
    print("✅ Optimized LoRA hyperparameters:", best_params)

    # 5) Merge with base params and return
    optimized_params = params_base.copy()
    optimized_params.update(best_params)
    # The pipelines read the LoRA rank from "rank", not "r"
    optimized_params["rank"] = best_params["r"]
    return optimized_params


# Seed all random number generators (default seed 1234) before the experiments below.
set_seed()


# Model Evaluation

# Task 1

In [None]:
import numpy as np
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder

# Candidate Hugging Face checkpoints; each one is passed to
# run_lora_experiments below and evaluated on the Task 1 dev split.
model_names = [
    "cardiffnlp/twitter-roberta-base-2022-154m",
    "cardiffnlp/twitter-roberta-large-2022-154m",
    "cardiffnlp/twitter-xlm-roberta-base",
    "cardiffnlp/twitter-roberta-base",
    "sdadas/xlm-roberta-large-twitter",
    "g8a9/distilroberta-base-twitter-16M_aug-oct22",
    "andrea-t94/roberta-fine-tuned-twitter",
    "bdotloh/twitter-roberta-base-finetuned-twitter-user-desc"
]

# Training settings shared by every candidate model; the same dict is later
# reused as the base parameters for the Optuna search.
# NOTE(review): in the recorded output both *-large checkpoints collapse to
# single-class predictions (F1 of 0.0 or recall of 1.0) after the first epoch
# with learning_rate=0.001 - consider a lower rate for those models.
# NOTE(review): the recorded runs stop after 4-9 epochs, so num_train_epochs
# presumably acts as an upper bound with early stopping applied inside
# run_lora_experiments - confirm.
params_twitter_roberta = {
    "num_train_epochs": 100,
    "learning_rate": 0.001,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "logging_steps": 100,
}

# Assumes EnTrainTask1 and EnDevTask1 have already been defined by earlier cells.
# No test split is supplied (testInfo=None).
df_metrics = run_lora_experiments(
    task_num=1,
    model_names=model_names,
    params=params_twitter_roberta,
    trainInfo=EnTrainTask1,
    devInfo=EnDevTask1,
    testInfo=None
)

print(df_metrics)
# NOTE(review): the second argument is presumably the task number (matching
# task_num=1 above) - confirm against select_best_model's signature.
best_model_name = select_best_model(df_metrics, 1)



→ Running task1 LoRA on 'cardiffnlp/twitter-roberta-base-2022-154m' …


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2022-154m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4748,0.376046,0.84009,0.825553,0.788732,0.865979
2,0.4559,0.370179,0.81982,0.809524,0.752212,0.876289
3,0.4005,0.529913,0.842342,0.830097,0.784404,0.881443
4,0.3622,0.449392,0.846847,0.822917,0.831579,0.814433
5,0.3229,0.567762,0.86036,0.835979,0.858696,0.814433
6,0.3863,0.650543,0.824324,0.782123,0.853659,0.721649
7,0.3047,0.864531,0.806306,0.754286,0.846154,0.680412
8,0.2887,0.920907,0.849099,0.83208,0.809756,0.85567


Validation Results: {'eval_loss': 0.5677616000175476, 'eval_accuracy': 0.8603603603603603, 'eval_f1': 0.8359788359788359, 'eval_precision': 0.8586956521739131, 'eval_recall': 0.8144329896907216, 'eval_runtime': 3.1745, 'eval_samples_per_second': 139.866, 'eval_steps_per_second': 17.641, 'epoch': 8.0}
→ Running task1 LoRA on 'cardiffnlp/twitter-roberta-large-2022-154m' …


tokenizer_config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-large-2022-154m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5405,0.630194,0.704955,0.50566,0.943662,0.345361
2,0.7387,0.694174,0.436937,0.60815,0.436937,1.0
3,0.7116,0.702667,0.436937,0.60815,0.436937,1.0
4,0.7196,0.685708,0.563063,0.0,0.0,0.0
5,0.7064,0.704488,0.563063,0.0,0.0,0.0


Validation Results: {'eval_loss': 0.6941737532615662, 'eval_accuracy': 0.4369369369369369, 'eval_f1': 0.6081504702194357, 'eval_precision': 0.4369369369369369, 'eval_recall': 1.0, 'eval_runtime': 10.4229, 'eval_samples_per_second': 42.599, 'eval_steps_per_second': 5.373, 'epoch': 5.0}
→ Running task1 LoRA on 'cardiffnlp/twitter-xlm-roberta-base' …


config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.506,0.505598,0.790541,0.794702,0.694981,0.927835
2,0.4891,0.407931,0.822072,0.797954,0.791878,0.804124
3,0.4425,0.408927,0.842342,0.820513,0.816327,0.824742
4,0.4197,0.618279,0.77027,0.703488,0.806667,0.623711
5,0.3614,0.548644,0.837838,0.818182,0.80198,0.835052
6,0.4058,0.507938,0.842342,0.818653,0.822917,0.814433


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Validation Results: {'eval_loss': 0.40892738103866577, 'eval_accuracy': 0.8423423423423423, 'eval_f1': 0.8205128205128205, 'eval_precision': 0.8163265306122449, 'eval_recall': 0.8247422680412371, 'eval_runtime': 3.2099, 'eval_samples_per_second': 138.324, 'eval_steps_per_second': 17.446, 'epoch': 6.0}
→ Running task1 LoRA on 'cardiffnlp/twitter-roberta-base' …


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.488,0.378719,0.822072,0.814118,0.748918,0.891753
2,0.4703,0.338744,0.846847,0.827411,0.815,0.840206
3,0.4074,0.38838,0.851351,0.820652,0.867816,0.778351
4,0.3447,0.425506,0.849099,0.828645,0.822335,0.835052
5,0.3393,0.780859,0.842342,0.80226,0.8875,0.731959
6,0.2878,0.442068,0.86036,0.845771,0.817308,0.876289
7,0.2899,0.723661,0.826577,0.778098,0.882353,0.695876
8,0.2512,0.745471,0.844595,0.810959,0.865497,0.762887
9,0.3182,0.607236,0.84009,0.825553,0.788732,0.865979


Validation Results: {'eval_loss': 0.442067950963974, 'eval_accuracy': 0.8603603603603603, 'eval_f1': 0.845771144278607, 'eval_precision': 0.8173076923076923, 'eval_recall': 0.8762886597938144, 'eval_runtime': 3.2464, 'eval_samples_per_second': 136.767, 'eval_steps_per_second': 17.25, 'epoch': 9.0}
→ Running task1 LoRA on 'sdadas/xlm-roberta-large-twitter' …


tokenizer_config.json:   0%|          | 0.00/469 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at sdadas/xlm-roberta-large-twitter and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5612,0.456708,0.853604,0.826667,0.856354,0.798969
2,0.7729,0.84609,0.563063,0.0,0.0,0.0
3,0.7441,0.862655,0.563063,0.0,0.0,0.0
4,0.7794,0.726965,0.436937,0.60815,0.436937,1.0


Validation Results: {'eval_loss': 0.4567076861858368, 'eval_accuracy': 0.8536036036036037, 'eval_f1': 0.8266666666666667, 'eval_precision': 0.856353591160221, 'eval_recall': 0.7989690721649485, 'eval_runtime': 10.6596, 'eval_samples_per_second': 41.653, 'eval_steps_per_second': 5.253, 'epoch': 4.0}
→ Running task1 LoRA on 'g8a9/distilroberta-base-twitter-16M_aug-oct22' …


tokenizer_config.json:   0%|          | 0.00/386 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at g8a9/distilroberta-base-twitter-16M_aug-oct22 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4891,0.416778,0.833333,0.822115,0.77027,0.881443
2,0.4492,0.439379,0.837838,0.795455,0.886076,0.721649
3,0.3968,0.606224,0.837838,0.805405,0.846591,0.768041
4,0.3621,0.453194,0.837838,0.820896,0.793269,0.850515


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

Validation Results: {'eval_loss': 0.41677796840667725, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.8221153846153846, 'eval_precision': 0.7702702702702703, 'eval_recall': 0.8814432989690721, 'eval_runtime': 1.6868, 'eval_samples_per_second': 263.22, 'eval_steps_per_second': 33.199, 'epoch': 4.0}
→ Running task1 LoRA on 'andrea-t94/roberta-fine-tuned-twitter' …


tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at andrea-t94/roberta-fine-tuned-twitter and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5022,0.392121,0.822072,0.795866,0.797927,0.793814
2,0.4726,0.538857,0.826577,0.780627,0.872611,0.706186
3,0.4438,0.537737,0.81982,0.761905,0.901408,0.659794
4,0.4148,0.419617,0.851351,0.835821,0.807692,0.865979
5,0.3458,0.542687,0.81982,0.808612,0.754464,0.871134
6,0.3194,0.466222,0.817568,0.796992,0.77561,0.819588
7,0.3507,0.654132,0.826577,0.806045,0.788177,0.824742


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

Validation Results: {'eval_loss': 0.4196171760559082, 'eval_accuracy': 0.8513513513513513, 'eval_f1': 0.835820895522388, 'eval_precision': 0.8076923076923077, 'eval_recall': 0.865979381443299, 'eval_runtime': 1.6692, 'eval_samples_per_second': 265.995, 'eval_steps_per_second': 33.549, 'epoch': 7.0}
→ Running task1 LoRA on 'bdotloh/twitter-roberta-base-finetuned-twitter-user-desc' …


tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bdotloh/twitter-roberta-base-finetuned-twitter-user-desc and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.516,0.405331,0.81982,0.811321,0.747826,0.886598
2,0.4934,0.392478,0.835586,0.817043,0.795122,0.840206
3,0.4248,0.487659,0.801802,0.744186,0.853333,0.659794
4,0.4235,0.436317,0.826577,0.794667,0.823204,0.768041
5,0.3887,0.467698,0.826577,0.812652,0.769585,0.860825


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Validation Results: {'eval_loss': 0.39247798919677734, 'eval_accuracy': 0.8355855855855856, 'eval_f1': 0.8170426065162907, 'eval_precision': 0.7951219512195122, 'eval_recall': 0.8402061855670103, 'eval_runtime': 3.2042, 'eval_samples_per_second': 138.569, 'eval_steps_per_second': 17.477, 'epoch': 5.0}
                                               model  eval_loss  \
0          cardiffnlp/twitter-roberta-base-2022-154m   0.567762   
1         cardiffnlp/twitter-roberta-large-2022-154m   0.694174   
2                cardiffnlp/twitter-xlm-roberta-base   0.408927   
3                    cardiffnlp/twitter-roberta-base   0.442068   
4                   sdadas/xlm-roberta-large-twitter   0.456708   
5      g8a9/distilroberta-base-twitter-16M_aug-oct22   0.416778   
6              andrea-t94/roberta-fine-tuned-twitter   0.419617   
7  bdotloh/twitter-roberta-base-finetuned-twitter...   0.392478   

   eval_accuracy   eval_f1  eval_precision  eval_recall  eval_runtime  \
0       0.860360  0.

In [None]:
# Run an Optuna study (20 trials) that tunes learning_rate, r and lora_alpha
# for the selected model; the result is params_twitter_roberta merged with the
# best trial's values.
# `optimized` is only bound if the call returns: in the recorded run the search
# was interrupted during trial 8 (KeyboardInterrupt), so the name was never
# assigned and the following cell failed with NameError.
optimized = optimize_lora_hyperparams(
    task_num=1,
    best_model_name=best_model_name,
    params_base=params_twitter_roberta,
    trainInfo=EnTrainTask1,
    devInfo=EnDevTask1,
    n_trials=20
)


print(optimized)



[I 2025-05-14 17:19:34,632] A new study created in memory with name: no-name-825b0510-c881-4733-a5f1-04c2e5a8c915
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4499,0.421917,0.826577,0.770149,0.914894,0.664948
2,0.3919,0.335344,0.862613,0.84557,0.830846,0.860825
3,0.3303,0.408464,0.846847,0.83,0.805825,0.85567
4,0.2564,0.424464,0.862613,0.840731,0.851852,0.829897
5,0.2107,0.639807,0.844595,0.809917,0.869822,0.757732
6,0.1198,0.666284,0.876126,0.865526,0.823256,0.912371
7,0.1488,0.676318,0.851351,0.829897,0.829897,0.829897
8,0.1319,0.758225,0.855856,0.835052,0.835052,0.835052
9,0.1142,0.772934,0.858108,0.838875,0.832487,0.845361
10,0.0468,0.869819,0.858108,0.837209,0.839378,0.835052


[I 2025-05-14 17:27:52,517] Trial 0 finished with value: 0.8372093023255814 and parameters: {'learning_rate': 0.0002700550492566597, 'r': 22, 'lora_alpha': 53}. Best is trial 0 with value: 0.8372093023255814.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4573,0.395979,0.808559,0.78481,0.771144,0.798969
2,0.4188,0.36605,0.828829,0.806122,0.79798,0.814433
3,0.4163,0.381596,0.826577,0.802057,0.8,0.804124
4,0.3674,0.369941,0.833333,0.8,0.840909,0.762887
5,0.3352,0.427514,0.833333,0.793296,0.865854,0.731959
6,0.3091,0.364893,0.853604,0.839506,0.805687,0.876289
7,0.3574,0.398514,0.846847,0.818182,0.85,0.78866
8,0.3136,0.398027,0.849099,0.823219,0.843243,0.804124
9,0.3072,0.394097,0.849099,0.825974,0.832461,0.819588
10,0.2584,0.397791,0.851351,0.828125,0.836842,0.819588


[I 2025-05-14 17:36:04,764] Trial 1 finished with value: 0.828125 and parameters: {'learning_rate': 6.654336600427276e-05, 'r': 8, 'lora_alpha': 25}. Best is trial 0 with value: 0.8372093023255814.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4363,0.407217,0.831081,0.778761,0.910345,0.680412
2,0.3916,0.345058,0.84009,0.822943,0.797101,0.850515
3,0.3542,0.364357,0.853604,0.835443,0.820896,0.850515
4,0.2934,0.430328,0.844595,0.814016,0.853107,0.778351
5,0.2453,0.565497,0.844595,0.815013,0.849162,0.783505
6,0.1623,0.53751,0.86036,0.845,0.820388,0.871134
7,0.2217,0.593107,0.84009,0.809651,0.843575,0.778351
8,0.1733,0.649553,0.846847,0.824742,0.824742,0.824742
9,0.1675,0.698268,0.862613,0.84557,0.830846,0.860825
10,0.1077,0.741868,0.862613,0.844784,0.834171,0.85567


[I 2025-05-14 17:44:26,227] Trial 2 finished with value: 0.8447837150127226 and parameters: {'learning_rate': 0.00016628366645058708, 'r': 44, 'lora_alpha': 49}. Best is trial 2 with value: 0.8447837150127226.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4507,0.387698,0.822072,0.788204,0.821229,0.757732
2,0.4077,0.355976,0.828829,0.806122,0.79798,0.814433
3,0.381,0.362083,0.833333,0.812183,0.8,0.824742
4,0.351,0.372619,0.842342,0.804469,0.878049,0.742268
5,0.3137,0.44093,0.837838,0.79661,0.88125,0.726804
6,0.2484,0.370945,0.853604,0.841849,0.797235,0.891753
7,0.3025,0.390123,0.846847,0.821053,0.83871,0.804124
8,0.2721,0.409639,0.844595,0.816976,0.84153,0.793814
9,0.2761,0.424456,0.849099,0.823219,0.843243,0.804124
10,0.22,0.429248,0.846847,0.822917,0.831579,0.814433


[I 2025-05-14 17:52:46,147] Trial 3 finished with value: 0.8229166666666666 and parameters: {'learning_rate': 0.00012297006704745613, 'r': 33, 'lora_alpha': 15}. Best is trial 2 with value: 0.8447837150127226.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4414,0.397454,0.831081,0.780059,0.904762,0.685567
2,0.3992,0.336007,0.828829,0.804124,0.804124,0.804124
3,0.3435,0.343514,0.855856,0.842365,0.806604,0.881443
4,0.3081,0.380762,0.844595,0.806723,0.883436,0.742268
5,0.2612,0.403489,0.844595,0.813008,0.857143,0.773196
6,0.1958,0.423681,0.869369,0.86385,0.793103,0.948454
7,0.2461,0.424023,0.851351,0.829016,0.833333,0.824742
8,0.2094,0.456115,0.853604,0.837093,0.814634,0.860825
9,0.197,0.519055,0.855856,0.835897,0.831633,0.840206
10,0.1425,0.514236,0.86036,0.84264,0.83,0.85567


[I 2025-05-14 18:01:17,731] Trial 4 finished with value: 0.8426395939086294 and parameters: {'learning_rate': 0.000266832862838293, 'r': 90, 'lora_alpha': 10}. Best is trial 2 with value: 0.8447837150127226.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4418,0.389212,0.826577,0.778098,0.882353,0.695876
2,0.4123,0.327672,0.853604,0.837093,0.814634,0.860825
3,0.3435,0.36468,0.851351,0.838235,0.799065,0.881443
4,0.2858,0.414027,0.86036,0.833333,0.870787,0.798969
5,0.2413,0.581723,0.864865,0.842105,0.860215,0.824742
6,0.1553,0.551239,0.853604,0.844125,0.789238,0.907216
7,0.1937,0.598559,0.84009,0.817481,0.815385,0.819588
8,0.1519,0.690317,0.855856,0.840796,0.8125,0.871134
9,0.1577,0.759634,0.86036,0.844221,0.823529,0.865979
10,0.1027,0.801021,0.853604,0.83376,0.827411,0.840206


[I 2025-05-14 18:09:28,173] Trial 5 finished with value: 0.8337595907928389 and parameters: {'learning_rate': 0.0004847524234001107, 'r': 8, 'lora_alpha': 10}. Best is trial 2 with value: 0.8447837150127226.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6514,0.633722,0.581081,0.079208,1.0,0.041237
2,0.4855,0.421819,0.795045,0.756032,0.787709,0.726804
3,0.4322,0.410523,0.806306,0.780612,0.772727,0.78866
4,0.4305,0.410533,0.817568,0.782842,0.815642,0.752577
5,0.3879,0.398687,0.815315,0.784211,0.801075,0.768041
6,0.3899,0.392526,0.815315,0.789744,0.785714,0.793814
7,0.4022,0.396891,0.817568,0.787402,0.802139,0.773196
8,0.396,0.393507,0.81982,0.790576,0.803191,0.778351
9,0.3769,0.390882,0.817568,0.788512,0.798942,0.778351
10,0.3589,0.392227,0.81982,0.790576,0.803191,0.778351


[I 2025-05-14 18:17:50,554] Trial 6 finished with value: 0.7905759162303665 and parameters: {'learning_rate': 1.3156046936508764e-05, 'r': 50, 'lora_alpha': 32}. Best is trial 2 with value: 0.8447837150127226.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6572,0.66348,0.563063,0.0,0.0,0.0
2,0.5745,0.512712,0.77027,0.711864,0.7875,0.649485
3,0.4742,0.426971,0.788288,0.755208,0.763158,0.747423
4,0.4618,0.423572,0.792793,0.744444,0.807229,0.690722
5,0.4273,0.410835,0.79955,0.763926,0.786885,0.742268
6,0.4247,0.404569,0.806306,0.777202,0.78125,0.773196
7,0.4312,0.406321,0.813063,0.781003,0.8,0.762887
8,0.4267,0.402441,0.806306,0.774869,0.787234,0.762887
9,0.4094,0.402166,0.808559,0.776903,0.791444,0.762887
10,0.3846,0.402187,0.810811,0.778947,0.795699,0.762887


[I 2025-05-14 18:26:03,166] Trial 7 finished with value: 0.7789473684210526 and parameters: {'learning_rate': 1.246998779453707e-05, 'r': 13, 'lora_alpha': 13}. Best is trial 2 with value: 0.8447837150127226.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4355,0.41204,0.826577,0.768769,0.920863,0.659794
2,0.4014,0.323669,0.849099,0.83208,0.809756,0.85567
3,0.331,0.35992,0.844595,0.832117,0.788018,0.881443
4,0.2844,0.391969,0.86036,0.833333,0.870787,0.798969


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4355,0.41204,0.826577,0.768769,0.920863,0.659794
2,0.4014,0.323669,0.849099,0.83208,0.809756,0.85567
3,0.331,0.35992,0.844595,0.832117,0.788018,0.881443
4,0.2844,0.391969,0.86036,0.833333,0.870787,0.798969
5,0.2327,0.534896,0.849099,0.816438,0.871345,0.768041
6,0.1719,0.538717,0.853604,0.847775,0.776824,0.93299


[W 2025-05-14 18:31:00,625] Trial 8 failed with parameters: {'learning_rate': 0.0003633966492908565, 'r': 21, 'lora_alpha': 12} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-12-e8f4e731f66e>", line 188, in objective
    trainer.train()
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2565, in _inner_training_loop
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
[W 2025-05-14 18:31:00,627] Trial 8 failed with value None.


KeyboardInterrupt: 

In [None]:
# Train the final Task 1 LoRA model with the tuned hyperparameters, which are
# unpacked as keyword arguments. Requires `optimized` from the Optuna cell.
# NOTE(review): the positional arguments are presumably (train split, dev
# split, test split, model name, number of labels, problem type) - confirm
# against the signature of sexism_classification_pipeline_task1_LoRA.
final_model, final_metrics = sexism_classification_pipeline_task1_LoRA(
    EnTrainTask1, EnDevTask1, None,
    best_model_name, 2, "single_label_classification",
    **optimized
)

NameError: name 'optimized' is not defined

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Simple Dataset returning id + tokenized inputs
class EnTestDataset(Dataset):
    """Unlabelled test dataset that yields tokenized inputs plus the example id.

    Args:
        ids: Iterable of example identifiers, aligned one-to-one with ``texts``.
        texts: Iterable of raw texts to tokenize.
        tokenizer: Callable tokenizer; it is invoked with ``truncation=True``,
            ``padding="max_length"``, ``max_length`` and ``return_tensors="pt"``.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``ids`` and ``texts`` do not have the same length.
    """

    def __init__(self, ids, texts, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so __getitem__ is strictly
        # positional. Indexing a pandas Series with `[idx]` is label-based and
        # fails (or picks the wrong row) when its index is not 0..n-1, whereas
        # the DataLoader always asks for positions 0..len-1.
        self.ids = list(ids)
        self.texts = list(texts)
        if len(self.ids) != len(self.texts):
            raise ValueError(
                f"ids and texts must have the same length "
                f"(got {len(self.ids)} and {len(self.texts)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        tweet_id = self.ids[idx]
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader can stack examples itself.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["id"] = tweet_id
        return item

# 2. Load tokenizer + model
# Run on the GPU when one is available; otherwise fall back to the CPU instead
# of failing on a hard-coded .cuda() call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
modelname = best_model_name
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForSequenceClassification.from_pretrained(
    "./final_best_model_mixpeft",
    num_labels=2
)
model.eval()
model.to(device)

# 3. Prepare dataset & dataloader
ids_list, texts_list = EnTestTask1[0], EnTestTask1[1]
dataset = EnTestDataset(ids_list, texts_list, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=False)

# 4. Inference: collect both hard and soft
hard_results = []
soft_results = []
softmax = torch.nn.Softmax(dim=-1)

with torch.no_grad():
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Get probabilities and hard predictions BEFORE tolist()
        probs_tensor = softmax(outputs.logits)               # [B, 2] Tensor
        preds_tensor = outputs.logits.argmax(dim=-1)         # [B] Tensor

        probs = probs_tensor.cpu().tolist()                  # list of [p_no, p_yes]
        preds = preds_tensor.cpu().tolist()                  # list of 0/1 ints

        for tweet_id, prob_vec, pred in zip(batch["id"], probs, preds):
            # The default collate function stacks numeric ids into a tensor, so
            # each element here is a 0-d tensor that json.dump cannot serialise.
            # String ids arrive as plain str and are left untouched.
            if torch.is_tensor(tweet_id):
                tweet_id = tweet_id.item()
            hard_results.append({
                "test_case": "EXIST2025",
                "id": tweet_id,
                "value": "YES" if pred == 1 else "NO"
            })
            soft_results.append({
                "test_case": "EXIST2025",
                "id": tweet_id,
                "value": {
                    "NO": prob_vec[0],
                    "YES": prob_vec[1]
                }
            })

# 5. Save both JSONs
# Only the repository name (text after the last "/") goes into the file names.
name = modelname.split("/")[-1]
hard_path = f"predictions_{name}_hard_EnTestTask1.json"
soft_path = f"predictions_{name}_soft_EnTestTask1.json"

with open(hard_path, "w", encoding="utf-8") as f:
    json.dump(hard_results, f, indent=4, ensure_ascii=False)

with open(soft_path, "w", encoding="utf-8") as f:
    json.dump(soft_results, f, indent=4, ensure_ascii=False)

print(f"Saved {len(hard_results)} hard predictions to {hard_path}")
print(f"Saved {len(soft_results)} soft predictions to {soft_path}")




# Task 2

In [None]:
import numpy as np
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder

# Candidate Hugging Face checkpoints compared for Task 2.
model_names = [
    "cardiffnlp/twitter-roberta-base-2022-154m",
    "cardiffnlp/twitter-roberta-large-2022-154m",
    "cardiffnlp/twitter-xlm-roberta-base",
    "cardiffnlp/twitter-roberta-base",
    "sdadas/xlm-roberta-large-twitter",
    "g8a9/distilroberta-base-twitter-16M_aug-oct22",
    "andrea-t94/roberta-fine-tuned-twitter",
    "bdotloh/twitter-roberta-base-finetuned-twitter-user-desc"
]

# Shared training settings passed to every candidate run.
# NOTE(review): in the logged run the two large checkpoints collapsed to a
# constant prediction after the first epoch with this learning rate; a lower
# rate may be needed for them - confirm.
# NOTE(review): num_train_epochs looks like an upper bound only; the logged runs
# stopped far earlier, presumably through early stopping inside the pipeline.
params_twitter_roberta = {
    "num_train_epochs": 100,
    "learning_rate": 0.001,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "logging_steps": 100,
}

# Assumes EnTrainTask2 and EnDevTask2 are already defined:
df_metrics = run_lora_experiments(
    task_num=2,
    model_names=model_names,
    params=params_twitter_roberta,
    trainInfo=EnTrainTask2,
    devInfo=EnDevTask2,
    testInfo=None
)


# Show the per-model metrics table and pick the winning checkpoint from it.
print(df_metrics)
best_model_name = select_best_model(df_metrics, 2)
print(f"→ Modelo ganador: {best_model_name}")



→ Running task2 LoRA on 'cardiffnlp/twitter-roberta-base-2022-154m' …


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2022-154m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7341,0.752999,0.752525,0.385108,0.35,0.433882
2,0.7064,0.840303,0.752525,0.457245,0.447533,0.480665
3,0.6821,0.729376,0.765152,0.429229,0.60535,0.446374
4,0.5781,0.74779,0.762626,0.50596,0.495163,0.552995
5,0.5224,0.72812,0.752525,0.487903,0.473701,0.535091
6,0.4341,0.834108,0.752525,0.519301,0.53231,0.516001
7,0.4082,0.849149,0.744949,0.524876,0.519016,0.53988
8,0.4143,0.842113,0.777778,0.549166,0.560222,0.54167
9,0.4797,0.984255,0.755051,0.496739,0.572351,0.494908
10,0.34,0.875209,0.767677,0.538091,0.591647,0.522589


Validation Results: {'eval_loss': 0.8421133756637573, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.5491663680739036, 'eval_precision': 0.5602221431380996, 'eval_recall': 0.5416697224344283, 'eval_runtime': 2.8623, 'eval_samples_per_second': 138.35, 'eval_steps_per_second': 17.468, 'epoch': 11.0}
→ Running task2 LoRA on 'cardiffnlp/twitter-roberta-large-2022-154m' …


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-large-2022-154m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7835,0.695258,0.757576,0.462503,0.551033,0.474925
2,0.9956,1.294936,0.214646,0.088358,0.053662,0.25
3,1.0246,1.063363,0.631313,0.193498,0.157828,0.25
4,0.9971,1.048707,0.631313,0.193498,0.157828,0.25


Validation Results: {'eval_loss': 0.695257842540741, 'eval_accuracy': 0.7575757575757576, 'eval_f1': 0.4625029685242861, 'eval_precision': 0.551032877996518, 'eval_recall': 0.4749246243952126, 'eval_runtime': 9.4424, 'eval_samples_per_second': 41.938, 'eval_steps_per_second': 5.295, 'epoch': 4.0}
→ Running task2 LoRA on 'cardiffnlp/twitter-xlm-roberta-base' …


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7721,0.849338,0.719697,0.350058,0.338735,0.370412
2,0.7718,0.830945,0.732323,0.456513,0.446325,0.483123
3,0.7138,0.881073,0.714646,0.339401,0.319928,0.364529
4,0.6197,1.055317,0.631313,0.193498,0.157828,0.25
5,0.9671,1.019124,0.631313,0.193498,0.157828,0.25


Validation Results: {'eval_loss': 0.8309450745582581, 'eval_accuracy': 0.7323232323232324, 'eval_f1': 0.45651276077517744, 'eval_precision': 0.44632489878542514, 'eval_recall': 0.48312299465240643, 'eval_runtime': 2.934, 'eval_samples_per_second': 134.971, 'eval_steps_per_second': 17.042, 'epoch': 5.0}
→ Running task2 LoRA on 'cardiffnlp/twitter-roberta-base' …


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7315,0.750207,0.732323,0.359383,0.338942,0.385118
2,0.7041,0.752313,0.734848,0.421909,0.431194,0.42482
3,0.6621,0.800213,0.732323,0.43869,0.518099,0.426677
4,0.5372,0.780153,0.760101,0.516975,0.628513,0.558734
5,0.5154,0.837619,0.737374,0.482291,0.469546,0.523267
6,0.4242,0.919623,0.724747,0.493155,0.512875,0.482447
7,0.4249,0.753783,0.780303,0.558671,0.583931,0.552376
8,0.4569,0.965523,0.744949,0.493863,0.515208,0.514469
9,0.4552,0.858134,0.742424,0.491586,0.523784,0.47419
10,0.3709,0.994703,0.777778,0.491656,0.569976,0.490829


Validation Results: {'eval_loss': 0.7537825107574463, 'eval_accuracy': 0.7803030303030303, 'eval_f1': 0.5586714972538442, 'eval_precision': 0.5839311896012928, 'eval_recall': 0.5523756047873695, 'eval_runtime': 2.8999, 'eval_samples_per_second': 136.555, 'eval_steps_per_second': 17.242, 'epoch': 10.0}
→ Running task2 LoRA on 'sdadas/xlm-roberta-large-twitter' …


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at sdadas/xlm-roberta-large-twitter and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7409,0.862395,0.734848,0.386805,0.422459,0.421223
2,1.1146,1.258379,0.631313,0.193498,0.157828,0.25
3,1.0984,1.136855,0.631313,0.193498,0.157828,0.25
4,1.0284,1.130536,0.631313,0.193498,0.157828,0.25


Validation Results: {'eval_loss': 0.8623948693275452, 'eval_accuracy': 0.7348484848484849, 'eval_f1': 0.38680477725643225, 'eval_precision': 0.4224587912087912, 'eval_recall': 0.42122268907563026, 'eval_runtime': 9.4964, 'eval_samples_per_second': 41.7, 'eval_steps_per_second': 5.265, 'epoch': 4.0}
→ Running task2 LoRA on 'g8a9/distilroberta-base-twitter-16M_aug-oct22' …


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at g8a9/distilroberta-base-twitter-16M_aug-oct22 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7375,0.753585,0.75,0.379023,0.347512,0.417353
2,0.7175,0.792206,0.719697,0.443621,0.476239,0.472736
3,0.6866,0.810418,0.744949,0.389683,0.594165,0.419988
4,0.5723,0.773876,0.760101,0.494142,0.474178,0.526881
5,0.5119,0.831431,0.747475,0.452789,0.43374,0.478665
6,0.4693,0.859861,0.75,0.51174,0.531958,0.498707
7,0.4854,0.946036,0.737374,0.488361,0.540885,0.49011
8,0.444,0.896978,0.757576,0.496495,0.56604,0.486287
9,0.4093,1.364657,0.729798,0.442859,0.534763,0.421959


Validation Results: {'eval_loss': 0.8598609566688538, 'eval_accuracy': 0.75, 'eval_f1': 0.5117401196533174, 'eval_precision': 0.5319583369328676, 'eval_recall': 0.4987070282658518, 'eval_runtime': 1.4824, 'eval_samples_per_second': 267.134, 'eval_steps_per_second': 33.729, 'epoch': 9.0}
→ Running task2 LoRA on 'andrea-t94/roberta-fine-tuned-twitter' …


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at andrea-t94/roberta-fine-tuned-twitter and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7519,0.753085,0.737374,0.367639,0.336681,0.406529
2,0.7247,0.848943,0.69697,0.40444,0.425353,0.444009
3,0.678,0.753751,0.755051,0.398796,0.483965,0.422046
4,0.5999,0.836626,0.712121,0.419165,0.41324,0.435863
5,0.5969,0.71948,0.722222,0.440897,0.421309,0.466287
6,0.4995,0.760065,0.752525,0.520517,0.549982,0.507472
7,0.501,0.951712,0.737374,0.502643,0.545909,0.499379
8,0.5015,0.815754,0.747475,0.484119,0.532126,0.488098
9,0.4787,0.793516,0.775253,0.544136,0.599276,0.525341
10,0.4437,1.035407,0.752525,0.472921,0.693973,0.494017


Validation Results: {'eval_loss': 0.9308944940567017, 'eval_accuracy': 0.7525252525252525, 'eval_f1': 0.5644561586576232, 'eval_precision': 0.5714502381815834, 'eval_recall': 0.5693836261777439, 'eval_runtime': 1.5266, 'eval_samples_per_second': 259.399, 'eval_steps_per_second': 32.752, 'epoch': 15.0}
→ Running task2 LoRA on 'bdotloh/twitter-roberta-base-finetuned-twitter-user-desc' …


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bdotloh/twitter-roberta-base-finetuned-twitter-user-desc and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7542,0.759485,0.727273,0.356658,0.329324,0.388941
2,0.744,0.757709,0.724747,0.438173,0.440476,0.461148
3,0.7151,0.682513,0.755051,0.414585,0.471605,0.435198
4,0.6076,0.748496,0.732323,0.468958,0.456678,0.510809
5,0.5374,0.753089,0.744949,0.465837,0.477781,0.473346
6,0.4851,0.781956,0.744949,0.464195,0.449641,0.481426
7,0.5277,0.933123,0.752525,0.497683,0.550299,0.516838
8,0.5296,0.744259,0.752525,0.469116,0.459911,0.487993
9,0.4711,0.740664,0.747475,0.522586,0.536551,0.514304
10,0.4371,0.834895,0.757576,0.4544,0.683406,0.46463


Validation Results: {'eval_loss': 0.740664005279541, 'eval_accuracy': 0.7474747474747475, 'eval_f1': 0.5225862372921196, 'eval_precision': 0.5365513311165485, 'eval_recall': 0.5143041762159409, 'eval_runtime': 2.9251, 'eval_samples_per_second': 135.378, 'eval_steps_per_second': 17.093, 'epoch': 12.0}
                                               model  eval_loss  \
0          cardiffnlp/twitter-roberta-base-2022-154m   0.842113   
1         cardiffnlp/twitter-roberta-large-2022-154m   0.695258   
2                cardiffnlp/twitter-xlm-roberta-base   0.830945   
3                    cardiffnlp/twitter-roberta-base   0.753783   
4                   sdadas/xlm-roberta-large-twitter   0.862395   
5      g8a9/distilroberta-base-twitter-16M_aug-oct22   0.859861   
6              andrea-t94/roberta-fine-tuned-twitter   0.930894   
7  bdotloh/twitter-roberta-base-finetuned-twitter...   0.740664   

   eval_accuracy   eval_f1  eval_precision  eval_recall  eval_runtime  \
0       0.777778  0.5

In [None]:
import numpy as np
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder

# Base training settings; redefined here so the cell can run without the
# model-comparison cell having been executed first.
params_twitter_roberta = {
    "num_train_epochs": 100,
    "learning_rate": 0.001,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "logging_steps": 100,
}

# Checkpoint pinned by hand; this overrides whatever select_best_model returned.
best_model_name = "cardiffnlp/twitter-roberta-base"

# Optuna search over the LoRA hyperparameters for Task 2 (20 trials).
optimized_2 = optimize_lora_hyperparams(
    task_num=2,
    best_model_name=best_model_name,
    params_base=params_twitter_roberta,
    trainInfo=EnTrainTask2,
    devInfo=EnDevTask2,
    n_trials=20
)


print(optimized_2)
# Optuna log lines copied from two earlier runs, kept for reference:
# [I 2025-05-13 18:08:16,826] Trial 4 finished with value: 0.5675971449024997 and parameters: {'learning_rate': 0.0003822078857884255, 'r': 11, 'lora_alpha': 29}. Best is trial 4 with value: 0.5675971449024997.
# [I 2025-05-13 20:24:40,021] Trial 4 finished with value: 0.5597586424127885 and parameters: {'learning_rate': 9.1657096479339e-05, 'r': 8, 'lora_alpha': 59}. Best is trial 4 with value: 0.5597586424127885.


In [None]:
# Hyperparameters for the final Task 2 run: the base Twitter-RoBERTa training
# settings combined with the best Optuna trial from the log line
# "[I 2025-05-13 20:24:40,021] Trial 4 finished with value: 0.5597586424127885"
# (learning_rate=9.1657096479339e-05, r=8, lora_alpha=59).
optimized_2 = dict(
    num_train_epochs=100,
    learning_rate=9.1657096479339e-05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    r=8,
    lora_alpha=59,
)


In [None]:
# Train the final Task 2 LoRA model on the English train/dev splits with the
# tuned hyperparameters unpacked from `optimized_2`; no test split is passed.
# NOTE(review): the positional 4 is presumably the number of classes - confirm
# against the pipeline signature.
final_model_2, final_metrics_2 = sexism_classification_pipeline_task2_LoRA(
    EnTrainTask2, EnDevTask2, None,
    best_model_name, 4, "single_label_classification",
    **optimized_2
)


In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Dataset for Task 2
class EnTestDatasetTask2(Dataset):
    """Unlabelled Task 2 test dataset yielding tokenized inputs plus the item id.

    Args:
        ids: Iterable of example identifiers, aligned one-to-one with ``texts``.
        texts: Iterable of raw texts to tokenize.
        tokenizer: Callable tokenizer; it is invoked with ``truncation=True``,
            ``padding="max_length"``, ``max_length`` and ``return_tensors="pt"``.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``ids`` and ``texts`` do not have the same length.
    """

    def __init__(self, ids, texts, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so __getitem__ is strictly
        # positional. Indexing a pandas Series with `[idx]` is label-based and
        # fails (or picks the wrong row) when its index is not 0..n-1, whereas
        # the DataLoader always asks for positions 0..len-1.
        self.ids = list(ids)
        self.texts = list(texts)
        if len(self.ids) != len(self.texts):
            raise ValueError(
                f"ids and texts must have the same length "
                f"(got {len(self.ids)} and {len(self.texts)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        item_id = self.ids[idx]
        text = self.texts[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader can stack examples itself.
        batch = {k: v.squeeze(0) for k, v in enc.items()}
        batch["id"] = item_id
        return batch

# 2. Load tokenizer + model (4 classes)
# Run on the GPU when one is available; otherwise fall back to the CPU instead
# of failing on a hard-coded .cuda() call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
modelname = best_model_name  # e.g. "cardiffnlp/twitter-roberta-base-2022-154m"
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForSequenceClassification.from_pretrained(
    "./final_best_model_mixpeft_2",
    num_labels=4
)
model.eval()
model.to(device)

# 3. Prepare DataLoader
ids_list, texts_list = EnTestTask2[0], EnTestTask2[1]
dataset = EnTestDatasetTask2(ids_list, texts_list, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=False)

# 4. Inference: collect both hard and soft
hard_results = []
soft_results = []
softmax = torch.nn.Softmax(dim=-1)
# NOTE(review): this index -> label mapping must match the encoding used when
# the model was trained. If the training pipeline used sklearn's LabelEncoder,
# classes are numbered in sorted (alphabetical) order, which would differ from
# the order below - confirm against the training code.
label_map = {0: "NO", 1: "DIRECT", 2: "REPORTED", 3: "JUDGEMENTAL"}

with torch.no_grad():
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute probabilities and hard predictions
        probs_tensor = softmax(outputs.logits)          # [B, 4]
        preds_tensor = outputs.logits.argmax(dim=-1)    # [B]

        probs = probs_tensor.cpu().tolist()             # list of [p0, p1, p2, p3]
        preds = preds_tensor.cpu().tolist()             # list of ints in {0,1,2,3}

        for item_id, prob_vec, pred in zip(batch["id"], probs, preds):
            # The default collate function stacks numeric ids into a tensor, so
            # each element here is a 0-d tensor that json.dump cannot serialise.
            # String ids arrive as plain str and are left untouched.
            if torch.is_tensor(item_id):
                item_id = item_id.item()
            # Hard output
            hard_results.append({
                "test_case": "EXIST2025",
                "id": item_id,
                "value": label_map[pred]
            })
            # Soft output: keys come from label_map so the hard and soft
            # outputs can never disagree about which index is which label.
            soft_results.append({
                "test_case": "EXIST2025",
                "id": item_id,
                "value": {
                    label_map[class_idx]: prob
                    for class_idx, prob in enumerate(prob_vec)
                }
            })

# 5. Save both JSONs
# Only the repository name (text after the last "/") goes into the file names.
name = modelname.split("/")[-1]
hard_path = f"predictions_{name}_hard_EnTestTask2.json"
soft_path = f"predictions_{name}_soft_EnTestTask2.json"

with open(hard_path, "w", encoding="utf-8") as f:
    json.dump(hard_results, f, indent=4, ensure_ascii=False)

with open(soft_path, "w", encoding="utf-8") as f:
    json.dump(soft_results, f, indent=4, ensure_ascii=False)

print(f"Saved {len(hard_results)} hard predictions to {hard_path}")
print(f"Saved {len(soft_results)} soft predictions to {soft_path}")


# Task 3

In [None]:
# prompt: build a pipeline that selects the best model for Task 3 by trying different Hugging Face transformer repositories

from functools import partial
from sklearn.preprocessing import MultiLabelBinarizer

class SexismDatasetMulti(Dataset):
    """Multi-label dataset pairing tokenized texts with float label vectors and ids.

    Args:
        texts: Object exposing ``tolist()`` that yields the raw texts.
        labels: Indexable collection; ``labels[idx]`` is converted to a float tensor.
        ids: Indexable collection; ``ids[idx]`` is converted to a long tensor.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Maximum sequence length handed to the tokenizer.
        pad: Padding strategy handed to the tokenizer.
        trunc: Truncation flag handed to the tokenizer.
        rt: Tensor format requested from the tokenizer.
    """

    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt'):
        self.texts = texts.tolist()
        self.labels, self.ids = labels, ids
        self.tokenizer = tokenizer
        # Tokenizer options are stored as given and forwarded on every lookup.
        self.max_len, self.pad, self.trunc, self.rt = max_len, pad, trunc, rt

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer.encode_plus(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt,
        )

        # Flatten the tokenizer outputs to 1-D before returning them.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        sample['id'] = torch.tensor(self.ids[idx], dtype=torch.long)
        return sample
# Example usage for Task 3:

# Candidate Hugging Face checkpoints compared for Task 3.
model_names = [
    "cardiffnlp/twitter-roberta-base-2022-154m",
    "cardiffnlp/twitter-roberta-large-2022-154m",
    "cardiffnlp/twitter-xlm-roberta-base",
    "cardiffnlp/twitter-roberta-base",
    "sdadas/xlm-roberta-large-twitter",
    "g8a9/distilroberta-base-twitter-16M_aug-oct22",
    "andrea-t94/roberta-fine-tuned-twitter",
    "bdotloh/twitter-roberta-base-finetuned-twitter-user-desc"
]

# Shared training settings passed to every Task 3 candidate run.
params_task3 = {
    "num_train_epochs": 10,  # Adjust as needed
    "learning_rate": 5e-5,   # Adjust as needed
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    "logging_steps": 10,
    "early_stopping_patience": 3
}


# Run one LoRA experiment per candidate and collect the metrics it returns.
df_metrics_task3 = run_lora_experiments(
    task_num=3,
    model_names=model_names,
    params=params_task3,
    trainInfo=EnTrainTask3,
    devInfo=EnDevTask3,
    testInfo=None  # Set testInfo if test data is available
)

print(df_metrics_task3)

# Pick the winning checkpoint from the collected metrics.
best_model_task3 = select_best_model(df_metrics_task3, 3)
print(f"→ Best model for task 3: {best_model_task3}")




In [None]:

# Optuna search over the LoRA hyperparameters for the selected Task 3 checkpoint.
optimized_task3 = optimize_lora_hyperparams(
    task_num=3,
    best_model_name=best_model_task3,
    params_base=params_task3,
    trainInfo=EnTrainTask3,
    devInfo=EnDevTask3,
    n_trials=5 # Reduced number of trials for testing
)

print(optimized_task3)

# Train the final Task 3 model with the tuned hyperparameters; no test split is passed.
# NOTE(review): the positional 5 is presumably the number of labels - confirm
# against the pipeline signature and the Task 3 label set.
final_model_task3, final_metrics_task3 = sexism_classification_pipeline_task3_LoRA(
    EnTrainTask3, EnDevTask3, None,
    best_model_task3, 5, "multi_label_classification",
    **optimized_task3
)

In [None]:
resultados=[]
# Run the zero-shot in-context pipeline for every language/task combination.
# The pipeline function, its output post-processor and the dev/test splits are
# resolved by name from the notebook's global namespace. A plain globals()
# lookup replaces eval() on built strings: same names, no code execution.
# NOTE(review): `model` and `tokenizer` are whatever the notebook last bound;
# the cells above bind a sequence-classification model, so confirm that the
# intended model for in-context prompting is loaded before running this cell.
for languaje in ["En", "Sp"]:
  for task in ["1", "2", "3"]:
    params = dict()
    pipeline_fn = globals()[f"incontext_zero_pipeline_task{task}"]
    dev_info = globals()[f"{languaje}DevTask{task}"]
    test_info = globals()[f"{languaje}TestTask{task}"]
    postprocess_fn = globals()[f"output_postprocessing_incontext_zero_s{task}"]
    pipeline_fn(model, tokenizer, dev_info, test_info, postprocess_fn)
