<h1 align="center">Lab 2:  Sexism Identification in Twitter</h1>
<h2 align="center">Session 3. Transformers: Fine-tuning for multi-label classification
<h3 style="display:block; margin-top:5px;" align="center">Natural Language and Information Retrieval</h3>
<h3 style="display:block; margin-top:5px;" align="center">Degree in Data Science</h3>
<h3 style="display:block; margin-top:5px;" align="center">2024-2025</h3>    
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

### Put your names here

- Marc Siquier
- Marcos Ranchal

In [19]:
  !pip install transformers --upgrade
  !pip  install datasets accelerate



In [20]:
  !pip install -U PyEvALL
  !pip install jupyter --upgrade
  !pip install ipywidgets --upgrade



## Many libraries

In [21]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import  AutoTokenizer, AutoModelForSequenceClassification,  Trainer, TrainingArguments,  EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import random
import os
import pandas as pd
import json
import sys
import tempfile
import time

#Importing the required modules to use the ICM measure

from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.reports.reports import PyEvALLReport
from pyevall.utils.utils import PyEvALLUtils

from functools import partial

In [22]:
# IF YOU USE GOOGLE COLAB -> COLAB=True
COLAB = True

In [23]:
if COLAB is True:
  from google.colab import drive
  drive.mount('/content/drive')
  base_path = "/content/drive/MyDrive"
else:
  base_path = ".."
base_path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive'

## Import readerEXIST2025 library

In [24]:
library_path = base_path
sys.path.append(library_path)
from readerEXIST2025 import EXISTReader

In [25]:
# path to the dataset, adapt this path wherever you have the dataset
dataset_path = os.path.join(base_path, "EXIST_2025_Dataset_V0.2/")

file_train = os.path.join(dataset_path, "EXIST2025_training.json")
file_dev = os.path.join(dataset_path, "EXIST2025_dev.json")

reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)

EnTrainTask3, EnDevTask3 = reader_train.get(lang="EN", subtask="3"), reader_dev.get(lang="EN", subtask="3")
SpTrainTask3, SpDevTask3 = reader_train.get(lang="ES", subtask="3"), reader_dev.get(lang="ES", subtask="3")

# Wrapper to compute ICM measure

In [26]:
def ICMWrapper(pred, labels, multi=False,ids=None):
    test = PyEvALLEvaluation()
    metrics=[MetricFactory.ICM.value]
    params= dict()
    fillLabel=None
    if multi:
        params[PyEvALLUtils.PARAM_REPORT]="embedded"
        hierarchy={"True":['IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE'],
        "False":[]}
        params[PyEvALLUtils.PARAM_HIERARCHY]=hierarchy
        fillLabel = lambda x: ["False"] if len(x)== 0 else x
    else:
        params[PyEvALLUtils.PARAM_REPORT]="simple"
        fillLabel = lambda x: str(x)


    truth_name, predict_name=None, None
    if ids is None:
        ids=list(range(len(labels)))

    with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as truth:
        truth_name=truth.name
        truth_df=pd.DataFrame({'test_case': ['EXIST2025']*len(labels),
                        'id': [str(x) for x in ids],
                        'value': [fillLabel(x) for x in labels]})
        if multi==True:
            truth_df=truth_df.astype('object')
        truth.write(truth_df.to_json(orient="records"))

    with  tempfile.NamedTemporaryFile(mode='w', delete=False) as predict:
        predict_name=predict.name
        predict_df=pd.DataFrame({'test_case': ['EXIST2025']*len(pred),
                        'id': [str(x) for x in ids],
                        'value': [fillLabel(x) for x in pred]})
        if multi==True:
            predict_df=predict_df.astype('object')
        predict.write(predict_df.to_json(orient="records"))

    report = test.evaluate(predict_name, truth_name, metrics, **params)
    os.unlink(truth_name)
    os.unlink(predict_name)

    icm = None
    if 'metrics' in report.report:
        if 'ICM' in report.report["metrics"]: icm=float(report.report["metrics"]['ICM']["results"]["average_per_test_case"])
    return icm



## Set the seed

In [27]:
def set_seed(seed=1234):
    """
    Sets the seed to make everything deterministic, for reproducibility of experiments
    Parameters:
    seed: the number to set the seed to
    Return: None
    """
    # Random seed
    random.seed(seed)
    # Numpy seed
    np.random.seed(seed)
    # Torch seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    # os seed
    os.environ['PYTHONHASHSEED'] = str(seed)

## Dataset class

In [28]:
class SexismDatasetMulti(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt'):
        self.texts = texts.tolist()
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt


    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,padding=self.pad, truncation=self.trunc,
            return_tensors=self.rt
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
            'id': torch.tensor(self.ids[idx], dtype=torch.long)}

# Metrics for subtask 3

In [29]:
def compute_metrics_3(pred, lencoder):
    labels = pred.label_ids
    #preds = pred.predictions.argmax(-1)
    preds = torch.sigmoid(torch.tensor(pred.predictions)).numpy()
    preds_binary = (preds >= 0.5).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds_binary, average=None, zero_division=0
    )
    acc = accuracy_score(labels, preds_binary)
    icm= ICMWrapper(lencoder.inverse_transform(preds_binary), lencoder.inverse_transform(labels), multi=True)
    # Macro averages
    precision_macro = np.mean(precision)
    recall_macro = np.mean(recall)
    f1_macro = np.mean(f1)
    metrics = {}
    metrics.update({
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'ICM': icm
    })
    return metrics

# Pipeline

In [30]:
def sexism_classification_pipeline_task3(trainInfo, devInfo, testInfo=None, model_name='roberta-base', nlabels=5, ptype="multi_label_classification", **args):
    # Model and Tokenizer
    labelEnc= MultiLabelBinarizer()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype)

    # Prepare datasets
    train_dataset = SexismDatasetMulti(trainInfo[1], labelEnc.fit_transform(trainInfo[2]),[int(x) for x in trainInfo[0]], tokenizer )
    val_dataset = SexismDatasetMulti(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir= args.get('output_dir', './results'),
        num_train_epochs= args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay',0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy','epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model',"ICM")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        #compute_metrics=compute_metrics_3,
        compute_metrics = partial(compute_metrics_3, lencoder=labelEnc),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience",3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    if testInfo is not None:
      # Prepare test dataset for prediction
      test_dataset = SexismDatasetMulti(testInfo[1], [[0,0,0,0,0]] * len(testInfo[1]),  [int(x) for x in testInfo[0]],   tokenizer)

      # Predict test set labels
      predictions = trainer.predict(test_dataset)
      #predicted_labels = np.argmax(predictions.predictions, axis=1)
      predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
      predicted_labels = (predicted_probs >= 0.5).astype(int)

      # Create submission DataFrame
      submission_df = pd.DataFrame({
          'id': testInfo[0],
          'label': labelEnc.inverse_transform(predicted_labels),
          "test_case": ["EXIST2025"]*len(predicted_labels)

      })
      submission_df.to_csv('sexism_predictions_task3.csv', index=False)
      print("Prediction TASK3 completed. Results saved to sexism_predictions_task2.csv")
      return model, submission_df
    return model, eval_results

# LoRA pipeline

In [31]:
# COMPLETE
from peft import LoraConfig, get_peft_model, TaskType

def run_sexism_pipeline_with_lora(train_data, val_data, test_data=None, base_model='roberta-base', num_labels=5, prob_type="multi_label_classification", **kwargs):
    # Initialize tokenizer and model
    binarizer = MultiLabelBinarizer()
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    classification_model = AutoModelForSequenceClassification.from_pretrained(
        base_model,
        num_labels=num_labels,
        problem_type=prob_type,
        ignore_mismatched_sizes=kwargs.get("ignore_mismatched_sizes", False)
    )

    # Convert data into datasets
    train_set = SexismDatasetMulti(train_data[1], binarizer.fit_transform(train_data[2]), [int(i) for i in train_data[0]], tokenizer)
    val_set = SexismDatasetMulti(val_data[1], binarizer.transform(val_data[2]), [int(i) for i in val_data[0]], tokenizer)

    # Set up LoRA configuration
    lora_setup = LoraConfig(
        task_type=kwargs.get("task_type", TaskType.SEQ_CLS),
        target_modules=kwargs.get("target_modules", ["query", "value"]),
        r=kwargs.get("rank", 64),
        lora_alpha=kwargs.get("lora_alpha", 32),
        lora_dropout=kwargs.get("lora_dropout", 0.1),
        bias=kwargs.get("bias", "none"),
        init_lora_weights=kwargs.get("init_lora_weights", True)
    )

    # Integrate LoRA into model
    lora_enhanced_model = get_peft_model(classification_model, lora_setup)

    # Define training parameters
    train_params = TrainingArguments(
        output_dir=kwargs.get("output_dir", "./results"),
        num_train_epochs=kwargs.get("num_train_epochs", 5),
        learning_rate=kwargs.get("learning_rate", 5e-5),
        per_device_train_batch_size=kwargs.get("per_device_train_batch_size", 16),
        per_device_eval_batch_size=kwargs.get("per_device_eval_batch_size", 64),
        warmup_steps=kwargs.get("warmup_steps", 500),
        weight_decay=kwargs.get("weight_decay", 0.01),
        logging_dir=kwargs.get("logging_dir", "./logs"),
        logging_steps=kwargs.get("logging_steps", 10),
        eval_strategy=kwargs.get("eval_strategy", "epoch"),
        save_strategy=kwargs.get("save_strategy", "epoch"),
        save_total_limit=kwargs.get("save_total_limit", 1),
        load_best_model_at_end=kwargs.get("load_best_model_at_end", True),
        metric_for_best_model=kwargs.get("metric_for_best_model", "ICM"),
        report_to="none"
    )

    # Set up training loop
    trainer_instance = Trainer(
        model=lora_enhanced_model,
        args=train_params,
        train_dataset=train_set,
        eval_dataset=val_set,
        compute_metrics=partial(compute_metrics_3, lencoder=binarizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=kwargs.get("early_stopping_patience", 3))]
    )

    # Train the model
    trainer_instance.train()

    # Evaluate the model
    validation_metrics = trainer_instance.evaluate()
    print("Validation metrics:", validation_metrics)

    # Save LoRA-only weights
    if kwargs.get("save_lora_weights", True):
        trainer_instance.save_model("./final_best_model_LoRA")

    # Optionally save full model
    if kwargs.get("save_full_model", True):
        combined_model = lora_enhanced_model.merge_and_unload()
        combined_model.save_pretrained("./final_best_model_mixpeft")

    # If test data is provided, make predictions
    if test_data is not None:
        test_set = SexismDatasetMulti(test_data[1], [[0]*num_labels] * len(test_data[1]), [int(i) for i in test_data[0]], tokenizer)
        test_preds = trainer_instance.predict(test_set)
        prob_matrix = torch.sigmoid(torch.tensor(test_preds.predictions)).numpy()
        binarized_preds = (prob_matrix >= 0.5).astype(int)

        results_df = pd.DataFrame({
            'id': test_data[0],
            'label': binarizer.inverse_transform(binarized_preds),
            'test_case': ['EXIST2025'] * len(binarized_preds)
        })

        results_df.to_csv("sexism_predictions_task3.csv", index=False)
        print("Test predictions complete. Saved to sexism_predictions_task3.csv")
        return classification_model, results_df

    return classification_model, validation_metrics


# Experimentation

In [32]:
def export_evaluation_to_file(data: dict, filename: str) -> bool:
    success = False
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        success = True
    except Exception as error:
        print(f"Ocurrió un problema al guardar el archivo: {error}")
    return success

## Do it in English

### Fine-tuning

In [33]:
# COMPLETE
set_seed(25)

base_model = "bert-base-uncased"

# Ajustes optimizados
training_config = {
    "num_train_epochs": 12,                  # Aumento el número de épocas para permitir mayor convergencia
    "learning_rate": 5e-5,                   # Aumento la tasa de aprendizaje para una convergencia más rápida
    "per_device_train_batch_size": 64,       # Mantengo el tamaño del batch
    "warmup_steps": 250,                     # Reduzco ligeramente los warmup_steps para acelerar la convergencia
    "weight_decay": 0.01,                    # Regularización
    "logging_dir": "./logs",
    "logging_steps": 20,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",     # Cambié a f1_macro para optimizar por esta métrica
    "early_stopping_patience": 3,
    "lr_scheduler_type": "cosine",           # Cambié a 'cosine' scheduler para un decaimiento más suave del learning rate
}

# Entrenamiento y evaluación
_, validation_metrics = sexism_classification_pipeline_task3(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=base_model,
    num_labels=5,
    prob_type="multi_label_classification",
    **training_config
)

# Guardar resultados de evaluación
drive_path = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(drive_path, exist_ok=True)
export_evaluation_to_file(validation_metrics, f"{drive_path}/eval_{base_model}_fine-tunning_task_3.json")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6867,0.676101,0.372135,0.582564,0.454126,-0.56748
2,0.6644,0.651035,0.386255,0.570256,0.459654,-0.532264
3,0.6411,0.603337,0.723501,0.608443,0.624122,-0.145577
4,0.5614,0.570582,0.689872,0.73984,0.710534,0.221776
5,0.5082,0.59522,0.74377,0.603594,0.662247,-0.985782
6,0.4864,0.601638,0.752271,0.66385,0.696483,-0.417076
7,0.4058,0.652123,0.740023,0.645738,0.683564,-0.218811


2025-04-14 15:24:51,171 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:24:51,230 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:25:58,169 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:25:58,751 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:26:51,629 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:26:51,695 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:27:56,243 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:27:56,301 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:28:53,578 - pyevall.evaluation - INFO -   

2025-04-14 15:31:33,492 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:31:33,552 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.5705820322036743, 'eval_precision_macro': 0.6898721377487809, 'eval_recall_macro': 0.7398395178400544, 'eval_f1_macro': 0.710533609167568, 'eval_ICM': 0.22177640634325307, 'eval_runtime': 2.5105, 'eval_samples_per_second': 134.632, 'eval_steps_per_second': 2.39, 'epoch': 7.0}


True

In [34]:
base_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"


training_config = {
    "num_train_epochs": 15,                   # Más épocas para permitir mejor ajuste del modelo al nuevo task
    "learning_rate": 3e-5,                    # Más bajo para una fine-tuning más estable
    "per_device_train_batch_size": 32,        # Reducido para evitar overfitting y saturación de memoria
    "per_device_eval_batch_size": 64,
    "warmup_steps": 300,                      # Aumentado para una transición más suave
    "weight_decay": 0.01,                     # Regularización para evitar overfitting
    "ignore_mismatched_sizes": True,          # Necesario por la diferencia en número de clases
    "logging_dir": "./logs",
    "logging_steps": 10,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",      # Optimizar directamente por F1 macro
    "early_stopping_patience": 3,
    "lr_scheduler_type": "cosine",            # Scheduler más suave para ajustar el learning rate
}

# Entrenamiento y evaluación
_, validation_metrics = sexism_classification_pipeline_task3(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=base_model,
    num_labels=5,
    prob_type="multi_label_classification",
    **training_config
)

# Guardar resultados de evaluación
drive_path = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(drive_path, exist_ok=True)
export_evaluation_to_file(validation_metrics, f"{drive_path}/eval_{base_model.replace('/', '_')}_fine-tunning_task_3.json")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6782,0.671097,0.25503,0.4,0.31144,-1.107469
2,0.6168,0.609832,0.700546,0.638686,0.647089,-0.059169
3,0.5923,0.578432,0.692865,0.729657,0.707204,0.166654
4,0.5484,0.595269,0.685587,0.751253,0.705261,0.093685
5,0.4846,0.583287,0.710327,0.721665,0.712909,-0.018152
6,0.417,0.599698,0.71532,0.719777,0.712403,0.087108
7,0.3582,0.671173,0.736804,0.628939,0.675391,-0.474383
8,0.307,0.669529,0.722105,0.705152,0.712395,0.046581


2025-04-14 15:32:24,530 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:32:24,591 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:33:25,942 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:33:26,040 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:34:36,596 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:34:36,659 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:35:45,110 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:35:45,173 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:36:43,996 - pyevall.evaluation - INFO -   

2025-04-14 15:40:26,377 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:40:26,444 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.5832870602607727, 'eval_precision_macro': 0.7103271205280535, 'eval_recall_macro': 0.7216652148709736, 'eval_f1_macro': 0.7129087416883032, 'eval_ICM': -0.018151818686212923, 'eval_runtime': 2.532, 'eval_samples_per_second': 133.491, 'eval_steps_per_second': 2.37, 'epoch': 8.0}


True

### LoRA

In [35]:
model_name = "bert-base-uncased"

# Configuración de entrenamiento con LoRA
lora_config = {
    "num_train_epochs": 12,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 64,
    "warmup_steps": 300,
    "early_stopping_patience": 3,
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "init_lora_weights": True,
    "output_dir": None,
    "save_full_model": False,
    "ignore_mismatched_sizes": True,
    "save_lora_weights": False
}

# Entrenamiento y evaluación usando LoRA
_, eval_metrics = run_sexism_pipeline_with_lora(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=model_name,
    num_labels=5,
    prob_type="multi_label_classification",
    **lora_config
)

# Guardar los resultados de evaluación en Google Drive
output_dir = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(output_dir, exist_ok=True)
filename = f"{drive_path}/eval_{base_model}_lora_task_3.json"
export_evaluation_to_file(eval_metrics, filename)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.7026,0.694467,0.361958,0.414612,0.325741,-1.119278
2,0.6886,0.682701,0.497705,0.455196,0.399128,-0.945465
3,0.6775,0.670084,0.576905,0.548417,0.44826,-0.649417
4,0.6645,0.666123,0.379542,0.543653,0.446263,-0.645942
5,0.6648,0.664616,0.383533,0.533085,0.44399,-0.662721
6,0.6608,0.661731,0.387922,0.502472,0.431258,-0.746904
7,0.6543,0.657982,0.591372,0.489426,0.432836,-0.765827


2025-04-14 15:41:05,139 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:41:05,199 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:41:40,274 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:41:40,337 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:42:15,010 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:42:15,074 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:42:50,296 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:42:50,406 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:43:25,425 - pyevall.evaluation - INFO -   

2025-04-14 15:44:39,298 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:44:39,359 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation metrics: {'eval_loss': 0.66612309217453, 'eval_precision_macro': 0.37954186147538216, 'eval_recall_macro': 0.5436529347429822, 'eval_f1_macro': 0.4462628697467407, 'eval_ICM': -0.6459415438021774, 'eval_runtime': 2.8493, 'eval_samples_per_second': 118.625, 'eval_steps_per_second': 2.106, 'epoch': 7.0}
Ocurrió un problema al guardar el archivo: [Errno 2] No such file or directory: '/content/drive/MyDrive/LNR/eval_results/eval_cardiffnlp/twitter-roberta-base-sentiment-latest_lora_task_3.json'


False

In [36]:
base_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

training_config = {
    "num_train_epochs": 12,                  # Fine-tuning más estable sin sobreentrenar
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 64,
    "warmup_steps": 300,
    "weight_decay": 0.01,
    "ignore_mismatched_sizes": True,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "early_stopping_patience": 3,
    "lr_scheduler_type": "cosine",
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "init_lora_weights": True,
    "output_dir": None,
    "save_full_model": False,
    "save_lora_weights": False,
}

# Entrenamiento y evaluación con LoRA
_, validation_metrics = run_sexism_pipeline_with_lora(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=base_model,
    num_labels=5,
    prob_type="multi_label_classification",
    **training_config
)

# Guardar resultados de evaluación
drive_path = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(drive_path, exist_ok=True)
export_evaluation_to_file(validation_metrics, f"{drive_path}/eval_{base_model.replace('/', '_')}_lora_task_3.json")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6782,0.673267,0.560496,0.516585,0.454917,-0.782317
2,0.6529,0.655925,0.507724,0.583119,0.516084,-0.519783
3,0.6644,0.649389,0.523782,0.593419,0.54035,-0.430455
4,0.6422,0.637814,0.684091,0.585335,0.558103,-0.413696
5,0.6166,0.617526,0.726067,0.588437,0.598186,-0.352336
6,0.6114,0.597485,0.702318,0.661735,0.666288,-0.072697
7,0.5834,0.592053,0.708673,0.673953,0.676981,0.03793
8,0.5953,0.589789,0.710917,0.676578,0.680625,-0.061131
9,0.5638,0.587276,0.705322,0.693138,0.691552,-0.07683
10,0.5726,0.586866,0.705059,0.670478,0.679429,-0.234748


2025-04-14 15:45:16,479 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:45:16,542 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:45:51,103 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:45:51,171 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:46:25,738 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:46:25,799 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:47:00,462 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:47:00,529 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:47:35,434 - pyevall.evaluation - INFO -   

2025-04-14 15:51:42,743 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:51:42,844 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation metrics: {'eval_loss': 0.5872756242752075, 'eval_precision_macro': 0.7053217230603127, 'eval_recall_macro': 0.6931378568077129, 'eval_f1_macro': 0.6915515289346169, 'eval_ICM': -0.07682977227107586, 'eval_runtime': 2.7291, 'eval_samples_per_second': 123.849, 'eval_steps_per_second': 2.199, 'epoch': 12.0}


True

## Do it in Spanish

### Fine-tuning

In [37]:
# COMPLETE
base_model = "pysentimiento/robertuito-sentiment-analysis"

training_config = {
    "num_train_epochs": 12,                  # Aumentado para permitir mejor ajuste al dominio
    "learning_rate": 2e-5,                   # Más bajo para estabilidad
    "per_device_train_batch_size": 32,       # Reducido para controlar overfitting
    "per_device_eval_batch_size": 64,
    "warmup_steps": 300,                     # Transición más progresiva al entrenamiento
    "weight_decay": 0.01,                    # Regularización
    "ignore_mismatched_sizes": True,         # Permite adaptar capas de salida
    "logging_dir": "./logs",
    "logging_steps": 10,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",     # Métrica objetivo para clasificación multietiqueta
    "early_stopping_patience": 3,
    "lr_scheduler_type": "cosine"            # Scheduler progresivo
}

# Entrenamiento y evaluación
_, validation_metrics = sexism_classification_pipeline_task3(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=base_model,
    num_labels=5,
    prob_type="multi_label_classification",
    **training_config
)

# Guardar resultados de evaluación
drive_path = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(drive_path, exist_ok=True)
export_evaluation_to_file(validation_metrics, f"{drive_path}/eval_{base_model.replace('/', '_')}_fine-tunning_task_3.json")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6861,0.681441,0.370414,0.6,0.457782,-0.527568
2,0.6354,0.634243,0.750903,0.49102,0.466547,-0.652244
3,0.5868,0.59188,0.719283,0.651956,0.678654,-0.130958
4,0.5527,0.577666,0.706299,0.746616,0.716607,0.230458
5,0.4967,0.574038,0.722922,0.703367,0.709734,-0.070847
6,0.438,0.578021,0.716339,0.718206,0.715417,0.016984
7,0.4128,0.608964,0.738237,0.674064,0.702417,-0.370209


2025-04-14 15:52:33,497 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:52:33,599 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:53:41,974 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:53:42,034 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:54:51,099 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:54:51,160 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:55:58,150 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:55:58,250 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 15:57:13,180 - pyevall.evaluation - INFO -   

2025-04-14 15:59:35,332 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 15:59:35,427 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.5776664018630981, 'eval_precision_macro': 0.7062986694233722, 'eval_recall_macro': 0.746615707759408, 'eval_f1_macro': 0.7166071004461732, 'eval_ICM': 0.2304578656780612, 'eval_runtime': 2.6291, 'eval_samples_per_second': 128.559, 'eval_steps_per_second': 2.282, 'epoch': 7.0}


True

In [None]:
base_model = "finiteautomata/beto-sentiment-analysis"

training_config = {
    "num_train_epochs": 12,                  # Un poco más para mejor convergencia
    "learning_rate": 2e-5,                   # Leve ajuste para fine-tuning estable
    "per_device_train_batch_size": 32,       # Más seguro para GPUs limitadas y mejor generalización
    "per_device_eval_batch_size": 64,
    "warmup_steps": 200,                     # Mejora en la estabilización inicial
    "weight_decay": 0.01,                    # Regularización adicional
    "ignore_mismatched_sizes": True,         # Soporte para distintas capas de salida
    "logging_dir": "./logs",
    "logging_steps": 10,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",     # Optimiza sobre F1 macro
    "early_stopping_patience": 3,
    "lr_scheduler_type": "cosine"            # Scheduler suave
}

# Entrenamiento y evaluación
_, validation_metrics = sexism_classification_pipeline_task3(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=base_model,
    num_labels=5,
    prob_type="multi_label_classification",
    **training_config
)

# Guardar resultados de evaluación
drive_path = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(drive_path, exist_ok=True)
export_evaluation_to_file(validation_metrics, f"{drive_path}/eval_{base_model.replace('/', '_')}_fine-tunning_task_3.json")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6782,0.671097,0.25503,0.4,0.31144,-1.107469
2,0.6168,0.609832,0.700546,0.638686,0.647089,-0.059169
3,0.5923,0.578432,0.692865,0.729657,0.707204,0.166654
4,0.5391,0.575587,0.721765,0.712681,0.710587,-0.032849
5,0.4814,0.596667,0.737547,0.63387,0.677074,-0.535192
6,0.4323,0.590691,0.718573,0.706709,0.708669,0.052134
7,0.393,0.619314,0.729482,0.686724,0.70502,-0.224024


2025-04-14 16:00:26,931 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:00:27,022 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:01:35,274 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:01:35,333 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:02:40,966 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:02:41,031 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:03:42,005 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:03:42,070 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:04:57,875 - pyevall.evaluation - INFO -   

2025-04-14 16:07:22,732 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:07:22,792 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.5755865573883057, 'eval_precision_macro': 0.7217651892919885, 'eval_recall_macro': 0.7126813423245507, 'eval_f1_macro': 0.7105867606684262, 'eval_ICM': -0.03284909186498679, 'eval_runtime': 2.4769, 'eval_samples_per_second': 136.458, 'eval_steps_per_second': 2.422, 'epoch': 7.0}


True

### LoRA

In [None]:
base_model = "pysentimiento/robertuito-sentiment-analysis"

lora_config = {
    "num_train_epochs": 12,                   # Ligeramente más para mejorar el ajuste
    "learning_rate": 2e-5,                    # Ajustado para fine-tuning más fino
    "per_device_train_batch_size": 32,        # Más bajo para prevenir overfitting
    "per_device_eval_batch_size": 64,
    "warmup_steps": 300,                      # Transición más suave
    "early_stopping_patience": 3,
    "r": 32,
    "lora_alpha": 16,
    "lora_dropout": 0.2,
    "bias": "all",
    "init_lora_weights": True,
    "output_dir": "./results",
    "save_full_model": False,
    "ignore_mismatched_sizes": True,
    "save_lora_weights": False,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "lr_scheduler_type": "cosine"
}

# Entrenamiento y evaluación usando LoRA
_, validation_metrics = run_sexism_pipeline_with_lora(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=base_model,
    num_labels=5,
    prob_type="multi_label_classification",
    **lora_config
)

# Guardar resultados de evaluación
drive_path = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(drive_path, exist_ok=True)
export_evaluation_to_file(validation_metrics, f"{drive_path}/eval_{base_model.replace('/', '_')}_lora_task_3.json")

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/925 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/435M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6851,0.672049,0.627782,0.644557,0.586938,-0.316812
2,0.6594,0.657148,0.518721,0.549999,0.480509,-0.611206
3,0.657,0.649642,0.536486,0.540155,0.503192,-0.570832
4,0.6407,0.641623,0.749166,0.537933,0.51888,-0.576501


2025-04-14 16:08:05,187 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:08:05,248 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:08:40,706 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:08:40,765 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:09:15,994 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:09:16,057 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:09:51,745 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:09:51,805 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method


2025-04-14 16:09:55,122 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:09:55,182 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation metrics: {'eval_loss': 0.6720494627952576, 'eval_precision_macro': 0.6277820589934715, 'eval_recall_macro': 0.644557368471345, 'eval_f1_macro': 0.58693788895729, 'eval_ICM': -0.31681152170485843, 'eval_runtime': 2.6286, 'eval_samples_per_second': 128.586, 'eval_steps_per_second': 2.283, 'epoch': 4.0}


True

In [None]:
base_model = "finiteautomata/beto-sentiment-analysis"

lora_config = {
    "num_train_epochs": 10,
    "learning_rate": 1e-3,                     # Tasa más alta para probar ajustes más agresivos
    "per_device_train_batch_size": 64,
    "per_device_eval_batch_size": 64,
    "warmup_steps": 100,
    "early_stopping_patience": 2,
    "r": 128,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "bias": "lora_only",
    "init_lora_weights": True,
    "ignore_mismatched_sizes": True,
    "output_dir": "./results",
    "save_full_model": False,
    "save_lora_weights": False,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "lr_scheduler_type": "cosine"
}

# Entrenamiento y evaluación usando LoRA
_, validation_metrics = run_sexism_pipeline_with_lora(
    EnTrainTask3,
    EnDevTask3,
    test_data=None,
    base_model=base_model,
    num_labels=5,
    prob_type="multi_label_classification",
    **lora_config
)

# Guardar resultados de evaluación
drive_path = "/content/drive/MyDrive/LNR/eval_results"
os.makedirs(drive_path, exist_ok=True)
export_evaluation_to_file(validation_metrics, f"{drive_path}/eval_{base_model.replace('/', '_')}_lora_task_3.json")


tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/481k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at finiteautomata/beto-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6688,0.662199,0.389605,0.504056,0.43144,-0.738195
2,0.6522,0.636778,0.699487,0.511365,0.464475,-0.647418
3,0.6293,0.622914,0.69561,0.58968,0.60848,-0.364827
4,0.5988,0.616419,0.680441,0.639876,0.653458,-0.419533
5,0.5672,0.628403,0.733801,0.539777,0.598943,-0.862779
6,0.542,0.608661,0.703249,0.609456,0.646073,-0.720948


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

2025-04-14 16:10:49,138 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:10:49,199 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:11:25,053 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:11:25,113 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:12:00,442 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:12:00,503 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:12:36,216 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:12:36,316 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-14 16:13:11,883 - pyevall.evaluation - INFO -   

2025-04-14 16:13:50,925 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-04-14 16:13:51,020 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation metrics: {'eval_loss': 0.615365207195282, 'eval_precision_macro': 0.6833205570237986, 'eval_recall_macro': 0.6356225136454056, 'eval_f1_macro': 0.6520544172825807, 'eval_ICM': -0.448456961602395, 'eval_runtime': 2.9164, 'eval_samples_per_second': 115.897, 'eval_steps_per_second': 2.057, 'epoch': 6.0}


True

# Show Results

In [None]:
# COMPLETE
import os
import json

def cargar_resultados(modelname, path_resultados):
    resultados = {
        "FineTuned": {},
        "LoRA": {}
    }

    for filename in os.listdir(path_resultados):
        if modelname in filename and filename.endswith(".json"):
            with open(os.path.join(path_resultados, filename), "r") as f:
                data = json.load(f)

            if "fine-tunning" in filename:
                resultados["FineTuned"]["subtask3"] = {
                    "eval_f1_macro": data.get("eval_f1_macro"),
                    "eval_runtime": data.get("eval_runtime"),
                    "eval_ICM": data.get("eval_ICM")
                }

            elif "lora" in filename:
                resultados["LoRA"]["subtask3"] = {
                    "eval_f1_macro": data.get("eval_f1_macro"),
                    "eval_runtime": data.get("eval_runtime"),
                    "eval_ICM": data.get("eval_ICM")
                }

    return resultados
def mostrar_resultados(modelname, path_resultados, idioma="English"):
    resultados = cargar_resultados(modelname, path_resultados)
    print(f"\nResultados para el modelo: {modelname} [{idioma}]")

    print("Fine-tuning:")
    if "subtask3" in resultados["FineTuned"]:
        r = resultados["FineTuned"]["subtask3"]
        print(f"\tSubtask 3 - ICM: {r['eval_ICM']} | F1-macro: {r['eval_f1_macro']} | Runtime: {r['eval_runtime']}s/epoch")
    else:
        print("\tSin resultados.")

    print("LoRA:")
    if "subtask3" in resultados["LoRA"]:
        r = resultados["LoRA"]["subtask3"]
        print(f"\tSubtask 3 - ICM: {r['eval_ICM']} | F1-macro: {r['eval_f1_macro']} | Runtime: {r['eval_runtime']}s/epoch")
    else:
        print("\tSin resultados.")


In [None]:
mostrar_resultados("bert-base-uncased", "/content/drive/MyDrive/LNR/eval_results/")


Resultados para el modelo: bert-base-uncased [English]
Fine-tuning:
	Subtask 3 - ICM: 0.22177640634325307 | F1-macro: 0.710533609167568 | Runtime: 2.5105s/epoch
LoRA:
	Sin resultados.


In [None]:
mostrar_resultados("twitter-roberta-base-sentiment-latest","/content/drive/MyDrive/LNR/eval_results/")


Resultados para el modelo: twitter-roberta-base-sentiment-latest [English]
Fine-tuning:
	Subtask 3 - ICM: -0.018151818686212923 | F1-macro: 0.7129087416883032 | Runtime: 2.532s/epoch
LoRA:
	Subtask 3 - ICM: -0.07682977227107586 | F1-macro: 0.6915515289346169 | Runtime: 2.7291s/epoch


In [None]:
mostrar_resultados("robertuito-sentiment-analysis", "/content/drive/MyDrive/LNR/eval_results/", "Spanish")


Resultados para el modelo: robertuito-sentiment-analysis [Spanish]
Fine-tuning:
	Subtask 3 - ICM: 0.2304578656780612 | F1-macro: 0.7166071004461732 | Runtime: 2.6291s/epoch
LoRA:
	Subtask 3 - ICM: -0.31681152170485843 | F1-macro: 0.58693788895729 | Runtime: 2.6286s/epoch


In [None]:
mostrar_resultados("beto-sentiment-analysis", "/content/drive/MyDrive/LNR/eval_results/","Spanish")


Resultados para el modelo: beto-sentiment-analysis [Spanish]
Fine-tuning:
	Subtask 3 - ICM: -0.03284909186498679 | F1-macro: 0.7105867606684262 | Runtime: 2.4769s/epoch
LoRA:
	Subtask 3 - ICM: -0.448456961602395 | F1-macro: 0.6520544172825807 | Runtime: 2.9164s/epoch
