In [None]:
import os
import gc
import pandas as pd
import numpy as np
import shutil
import torch
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import random
import json
import re


from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback, AutoConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm
from torch.nn import CrossEntropyLoss
from huggingface_hub import create_repo, upload_folder, notebook_login
from utils_dl import set_global_seed

try: 
    from sklearn.model_selection import StratifiedGroupKFold
    GROUP_SPLITTER = StratifiedGroupKFold(n_splits=8,
                                          shuffle=True,
                                          random_state=42)
except ImportError as e:
    print('ImportError:', e)
    GROUP_SPLITTER = None  # fallback later

torch.cuda.empty_cache()

In [None]:
SEED=42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SUBTASK_1_PATH = "new_data\subtask1"

set_global_seed(SEED)
notebook_login() # Log into HF account

In [16]:
# Configuration parameters
language = 'spa'
model_type = 'dl'
stemming = False
lemmatization = False
remove_duplicates = True
cased = True 

data_config = f"lang_{language}_model_{model_type}_stem_{stemming}_lem_{lemmatization}_dup_{remove_duplicates}_cased_{cased}"
file_name = 'subtask1_balanced_aug_v2' 
db_file_name = f"{file_name}_{data_config}.csv"
file_path = os.path.join(SUBTASK_1_PATH, db_file_name)

if os.path.exists(file_path):
    full_data = pd.read_csv(file_path, encoding='utf-8')
else:
    raise FileNotFoundError(f"File not found at {file_path}")

In [17]:
full_data["id"] = full_data["id"].astype(str)

In [None]:
text_column  = "lyrics_clean"
label_column = "label"
group_column = "id"             # all augmented variants share this id
aug_col      = "is_augmented"   
final_training = False 
val_split_size = 0.1 if final_training else 0.2

if full_data[label_column].dtype == object:
    unique_labels = sorted(full_data[label_column].unique())
    label2id = {lbl: idx for idx, lbl in enumerate(unique_labels)}
else:
    unique_labels = sorted(full_data[label_column].unique())
    label2id = {int(lbl): int(lbl) for lbl in unique_labels}

id2label = {v: k for k, v in label2id.items()}
full_data[label_column] = full_data[label_column].map(label2id)

full_data = full_data.sample(frac = 1, random_state=SEED) 


gss = GroupShuffleSplit(n_splits=1, test_size=val_split_size, random_state=SEED)
train_val_idx, test_idx = next(gss.split(full_data, groups=full_data[group_column]))

train_val_df = full_data.iloc[train_val_idx].reset_index(drop=True)
test_df      = full_data.iloc[test_idx].reset_index(drop=True)

if not final_training:

    if GROUP_SPLITTER:
        sgroups = train_val_df[group_column].values
        ylabels = train_val_df[label_column].values
        trn_idx, val_idx = next(GROUP_SPLITTER.split(np.zeros(len(train_val_df)),
                                                     ylabels, groups=sgroups))
    else:
        print("StratifiedGroupKFold not available, using GroupShuffleSplit")
        gss2 = GroupShuffleSplit(n_splits=1, test_size=0.10, random_state=SEED)
        trn_idx, val_idx = next(gss2.split(train_val_df,
                                           groups=train_val_df[group_column]))
    
    train_df = train_val_df.iloc[trn_idx].reset_index(drop=True)
    val_df   = train_val_df.iloc[val_idx].reset_index(drop=True)
    
    
else:
    train_df = train_val_df.copy() 
    val_df = test_df.copy()


In [19]:
if not final_training:
    val_df  = val_df[val_df[aug_col]  != True]
    test_df = test_df[test_df[aug_col] != True]
else:
    val_df  = val_df[val_df[aug_col]  != True]

In [None]:
save_datasets = True # if True, save songs ids of each split in a separate file. This is useful for training all models with the same dataset splits and get a fair comparison of the results
if not final_training: 

    train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
    val_dataset  = Dataset.from_pandas(val_df.reset_index(drop=True))
    test_dataset  = Dataset.from_pandas(test_df.reset_index(drop=True))
    
    ds = DatasetDict({
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    })
    if save_datasets:
        splits_folder_path =  os.path.join(SUBTASK_1_PATH, "splits") 
        if not os.path.exists(splits_folder_path):
            os.makedirs(splits_folder_path)
            
        train_df.to_csv(os.path.join(splits_folder_path, 'train.csv'), index=False, encoding='utf-8')
        unique_ids_df = pd.DataFrame(train_df['id'].unique(), columns=['id'])
        unique_ids_df.to_csv(os.path.join(splits_folder_path, 'train_ids.csv'), index=False, encoding='utf-8')
        
        val_df.to_csv(os.path.join(splits_folder_path, 'val.csv'), index=False, encoding='utf-8')
        unique_ids_df = pd.DataFrame(val_df['id'].unique(), columns=['id'])
        unique_ids_df.to_csv(os.path.join(splits_folder_path, 'val_ids.csv'), index=False, encoding='utf-8')
        
        test_df.to_csv(os.path.join(splits_folder_path, 'test.csv'), index=False, encoding='utf-8')
        unique_ids_df = pd.DataFrame(test_df['id'].unique(), columns=['id'])
        unique_ids_df.to_csv(os.path.join(splits_folder_path, 'test_ids.csv'), index=False, encoding='utf-8')
    
    
else: 
    
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset  = Dataset.from_pandas(val_df)
    
    ds = DatasetDict({
        'train': train_dataset,
        'val': val_dataset
    })
    
    if save_datasets:
        splits_folder_path =  os.path.join(SUBTASK_1_PATH, "splits") 
        if not os.path.exists(splits_folder_path):
            os.makedirs(splits_folder_path)
            
        train_df.to_csv(os.path.join(splits_folder_path, 'train_competition.csv'), index=False, encoding='utf-8')
        unique_ids_df = pd.DataFrame(train_df['id'].unique(), columns=['id'])
        unique_ids_df.to_csv(os.path.join(splits_folder_path, 'train_competition_ids.csv'), index=False, encoding='utf-8')
        
        val_df.to_csv(os.path.join(splits_folder_path, 'val_competition.csv'), index=False, encoding='utf-8')
        unique_ids_df = pd.DataFrame(val_df['id'].unique(), columns=['id'])
        unique_ids_df.to_csv(os.path.join(splits_folder_path, 'val_competition_ids.csv'), index=False, encoding='utf-8')
    

## Text Representation

In [None]:
MODEL_CHECKPOINT = "PlanTL-GOB-ES/roberta-base-bne"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True) # 

In [None]:
def tokenize_function(examples):
    return tokenizer(
        examples[text_column],
        padding=False,  # dynamic later
        truncation=True
)

tokenized_ds = ds.map(tokenize_function, batched=True) 


columns_to_keep = {"input_ids", "attention_mask", label_column}
for split in tokenized_ds.keys():    
    tokenized_ds[split] = tokenized_ds[split].remove_columns([col for col in tokenized_ds[split].column_names if col not in columns_to_keep])
    tokenized_ds[split] = tokenized_ds[split].rename_column(label_column, "labels")
    tokenized_ds[split].set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer,
                                        pad_to_multiple_of=8,   
                                        return_tensors="pt")

## Model Learning (HPO - Optuna)

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(unique_labels)),
    y=train_df[label_column].to_numpy()
)
weight_tensor = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", weight_tensor)

class BalancedTrainer(Trainer):
    def __init__(self, *args, weight_tensor: torch.Tensor = None, **kwargs):
        super().__init__(*args, **kwargs)
        if weight_tensor is None:
            raise ValueError("You must pass weight_tensor")
        self.weight_tensor = weight_tensor

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # pop labels and forward
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weight = self.weight_tensor.to(model.device)
        loss_fct = CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

Class weights: tensor([1.0262, 0.9751])


In [26]:
num_labels = len(unique_labels)
optuna_path = "./hpo_{}".format(MODEL_CHECKPOINT.replace("/","_"))
if os.path.exists(optuna_path):
    shutil.rmtree(optuna_path)

In [None]:
def objective(trial):
    lr     = trial.suggest_float("learning_rate", 1e-6, 5e-5, log=True)
    wd     = trial.suggest_float("weight_decay", 0.0, 0.3)
    bsz    = trial.suggest_categorical("batch_size", [16, 32])
    warmup = trial.suggest_float("warmup_ratio", 0.01, 0.12)
    drop_h = trial.suggest_float("hidden_dropout", 0.05, 0.4)
    drop_a = trial.suggest_float("attn_dropout", 0.05, 0.4)
    drop_c = trial.suggest_float("classifier_dropout", 0.05, 0.4)

    def model_init():
        cfg = AutoConfig.from_pretrained(
          MODEL_CHECKPOINT,
          num_labels=len(unique_labels),
          label2id=label2id,
          id2label=id2label,
          hidden_dropout_prob=drop_h,
          attention_probs_dropout_prob=drop_a,
          classifier_dropout=drop_c,
        )
        return AutoModelForSequenceClassification.from_pretrained(
          MODEL_CHECKPOINT, config=cfg
        )

    args = TrainingArguments(
      output_dir                 = os.path.join(optuna_path, f"trial_{trial.number}"),
      eval_strategy              = "steps",
      eval_steps                 = 100,
      logging_strategy           = "steps",
      logging_steps              = 100,
      save_strategy              = "steps",
      save_steps                 = 100,
      save_total_limit           = 1,
      load_best_model_at_end     = True,
      metric_for_best_model      = "eval_f1",
      # gradient_accumulation_steps = max(1, 32 // bsz), 
      greater_is_better          = True,
      learning_rate              = lr,
      weight_decay               = wd,
      per_device_train_batch_size= bsz,
      per_device_eval_batch_size = bsz,
      warmup_ratio               = warmup,
      num_train_epochs           = 12,
      fp16                       = True,
      seed                       = SEED,
      report_to                  = "none",
    )

    trainer = BalancedTrainer(
      model_init     = model_init,
      args           = args,
      train_dataset  = tokenized_ds["train"],
      eval_dataset   = tokenized_ds["val"],
      tokenizer      = tokenizer,
      data_collator  = data_collator,
      compute_metrics= compute_metrics,
      weight_tensor  = weight_tensor,        
      callbacks      = [
        EarlyStoppingCallback(early_stopping_patience=4),
      ],
    )

    trainer.train()
    metrics = trainer.evaluate()
    gc.collect(); torch.cuda.empty_cache()
    return metrics["eval_f1"]

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

#### Save model in Hugging Face Hub

In [None]:

best_trial = study.best_trial.number
trial_dir = os.path.join(optuna_path, f"trial_{best_trial}")

state_fp = os.path.join(trial_dir, "trainer_state.json")

if not os.path.isfile(state_fp):
    pat  = re.compile(r"checkpoint-(\d+)$")
    ckpt_dirs = [d for d in os.listdir(trial_dir) if pat.match(d)]
    if not ckpt_dirs:
        raise FileNotFoundError(f"No checkpoints found in {trial_dir}")

    newest = max(ckpt_dirs, key=lambda d: int(pat.match(d).group(1)))
    state_fp = os.path.join(trial_dir, newest, "trainer_state.json")

    if not os.path.isfile(state_fp):
        raise FileNotFoundError("trainer_state.json not found even in checkpoint.")

with open(state_fp) as f:
    j = json.load(f)
    best_ckpt_path = j["best_model_checkpoint"]
    best_epoch = j["epoch"]

print("Best checkpoint:", best_ckpt_path)

Best checkpoint: ./hpo_PlanTL-GOB-ES_roberta-base-bne/trial_10/checkpoint-1400


In [None]:
model     = AutoModelForSequenceClassification.from_pretrained(best_ckpt_path)
tokenizer = AutoTokenizer.from_pretrained(best_ckpt_path)
best_params = study.best_trial.params 
best_params['epoch'] = best_epoch

repo_id = f"SeTo97/{MODEL_CHECKPOINT.split('/')[-1]}_ft_70"
create_repo(repo_id, private=True, repo_type="model", exist_ok=True)

upload_folder(
    repo_id        = repo_id,
    folder_path    = best_ckpt_path,      
    repo_type      = "model",
    commit_message = (
        f"Best F1={study.best_trial.value:.4f} "
        f"(trial {study.best_trial.number})\n"
        f"Hyper‑parameters: {best_params}"
    ),
)

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

optimizer.pt:   0%|          | 0.00/997M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SeTo97/roberta-base-bne_ft_70/commit/e932916ea207bd095b91f68f5db2dc9efeb8b074', commit_message="Best F1=0.7795 (trial 10)\nHyper‑parameters: {'learning_rate': 4.437063943360846e-06, 'weight_decay': 0.0017391423354692903, 'batch_size': 32, 'warmup_ratio': 0.09464137213269672, 'hidden_dropout': 0.053138253462458006, 'attn_dropout': 0.2735895207101088, 'classifier_dropout': 0.39340169178729456, 'epoch': 10.447761194029852}", commit_description='', oid='e932916ea207bd095b91f68f5db2dc9efeb8b074', pr_url=None, repo_url=RepoUrl('https://huggingface.co/SeTo97/roberta-base-bne_ft_70', endpoint='https://huggingface.co', repo_type='model', repo_id='SeTo97/roberta-base-bne_ft_70'), pr_revision=None, pr_num=None)

## External Evaluation (held-out test set)

In [None]:
model_repo = repo_id

model = AutoModelForSequenceClassification.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(model_repo)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

test_dataset = tokenized_ds["test"]


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator  
)

results = trainer.evaluate(test_dataset)

print("Test Set Evaluation:")
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")


  trainer = Trainer(


Test Set Evaluation:
eval_loss: 0.4407
eval_model_preparation_time: 0.0025
eval_accuracy: 0.8112
eval_precision: 0.8002
eval_recall: 0.7835
eval_f1: 0.7902
eval_runtime: 0.6785
eval_samples_per_second: 655.8650
eval_steps_per_second: 82.5360
