## Ricarca e Hyperparameter Optimization (HPO) con Optuna

In [1]:
import json
import numpy as np
from datasets import Dataset, DatasetDict

import torch
from transformers import (
    LlamaForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType


In [2]:
# 1) Prepara tokenizer e dati

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    use_fast=True,
    trust_remote_code=True
)

if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})


In [3]:
# 2) Carica e splitta il JSON aumentato (o l’originale se non hai fatto paraphrase)
path = "checkpoints/train_aug.json"
records = json.load(open(path, encoding="utf-8"))

# Dataset HF e split 95/5
ds = Dataset.from_list(records)
ds = ds.train_test_split(test_size=0.05, seed=42)


In [4]:
# 3) Tokenizzazione record-wise
def tokenize_fn(ex):
    prompt = (
            "Base caption: "  + ex["caption"] + "\n" +
            "Emotion: "       + ex["emotion"] + "\n" +
            "Rewrite the above caption to fit the specified emotion:\n"
    )
    tok_inp = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tok_tgt = tokenizer(
        ex["caption"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tok_inp["labels"] = tok_tgt["input_ids"]
    return tok_inp

tokenized = ds.map(
    tokenize_fn,
    batched=False,                  # un record alla volta
    remove_columns=["img_name","caption","emotion"]
)

# DatasetDict per Trainer
tokenized_ds = DatasetDict({
    "train": tokenized["train"],
    "test" : tokenized["test"]
})


Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [5]:
# 4) model_init() con LoRA
def model_init():
    model = LlamaForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-3B-Instruct",
        torch_dtype=torch.float16
    )
    lora_conf = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8, lora_alpha=16, lora_dropout=0.05,
        target_modules=["q_proj","v_proj"]
    )
    return get_peft_model(model, lora_conf)

In [6]:
# 5) Funzione di metriche (token-level accuracy)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    mask  = labels != -100
    correct = (preds == labels) & mask
    acc = correct.sum() / mask.sum() if mask.sum()>0 else 0.0
    return {"accuracy": float(acc)}

In [7]:
# 6) Spazio di ricerca (Optuna)
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 3),
        "per_device_train_batch_size": trial.suggest_categorical(
            "per_device_train_batch_size", [1,2,4]
        )
    }

In [10]:
# 7) Imposta Trainer e lancia hyperparameter_search
base_args = TrainingArguments(
    output_dir="hpo",
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model_init=model_init,
    args=base_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    compute_metrics=compute_metrics
)

best_run = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    n_trials=10
)
print("Migliori iperparametri:", best_run.hyperparameters)
print("Best metric:", best_run.objective)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[I 2025-06-14 17:11:56,657] A new study created in memory with name: no-name-ad14d057-abf4-4d68-8c85-dfe186e0d2c5
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[W 2025-06-14 17:12:15,836] Trial 0 failed with parameters: {'learning_rate': 0.0001440784870968053, 'num_train_epochs': 3, 'per_device_train_batch_size': 2} because of the following error: RuntimeError('CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n').
Traceback (most recent call last):
  File "C:\Users\jinet\anaconda3\envs\whisperGPUNuovo\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\jinet\anaconda3\envs\whisperGPUNuovo\lib\site-packages\transformers\integrations\integration_utils.py", line 255, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "C:\Users\jinet\anaconda3\envs\whisperGPUNuovo\lib\site-packages\transformers\trainer.py", lin

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
