In [7]:
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy
import optuna

In [8]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight):
    """
    Build a Gymnasium ``HalfCheetah-v5`` environment with the given settings.

    Each argument is forwarded unchanged to ``gym.make`` as the keyword
    argument of the same name.

    Returns:
        Whatever ``gym.make`` returns for this configuration.
    """
    env_kwargs = {
        "reset_noise_scale": reset_noise_scale,
        "forward_reward_weight": forward_reward_weight,
        "ctrl_cost_weight": ctrl_cost_weight,
    }
    return gym.make("HalfCheetah-v5", **env_kwargs)

# Optuna hyperparameter tuning for SAC
def objective(trial, total_timesteps=30_000, n_eval_episodes=60):
    """
    Optuna objective: train SAC on a sampled HalfCheetah-v5 setup and score it.

    Samples three environment parameters and the SAC hyperparameters from
    ``trial``, trains a fresh ``SAC("MlpPolicy")`` model on a single
    ``VecNormalize``-wrapped environment, then evaluates it deterministically
    on that same environment with reward normalization switched off.

    Args:
        trial: Optuna trial used to sample every parameter.
        total_timesteps: Training budget passed to ``model.learn``.
        n_eval_episodes: Number of episodes passed to ``evaluate_policy``.

    Returns:
        The mean episode reward reported by ``evaluate_policy``.

    NOTE(review): the evaluation environment is built with the sampled
    ``forward_reward_weight`` / ``ctrl_cost_weight``, so scores from different
    trials are presumably not on a common reward scale -- confirm that tuning
    these together with the SAC hyperparameters is intended.
    """
    # Environment parameters
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.005, 0.3)
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.7, 1.5)
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.02, 0.3)

    # SAC hyperparameters.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-3, log=True)
    buffer_size = trial.suggest_categorical('buffer_size', [100000, 300000, 500000])
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256, 512])
    tau = trial.suggest_float('tau', 0.005, 0.02)
    gamma = trial.suggest_float('gamma', 0.95, 0.9999)

    # Entropy coefficient: either a fixed value or "auto"
    ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.001, 0.01, 0.05])

    # Training frequency and gradient steps
    train_freq = trial.suggest_categorical('train_freq', [1, 64, 256, 512])
    gradient_steps = trial.suggest_categorical('gradient_steps', [1, 8, 16, 32])

    # Build the environment inside the factory so the lambda does not depend
    # on a local name that is rebound right after it is created.
    vec_env = DummyVecEnv(
        [lambda: make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight)]
    )
    vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.)

    try:
        # Create and train the SAC model
        model = SAC(
            "MlpPolicy",
            vec_env,
            learning_rate=learning_rate,
            buffer_size=buffer_size,
            batch_size=batch_size,
            tau=tau,
            gamma=gamma,
            train_freq=train_freq,          # Passed through as SAC's train_freq
            gradient_steps=gradient_steps,  # Passed through as SAC's gradient_steps
            ent_coef=ent_coef,
            verbose=0,
        )

        model.learn(total_timesteps=total_timesteps)

        # Freeze the normalization statistics and report raw rewards for evaluation
        vec_env.training = False
        vec_env.norm_reward = False

        # Deterministic evaluation over n_eval_episodes episodes
        mean_reward, _ = evaluate_policy(
            model, vec_env, n_eval_episodes=n_eval_episodes, deterministic=True
        )
    finally:
        # Release the environment even if training or evaluation fails, so
        # repeated trials do not accumulate open environments.
        vec_env.close()

    print(f'Mean reward: {mean_reward}')
    return mean_reward

# Create an Optuna study that maximizes the objective's return value,
# then run 40 trials of `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=40)

# Print the best hyperparameters found
print("Best hyperparameters:", study.best_params)


[I 2025-02-23 12:09:32,075] A new study created in memory with name: no-name-4fedb24e-23b0-4a7e-a95a-11a19de9b6b5
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)
[I 2025-02-23 12:09:50,441] Trial 0 finished with value: -1.955694197690328 and parameters: {'reset_noise_scale': 0.13878910117362878, 'forward_reward_weight': 1.4650717081534432, 'ctrl_cost_weight': 0.2885979897914906, 'learning_rate': 8.629085657598991e-06, 'buffer_size': 500000, 'batch_size': 128, 'tau': 0.007169229553311472, 'gamma': 0.9507766264673688, 'ent_coef': 0.05, 'train_freq': 256, 'gradient_steps': 1}. Best is trial 0 with value: -1.955694197690328.


Mean reward: -1.955694197690328


[I 2025-02-23 12:10:16,637] Trial 1 finished with value: -49.78990560391394 and parameters: {'reset_noise_scale': 0.08443024244425591, 'forward_reward_weight': 1.1797300236439838, 'ctrl_cost_weight': 0.14916472656689986, 'learning_rate': 8.9542020972875e-06, 'buffer_size': 100000, 'batch_size': 256, 'tau': 0.009848926871440445, 'gamma': 0.9620054855796288, 'ent_coef': 0.01, 'train_freq': 512, 'gradient_steps': 32}. Best is trial 0 with value: -1.955694197690328.


Mean reward: -49.78990560391394


[I 2025-02-23 12:10:39,868] Trial 2 finished with value: -1.4428011602217927 and parameters: {'reset_noise_scale': 0.20069101158791108, 'forward_reward_weight': 1.472253859983883, 'ctrl_cost_weight': 0.03868385263270402, 'learning_rate': 9.18405793543639e-06, 'buffer_size': 100000, 'batch_size': 512, 'tau': 0.019060006655289487, 'gamma': 0.9593327700351577, 'ent_coef': 'auto', 'train_freq': 256, 'gradient_steps': 8}. Best is trial 2 with value: -1.4428011602217927.


Mean reward: -1.4428011602217927


[I 2025-02-23 12:11:15,025] Trial 3 finished with value: -22.072688845619883 and parameters: {'reset_noise_scale': 0.039771086891144634, 'forward_reward_weight': 1.4742172710145043, 'ctrl_cost_weight': 0.13083133419376947, 'learning_rate': 0.00011502620305512281, 'buffer_size': 500000, 'batch_size': 256, 'tau': 0.007306797439651235, 'gamma': 0.9946552499804332, 'ent_coef': 0.05, 'train_freq': 64, 'gradient_steps': 8}. Best is trial 2 with value: -1.4428011602217927.


Mean reward: -22.072688845619883


[I 2025-02-23 12:11:35,631] Trial 4 finished with value: -0.5114571955454305 and parameters: {'reset_noise_scale': 0.10175244498924187, 'forward_reward_weight': 0.8461969577340085, 'ctrl_cost_weight': 0.04512033022702447, 'learning_rate': 0.0007612398762804588, 'buffer_size': 100000, 'batch_size': 512, 'tau': 0.008020302523431019, 'gamma': 0.9979769322574884, 'ent_coef': 'auto', 'train_freq': 512, 'gradient_steps': 8}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -0.5114571955454305


[I 2025-02-23 12:11:55,541] Trial 5 finished with value: -14.206998103794346 and parameters: {'reset_noise_scale': 0.0756957041368532, 'forward_reward_weight': 0.9090408778470807, 'ctrl_cost_weight': 0.23409728560745854, 'learning_rate': 6.206264390312021e-06, 'buffer_size': 500000, 'batch_size': 256, 'tau': 0.006010029747528201, 'gamma': 0.9554254804553548, 'ent_coef': 0.001, 'train_freq': 64, 'gradient_steps': 1}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -14.206998103794346


[I 2025-02-23 12:13:00,083] Trial 6 finished with value: -2.7613519838095955 and parameters: {'reset_noise_scale': 0.2635285724348147, 'forward_reward_weight': 1.451909348137433, 'ctrl_cost_weight': 0.04319501953057785, 'learning_rate': 1.3425980866413072e-06, 'buffer_size': 300000, 'batch_size': 64, 'tau': 0.017738372445502786, 'gamma': 0.9762075642301418, 'ent_coef': 0.05, 'train_freq': 64, 'gradient_steps': 32}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -2.7613519838095955


[I 2025-02-23 12:13:20,773] Trial 7 finished with value: -192.4491006567905 and parameters: {'reset_noise_scale': 0.020390472335642812, 'forward_reward_weight': 1.2932692574615952, 'ctrl_cost_weight': 0.25744391872473693, 'learning_rate': 0.00039677326825136405, 'buffer_size': 500000, 'batch_size': 64, 'tau': 0.017400808185691386, 'gamma': 0.9625462215890442, 'ent_coef': 0.01, 'train_freq': 256, 'gradient_steps': 8}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -192.4491006567905
