In [7]:
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy
import optuna

In [8]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight):
    """
    Build a Gymnasium ``HalfCheetah-v5`` environment with the given settings.

    Args:
        reset_noise_scale: forwarded to ``gym.make`` as ``reset_noise_scale``.
        forward_reward_weight: forwarded to ``gym.make`` as ``forward_reward_weight``.
        ctrl_cost_weight: forwarded to ``gym.make`` as ``ctrl_cost_weight``.

    Returns:
        The environment object returned by ``gym.make``.
    """
    # Collect the environment options first so the creation call stays short.
    env_kwargs = {
        "reset_noise_scale": reset_noise_scale,
        "forward_reward_weight": forward_reward_weight,
        "ctrl_cost_weight": ctrl_cost_weight,
    }
    return gym.make("HalfCheetah-v5", **env_kwargs)

# Hyperparameter tuning with Optuna for SAC
def objective(trial):
    """
    Optuna objective: train a SAC agent on HalfCheetah-v5 and score it.

    Samples three environment parameters and the SAC hyperparameters from
    ``trial``, trains for 30,000 timesteps on a single normalized environment,
    then evaluates the deterministic policy for 60 episodes.

    Args:
        trial: the Optuna trial used to sample every parameter.

    Returns:
        The mean episode reward reported by ``evaluate_policy``.

    NOTE(review): evaluation runs on the same environment that was built with
    the trial-sampled ``forward_reward_weight`` / ``ctrl_cost_weight``, so the
    returned score depends on those weights and may not be comparable across
    trials -- confirm this is intended.
    """
    # Environment parameters
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.005, 0.3)
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.7, 1.5)
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.02, 0.3)

    # Create the environment inside the factory itself, so the lambda does not
    # close over a local name that is rebound right after (the original
    # `lambda: env` only worked because DummyVecEnv calls it immediately).
    env = DummyVecEnv(
        [lambda: make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight)]
    )
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    try:
        # SAC hyperparameters.
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-3, log=True)
        buffer_size = trial.suggest_categorical('buffer_size', [100000, 300000, 500000])
        batch_size = trial.suggest_categorical('batch_size', [64, 128, 256, 512])
        tau = trial.suggest_float('tau', 0.005, 0.02)
        gamma = trial.suggest_float('gamma', 0.95, 0.9999)

        # Entropy coefficient: either a fixed value or "auto" (learned by SAC),
        # exposed to Optuna as a categorical choice.
        ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.001, 0.01, 0.05])

        # Training frequency and gradient steps
        train_freq = trial.suggest_categorical('train_freq', [1, 64, 256, 512])
        gradient_steps = trial.suggest_categorical('gradient_steps', [1, 8, 16, 32])

        # Create and train the SAC model
        model = SAC(
            "MlpPolicy",
            env,
            learning_rate=learning_rate,
            buffer_size=buffer_size,
            batch_size=batch_size,
            tau=tau,
            gamma=gamma,
            train_freq=train_freq,          # Environment steps collected between updates
            gradient_steps=gradient_steps,  # Gradient steps performed at each update
            ent_coef=ent_coef,
            verbose=0,
        )

        model.learn(total_timesteps=30_000)

        # Freeze the normalization statistics and report unnormalized rewards
        # during evaluation.
        env.training = False
        env.norm_reward = False

        # Evaluate the model deterministically over 60 episodes
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=60, deterministic=True)
    finally:
        # Always release the environment, even when the trial fails or is
        # interrupted; otherwise every trial leaks one environment.
        env.close()

    print(f'Mean reward: {mean_reward}')
    return mean_reward

# Create the Optuna study and maximize the objective
study = optuna.create_study(direction='maximize')
try:
    study.optimize(objective, n_trials=40)
finally:
    # Report the best hyperparameters even when the search stops early (e.g. a
    # manual kernel interrupt); the original exception still propagates.
    # study.best_params raises when no trial has completed, so check first.
    if any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
        print("Best hyperparameters:", study.best_params)
    else:
        print("No completed trials; best hyperparameters are not available.")


[I 2025-02-23 12:09:32,075] A new study created in memory with name: no-name-4fedb24e-23b0-4a7e-a95a-11a19de9b6b5
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)
[I 2025-02-23 12:09:50,441] Trial 0 finished with value: -1.955694197690328 and parameters: {'reset_noise_scale': 0.13878910117362878, 'forward_reward_weight': 1.4650717081534432, 'ctrl_cost_weight': 0.2885979897914906, 'learning_rate': 8.629085657598991e-06, 'buffer_size': 500000, 'batch_size': 128, 'tau': 0.007169229553311472, 'gamma': 0.9507766264673688, 'ent_coef': 0.05, 'train_freq': 256, 'gradient_steps': 1}. Best is trial 0 with value: -1.955694197690328.


Mean reward: -1.955694197690328


[I 2025-02-23 12:10:16,637] Trial 1 finished with value: -49.78990560391394 and parameters: {'reset_noise_scale': 0.08443024244425591, 'forward_reward_weight': 1.1797300236439838, 'ctrl_cost_weight': 0.14916472656689986, 'learning_rate': 8.9542020972875e-06, 'buffer_size': 100000, 'batch_size': 256, 'tau': 0.009848926871440445, 'gamma': 0.9620054855796288, 'ent_coef': 0.01, 'train_freq': 512, 'gradient_steps': 32}. Best is trial 0 with value: -1.955694197690328.


Mean reward: -49.78990560391394


[I 2025-02-23 12:10:39,868] Trial 2 finished with value: -1.4428011602217927 and parameters: {'reset_noise_scale': 0.20069101158791108, 'forward_reward_weight': 1.472253859983883, 'ctrl_cost_weight': 0.03868385263270402, 'learning_rate': 9.18405793543639e-06, 'buffer_size': 100000, 'batch_size': 512, 'tau': 0.019060006655289487, 'gamma': 0.9593327700351577, 'ent_coef': 'auto', 'train_freq': 256, 'gradient_steps': 8}. Best is trial 2 with value: -1.4428011602217927.


Mean reward: -1.4428011602217927


[I 2025-02-23 12:11:15,025] Trial 3 finished with value: -22.072688845619883 and parameters: {'reset_noise_scale': 0.039771086891144634, 'forward_reward_weight': 1.4742172710145043, 'ctrl_cost_weight': 0.13083133419376947, 'learning_rate': 0.00011502620305512281, 'buffer_size': 500000, 'batch_size': 256, 'tau': 0.007306797439651235, 'gamma': 0.9946552499804332, 'ent_coef': 0.05, 'train_freq': 64, 'gradient_steps': 8}. Best is trial 2 with value: -1.4428011602217927.


Mean reward: -22.072688845619883


[I 2025-02-23 12:11:35,631] Trial 4 finished with value: -0.5114571955454305 and parameters: {'reset_noise_scale': 0.10175244498924187, 'forward_reward_weight': 0.8461969577340085, 'ctrl_cost_weight': 0.04512033022702447, 'learning_rate': 0.0007612398762804588, 'buffer_size': 100000, 'batch_size': 512, 'tau': 0.008020302523431019, 'gamma': 0.9979769322574884, 'ent_coef': 'auto', 'train_freq': 512, 'gradient_steps': 8}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -0.5114571955454305


[I 2025-02-23 12:11:55,541] Trial 5 finished with value: -14.206998103794346 and parameters: {'reset_noise_scale': 0.0756957041368532, 'forward_reward_weight': 0.9090408778470807, 'ctrl_cost_weight': 0.23409728560745854, 'learning_rate': 6.206264390312021e-06, 'buffer_size': 500000, 'batch_size': 256, 'tau': 0.006010029747528201, 'gamma': 0.9554254804553548, 'ent_coef': 0.001, 'train_freq': 64, 'gradient_steps': 1}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -14.206998103794346


[I 2025-02-23 12:13:00,083] Trial 6 finished with value: -2.7613519838095955 and parameters: {'reset_noise_scale': 0.2635285724348147, 'forward_reward_weight': 1.451909348137433, 'ctrl_cost_weight': 0.04319501953057785, 'learning_rate': 1.3425980866413072e-06, 'buffer_size': 300000, 'batch_size': 64, 'tau': 0.017738372445502786, 'gamma': 0.9762075642301418, 'ent_coef': 0.05, 'train_freq': 64, 'gradient_steps': 32}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -2.7613519838095955


[I 2025-02-23 12:13:20,773] Trial 7 finished with value: -192.4491006567905 and parameters: {'reset_noise_scale': 0.020390472335642812, 'forward_reward_weight': 1.2932692574615952, 'ctrl_cost_weight': 0.25744391872473693, 'learning_rate': 0.00039677326825136405, 'buffer_size': 500000, 'batch_size': 64, 'tau': 0.017400808185691386, 'gamma': 0.9625462215890442, 'ent_coef': 0.01, 'train_freq': 256, 'gradient_steps': 8}. Best is trial 4 with value: -0.5114571955454305.


Mean reward: -192.4491006567905


[I 2025-02-23 13:24:42,928] Trial 8 finished with value: 593.879384215323 and parameters: {'reset_noise_scale': 0.18625446671300852, 'forward_reward_weight': 1.4395544177070303, 'ctrl_cost_weight': 0.1649367223868109, 'learning_rate': 8.542296538176427e-05, 'buffer_size': 500000, 'batch_size': 256, 'tau': 0.015821766540131243, 'gamma': 0.9611313608283093, 'ent_coef': 'auto', 'train_freq': 1, 'gradient_steps': 32}. Best is trial 8 with value: 593.879384215323.


Mean reward: 593.879384215323


[I 2025-02-23 13:25:22,079] Trial 9 finished with value: -35.84403375179196 and parameters: {'reset_noise_scale': 0.07699286463458112, 'forward_reward_weight': 0.9896697374587214, 'ctrl_cost_weight': 0.10009659633193074, 'learning_rate': 2.7092100201017118e-05, 'buffer_size': 100000, 'batch_size': 512, 'tau': 0.015023264670683644, 'gamma': 0.952544333965331, 'ent_coef': 0.01, 'train_freq': 256, 'gradient_steps': 32}. Best is trial 8 with value: 593.879384215323.


Mean reward: -35.84403375179196


[I 2025-02-23 13:57:14,323] Trial 10 finished with value: -55.03633944803255 and parameters: {'reset_noise_scale': 0.2948493190393856, 'forward_reward_weight': 1.208721036573466, 'ctrl_cost_weight': 0.20336028478155785, 'learning_rate': 0.00013728970551579026, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.013063114583711182, 'gamma': 0.9752754732479632, 'ent_coef': 'auto', 'train_freq': 1, 'gradient_steps': 16}. Best is trial 8 with value: 593.879384215323.


Mean reward: -55.03633944803255


[I 2025-02-23 13:57:37,743] Trial 11 finished with value: -0.7663095510180293 and parameters: {'reset_noise_scale': 0.1690179808500754, 'forward_reward_weight': 0.7369018391534534, 'ctrl_cost_weight': 0.09363553332660654, 'learning_rate': 0.0009660398880178648, 'buffer_size': 100000, 'batch_size': 512, 'tau': 0.010641214987925054, 'gamma': 0.9973807717079266, 'ent_coef': 'auto', 'train_freq': 512, 'gradient_steps': 16}. Best is trial 8 with value: 593.879384215323.


Mean reward: -0.7663095510180293


[I 2025-02-23 14:21:24,852] Trial 12 finished with value: 658.4202594094461 and parameters: {'reset_noise_scale': 0.1413956096385016, 'forward_reward_weight': 0.7006845125642618, 'ctrl_cost_weight': 0.18926200091947704, 'learning_rate': 0.0001216148125101238, 'buffer_size': 500000, 'batch_size': 512, 'tau': 0.014239027360996071, 'gamma': 0.98650670251583, 'ent_coef': 'auto', 'train_freq': 1, 'gradient_steps': 8}. Best is trial 12 with value: 658.4202594094461.


Mean reward: 658.4202594094461


[I 2025-02-23 15:35:42,829] Trial 13 finished with value: -831.3520673549491 and parameters: {'reset_noise_scale': 0.2128196785384656, 'forward_reward_weight': 1.070042770803264, 'ctrl_cost_weight': 0.18396473573969718, 'learning_rate': 9.911196528414933e-05, 'buffer_size': 500000, 'batch_size': 256, 'tau': 0.015191478253965843, 'gamma': 0.985092455563362, 'ent_coef': 'auto', 'train_freq': 1, 'gradient_steps': 32}. Best is trial 12 with value: 658.4202594094461.


Mean reward: -831.3520673549491


[I 2025-02-23 15:54:00,653] Trial 14 finished with value: 3254.8171763902315 and parameters: {'reset_noise_scale': 0.13635555699602933, 'forward_reward_weight': 0.7151140526343989, 'ctrl_cost_weight': 0.19342622590821706, 'learning_rate': 4.3539588088977104e-05, 'buffer_size': 500000, 'batch_size': 256, 'tau': 0.013929154106819306, 'gamma': 0.9843911115842067, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 3254.8171763902315


[I 2025-02-23 16:17:04,779] Trial 15 finished with value: 621.8411591995924 and parameters: {'reset_noise_scale': 0.1359751812835904, 'forward_reward_weight': 0.735089299888663, 'ctrl_cost_weight': 0.22465549069613316, 'learning_rate': 3.406736051063376e-05, 'buffer_size': 500000, 'batch_size': 512, 'tau': 0.012507114563852503, 'gamma': 0.9867393924280535, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 621.8411591995924


[I 2025-02-23 16:32:32,049] Trial 16 finished with value: 2097.1208441172776 and parameters: {'reset_noise_scale': 0.23152250767641086, 'forward_reward_weight': 0.8309752464170082, 'ctrl_cost_weight': 0.1935880860416585, 'learning_rate': 3.1678880573533826e-05, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.01347655189159277, 'gamma': 0.984239381536316, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 2097.1208441172776


[I 2025-02-23 16:47:54,029] Trial 17 finished with value: 2506.8849015334436 and parameters: {'reset_noise_scale': 0.23817830014769378, 'forward_reward_weight': 0.8633227881390402, 'ctrl_cost_weight': 0.26508387495885344, 'learning_rate': 2.8856651005371135e-05, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.010980392460814748, 'gamma': 0.9803887655688588, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 2506.8849015334436


[I 2025-02-23 17:03:39,412] Trial 18 finished with value: 19.52384478759591 and parameters: {'reset_noise_scale': 0.24832563275603003, 'forward_reward_weight': 0.955619433818362, 'ctrl_cost_weight': 0.2768021205003019, 'learning_rate': 2.210442115834014e-06, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.01079180223553318, 'gamma': 0.9686249189086836, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 19.52384478759591


[I 2025-02-23 17:35:08,367] Trial 19 finished with value: 1532.507067497823 and parameters: {'reset_noise_scale': 0.2847540543073047, 'forward_reward_weight': 0.8021899409453827, 'ctrl_cost_weight': 0.25243560945267324, 'learning_rate': 2.177044176541231e-05, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.009043272744137383, 'gamma': 0.9708196738736248, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 16}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 1532.507067497823


[I 2025-02-23 17:37:02,348] Trial 20 finished with value: 945.338913492052 and parameters: {'reset_noise_scale': 0.1107351719043501, 'forward_reward_weight': 0.8902324971943518, 'ctrl_cost_weight': 0.2954553942800961, 'learning_rate': 5.259718112928761e-05, 'buffer_size': 300000, 'batch_size': 64, 'tau': 0.01124933427019597, 'gamma': 0.9802493637551721, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 1}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 945.338913492052


[I 2025-02-23 17:52:20,648] Trial 21 finished with value: 1508.0404527291564 and parameters: {'reset_noise_scale': 0.24039966303929397, 'forward_reward_weight': 0.811181499601905, 'ctrl_cost_weight': 0.21260619884330306, 'learning_rate': 1.608951450299945e-05, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.013805645838198575, 'gamma': 0.9812132755675439, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 1508.0404527291564


[I 2025-02-23 18:08:02,568] Trial 22 finished with value: 3218.4274166426017 and parameters: {'reset_noise_scale': 0.22018312547379437, 'forward_reward_weight': 0.9871883902601919, 'ctrl_cost_weight': 0.24117961102321478, 'learning_rate': 4.890970150810476e-05, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.011924308495540575, 'gamma': 0.9914991393790443, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 3218.4274166426017


[I 2025-02-23 18:23:40,644] Trial 23 finished with value: 2682.989667459523 and parameters: {'reset_noise_scale': 0.169196947073717, 'forward_reward_weight': 1.0277916449907698, 'ctrl_cost_weight': 0.24714451661754258, 'learning_rate': 5.522215108386716e-05, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.012274620888838465, 'gamma': 0.9915516919961503, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8}. Best is trial 14 with value: 3254.8171763902315.


Mean reward: 2682.989667459523


[W 2025-02-23 18:29:06,380] Trial 24 failed with parameters: {'reset_noise_scale': 0.16731841286058588, 'forward_reward_weight': 1.0594943649394934, 'ctrl_cost_weight': 0.23657468751702668, 'learning_rate': 0.0002779745914996516, 'buffer_size': 300000, 'batch_size': 128, 'tau': 0.011876732693483865, 'gamma': 0.9905317825475765, 'ent_coef': 0.001, 'train_freq': 1, 'gradient_steps': 8} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/fabiodigregorio/Desktop/campus bio iscrizione/ Magistrale/Merone/RL/Reinforcement_Learning_Ant_MuJoCu/venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/d5/8f8lxk3x1gddrb19zv8xl57m0000gn/T/ipykernel_61413/2280089131.py", line 52, in objective
    model.learn(total_timesteps=30_000)
  File "/Users/fabiodigregorio/Desktop/campus bio iscrizione/ Magistrale/Merone/RL/Reinforcement_Learning_Ant_MuJoCu/venv/lib/python3.10/

KeyboardInterrupt: 