In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight):
    """
    Crea e restituisce l'ambiente HalfCheetah-v5 dalla libreria Gymnasium con i parametri specificati.
    """
    return gym.make("HalfCheetah-v5", 
                    reset_noise_scale=reset_noise_scale, 
                    forward_reward_weight=forward_reward_weight, 
                    ctrl_cost_weight=ctrl_cost_weight)

# Hyperparameter tuning con Optuna
def objective(trial):
    # Parametri dell'environment
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.05, 0.2)   # Default ~0.1, esploriamo tra 0.05 e 0.2
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.8, 1.2)  # Default tipico è 1, esploriamo tra 0.8 e 1.2
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.05, 0.5)  # Default ~0.1, esploriamo tra 0.05 e 0.5
    
    # Crea l'ambiente passando i parametri
    env = make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight)
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    # Iperparametri per il modello PPO
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
    n_steps = trial.suggest_int('n_steps', 1024, 8192, step=1024)  # HalfCheetah può trarre beneficio da aggiornamenti più frequenti
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])  # Ridotto rispetto ad Ant per migliorare stabilità
    gamma = trial.suggest_float('gamma', 0.98, 0.999)  # Gamma alta per ambienti continui
    gae_lambda = trial.suggest_float('gae_lambda', 0.8, 1.0)
    clip_range = trial.suggest_float('clip_range', 0.1, 0.3)
    ent_coef = trial.suggest_float('ent_coef', 0.001, 0.05)  # Meno esplorazione rispetto ad Ant

    # Crea ed allena il modello PPO
    model = PPO("MlpPolicy", env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                batch_size=batch_size,
                gamma=gamma,
                gae_lambda=gae_lambda,
                clip_range=clip_range,
                ent_coef=ent_coef,
                verbose=0)
    model.learn(total_timesteps=150000)
    
    # Disabilita aggiornamenti per valutazione
    env.training = False
    env.norm_reward = False  

    # Valuta il modello su 100 episodi
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=100, deterministic=True)

    print(f'Mean reward: {mean_reward}')
    return mean_reward

# Crea uno studio Optuna e ottimizza l'obiettivo
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Stampa i migliori iperparametri trovati
print("Best hyperparameters: ", study.best_params)


[I 2025-02-13 21:43:09,671] A new study created in memory with name: no-name-57631e5d-2102-4223-81d4-cf16a95d4c50
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
[I 2025-02-13 21:44:15,362] Trial 0 finished with value: -238.5711449528653 and parameters: {'reset_noise_scale': 0.15008888328214232, 'forward_reward_weight': 0.8258625261899306, 'ctrl_cost_weight': 0.4537666000320346, 'learning_rate': 4.3893606722627706e-05, 'n_steps': 3072, 'batch_size': 64, 'gamma': 0.9900866464669087, 'gae_lambda': 0.9306819224985757, 'clip_range': 0.22899001125785723, 'ent_coef': 0.027518825151780415}. Best is trial 0 with value: -238.5711449528653.


Mean reward: -238.5711449528653


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
[I 2025-02-13 21:45:20,610] Trial 1 finished with value: -86.29464313407327 and parameters: {'reset_noise_scale': 0.16525586112514812, 'forward_reward_weight': 0.8663739366714585, 'ctrl_cost_weight': 0.20795908742728902, 'learning_rate': 2.7422762854827535e-05, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9903076998354103, 'gae_lambda': 0.8727467189953294, 'clip_range': 0.1740375903376737, 'ent_coef': 0.010824104279006284}. Best is trial 1 with value: -86.29464313407327.


Mean reward: -86.29464313407327


[I 2025-02-13 21:46:14,805] Trial 2 finished with value: -662.2001294437225 and parameters: {'reset_noise_scale': 0.11890046769969727, 'forward_reward_weight': 0.8284846831603818, 'ctrl_cost_weight': 0.3335688080597603, 'learning_rate': 2.5750226053273152e-05, 'n_steps': 2048, 'batch_size': 128, 'gamma': 0.9868836973780677, 'gae_lambda': 0.8757250612809824, 'clip_range': 0.24615551182071763, 'ent_coef': 0.012076358199925922}. Best is trial 1 with value: -86.29464313407327.


Mean reward: -662.2001294437225


[I 2025-02-13 21:47:09,811] Trial 3 finished with value: -111.91754679138785 and parameters: {'reset_noise_scale': 0.15334184301300285, 'forward_reward_weight': 1.040317839021006, 'ctrl_cost_weight': 0.32974327138683973, 'learning_rate': 0.00025978818530314983, 'n_steps': 6144, 'batch_size': 128, 'gamma': 0.9852306145002432, 'gae_lambda': 0.838549387693937, 'clip_range': 0.10250888228852795, 'ent_coef': 0.0019189087848801994}. Best is trial 1 with value: -86.29464313407327.


Mean reward: -111.91754679138785


[I 2025-02-13 21:48:14,775] Trial 4 finished with value: -78.75649071303053 and parameters: {'reset_noise_scale': 0.08653722695866556, 'forward_reward_weight': 0.8920716199839906, 'ctrl_cost_weight': 0.26908277567216443, 'learning_rate': 1.9687151357009025e-05, 'n_steps': 3072, 'batch_size': 64, 'gamma': 0.9937547987274471, 'gae_lambda': 0.915897144368942, 'clip_range': 0.24316001289852698, 'ent_coef': 0.01951471674092211}. Best is trial 4 with value: -78.75649071303053.


Mean reward: -78.75649071303053


[I 2025-02-13 21:49:04,080] Trial 5 finished with value: -169.16738089177528 and parameters: {'reset_noise_scale': 0.10571278924426668, 'forward_reward_weight': 0.9142489684687123, 'ctrl_cost_weight': 0.07276775666686734, 'learning_rate': 1.371466204492948e-05, 'n_steps': 2048, 'batch_size': 256, 'gamma': 0.9881310122016278, 'gae_lambda': 0.8201781235858502, 'clip_range': 0.2519446340564947, 'ent_coef': 0.01624812226011962}. Best is trial 4 with value: -78.75649071303053.


Mean reward: -169.16738089177528


[I 2025-02-13 21:50:08,528] Trial 6 finished with value: 20.01807075359393 and parameters: {'reset_noise_scale': 0.18041236397991078, 'forward_reward_weight': 0.807691688570518, 'ctrl_cost_weight': 0.3697452266667385, 'learning_rate': 6.114299089287565e-05, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9819253191303018, 'gae_lambda': 0.8283444090672999, 'clip_range': 0.1479258108510973, 'ent_coef': 0.025812234644515753}. Best is trial 6 with value: 20.01807075359393.


Mean reward: 20.01807075359393


[I 2025-02-13 21:51:12,977] Trial 7 finished with value: -1358.754789697227 and parameters: {'reset_noise_scale': 0.0719410443033492, 'forward_reward_weight': 0.8079894174326131, 'ctrl_cost_weight': 0.47961956759514446, 'learning_rate': 1.582554022730496e-05, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9895868337827359, 'gae_lambda': 0.8258089470360688, 'clip_range': 0.2961352072414478, 'ent_coef': 0.043318253089964606}. Best is trial 6 with value: 20.01807075359393.


Mean reward: -1358.754789697227


[I 2025-02-13 21:52:06,519] Trial 8 finished with value: -209.80434751994727 and parameters: {'reset_noise_scale': 0.06532300954974771, 'forward_reward_weight': 1.0185383002360135, 'ctrl_cost_weight': 0.3745811864525963, 'learning_rate': 4.4232454029389964e-05, 'n_steps': 7168, 'batch_size': 128, 'gamma': 0.9818053994683825, 'gae_lambda': 0.8642584647986157, 'clip_range': 0.28182326154082626, 'ent_coef': 0.035544685390010204}. Best is trial 6 with value: 20.01807075359393.


Mean reward: -209.80434751994727


[I 2025-02-13 21:53:00,625] Trial 9 finished with value: -16.378595204359737 and parameters: {'reset_noise_scale': 0.05298529289744054, 'forward_reward_weight': 1.1556897279629685, 'ctrl_cost_weight': 0.12463802318457128, 'learning_rate': 7.329867619200163e-05, 'n_steps': 6144, 'batch_size': 128, 'gamma': 0.995571871615965, 'gae_lambda': 0.9764894273832903, 'clip_range': 0.20094310740916785, 'ent_coef': 0.01565084542676163}. Best is trial 6 with value: 20.01807075359393.


Mean reward: -16.378595204359737


[I 2025-02-13 21:53:50,816] Trial 10 finished with value: -1433.2054770581944 and parameters: {'reset_noise_scale': 0.1983162366167047, 'forward_reward_weight': 0.9490660859962773, 'ctrl_cost_weight': 0.40919292981797084, 'learning_rate': 0.0009203907025176634, 'n_steps': 1024, 'batch_size': 256, 'gamma': 0.9806812645614821, 'gae_lambda': 0.991883061084188, 'clip_range': 0.13246825198255635, 'ent_coef': 0.04963165089553444}. Best is trial 6 with value: 20.01807075359393.


Mean reward: -1433.2054770581944


[W 2025-02-13 21:54:20,002] Trial 11 failed with parameters: {'reset_noise_scale': 0.1984200497901896, 'forward_reward_weight': 1.1778452833441748, 'ctrl_cost_weight': 0.1205606585575293, 'learning_rate': 0.0001295022723954524, 'n_steps': 6144, 'batch_size': 128, 'gamma': 0.99737077340007, 'gae_lambda': 0.9683809701748427, 'clip_range': 0.17967920847825247, 'ent_coef': 0.02766888647120728} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/fabiodigregorio/Desktop/campus bio iscrizione/ Magistrale/Merone/RL/Reinforcement_Learning_Ant_MuJoCu/venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/d5/8f8lxk3x1gddrb19zv8xl57m0000gn/T/ipykernel_17128/2591232108.py", line 41, in objective
    model.learn(total_timesteps=150000)
  File "/Users/fabiodigregorio/Desktop/campus bio iscrizione/ Magistrale/Merone/RL/Reinforcement_Learning_Ant_MuJoCu/venv/lib/pytho

KeyboardInterrupt: 