In [4]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import numpy as np
import tensorboard
import optuna

In [5]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight, healthy_reward, contact_cost_weight, healthy_z_range, contact_force_range):
    """
    Crea e restituisce l'ambiente Ant-v5 dalla libreria Gymnasium con i parametri specificati.
    """
    # Ant-v5 è l’ambiente più recente in Gymnasium.
    return gym.make("Ant-v5", 
                    reset_noise_scale=reset_noise_scale, 
                    forward_reward_weight=forward_reward_weight, 
                    ctrl_cost_weight=ctrl_cost_weight, 
                    healthy_reward=healthy_reward, 
                    contact_cost_weight = contact_cost_weight,
                    healthy_z_range=healthy_z_range,
                    contact_force_range=contact_force_range)
                   # render_mode='none')

In [6]:
# Hyperparameter tuning con Optuna

def objective(trial):
    # Parametri dell'environment
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.05, 0.2)           # Default circa 0.1; esploriamo da 0.05 a 0.2
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.5, 1.5)     # Default tipico è 1; esploriamo da 0.5 a 1.5
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.1, 1.0)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
    healthy_reward = trial.suggest_float('healthy_reward', 0.5, 1.5)                   # Default tipico 1; esploriamo da 0.5 a 1.5
    
    # Parametri aggiuntivi per Ant-v5
    contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-4, 1e-3)  # Es. range intorno a 5e-4 come default
    healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.1, 0.3)             # Per definire l'intervallo di altezze "sane"
    healthy_z_upper = trial.suggest_float('healthy_z_upper', 0.8, 1.2)
    contact_force_min = trial.suggest_float('contact_force_min', -1.0, -0.5)         # Modificabile se usi forze di contatto
    contact_force_max = trial.suggest_float('contact_force_max', 0.5, 1.0)  

    # Crea l'ambiente passando tutti i parametri
    env = make_env(
        reset_noise_scale,
        forward_reward_weight,
        ctrl_cost_weight,
        healthy_reward,
        contact_cost_weight=contact_cost_weight,
        healthy_z_range=(healthy_z_lower, healthy_z_upper),
        contact_force_range=(contact_force_min, contact_force_max)
    )
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    # Iperparametri per il modello PPO
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
    n_steps = trial.suggest_int('n_steps', 2048, 8192, step=2048)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])  
    # Per ambienti complessi come Ant, molti esperimenti usano gamma intorno a 0.99-0.995
    gamma = trial.suggest_float('gamma', 0.99, 0.999)
    gae_lambda = trial.suggest_float('gae_lambda', 0.8, 1.0)
    clip_range = trial.suggest_float('clip_range', 0.1, 0.3) 
    ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)
    
    # Nuovo iperparametro per la penalizzazione della varianza
    variance_penalty_weight = trial.suggest_float('variance_penalty_weight', 0.0, 0.5)

    # Crea ed allena il modello PPO
    model = PPO("MlpPolicy", env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                batch_size=batch_size,
                gamma=gamma,
                gae_lambda=gae_lambda,
                clip_range=clip_range,
                ent_coef=ent_coef,
                verbose=0)
    model.learn(total_timesteps=200000)

    # Valuta il modello su 200 episodi (200 è ottimale)
    episodes = 150
    episode_rewards = []
    for episode in range(episodes):
        obs = env.reset()
        done = False
        episode_reward = 0
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            episode_reward += reward
        episode_rewards.append(episode_reward)

    # Calcola reward media e varianza
    mean_reward = np.mean(episode_rewards)
    reward_variance = np.var(episode_rewards)

    # Definisce l'obiettivo: massimizzare la reward media penalizzando la varianza
    score = mean_reward - variance_penalty_weight * reward_variance

    print(f'Mean is: {mean_reward}, Variance is: {reward_variance}\n')

    return score

# Crea uno studio Optuna e ottimizza l'obiettivo
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

# Stampa i migliori iperparametri trovati
print("Best hyperparameters: ", study.best_params)

[I 2025-02-11 14:33:10,145] A new study created in memory with name: no-name-62f66b6f-26af-4b89-bedc-d8d78f8f518b


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
[I 2025-02-11 14:34:19,320] Trial 0 finished with value: -0.40184521675109863 and parameters: {'reset_noise_scale': 0.17328697116919345, 'forward_reward_weight': 0.8600760817025472, 'ctrl_cost_weight': 0.6533513452331322, 'healthy_reward': 0.8881430529953126, 'contact_cost_weight': 0.00021706031816801983, 'healthy_z_lower': 0.18487760276555745, 'healthy_z_upper': 0.8736510511245551, 'contact_force_min': -0.8504374331245832, 'contact_force_max': 0.7191725539574108, 'learning_rate': 0.00024366394019125905, 'n_steps': 4096, 'batch_size': 256, 'gamma': 0.9941622716086058, 'gae_lambda': 0.8603002500571459, 'clip_range': 0.2743338408847588, 'ent_coef': 0.021435338193808297, 'variance_penalty_weight': 0.38986364912966875}. Best is trial 0 with value: -0.40184521675109863.


Mean is: -0.3652510941028595, Variance is: 0.09386386722326279



[I 2025-02-11 14:35:35,460] Trial 1 finished with value: -2.3926029205322266 and parameters: {'reset_noise_scale': 0.15057781462445988, 'forward_reward_weight': 0.7623185185030785, 'ctrl_cost_weight': 0.5953641858803702, 'healthy_reward': 1.1364343732050866, 'contact_cost_weight': 0.00010699224876114234, 'healthy_z_lower': 0.2996324727057889, 'healthy_z_upper': 0.8878065627772774, 'contact_force_min': -0.7724886039906093, 'contact_force_max': 0.6613403758444628, 'learning_rate': 1.5661848374396858e-05, 'n_steps': 2048, 'batch_size': 128, 'gamma': 0.9932113330391995, 'gae_lambda': 0.9215302430788892, 'clip_range': 0.2399985875632972, 'ent_coef': 0.041977432377012694, 'variance_penalty_weight': 0.3700584298634591}. Best is trial 0 with value: -0.40184521675109863.


Mean is: -1.5405153036117554, Variance is: 2.3025755882263184



[I 2025-02-11 14:37:04,577] Trial 2 finished with value: -34.74900436401367 and parameters: {'reset_noise_scale': 0.1304567945726224, 'forward_reward_weight': 1.229001052172427, 'ctrl_cost_weight': 0.8701336231865623, 'healthy_reward': 1.1800289957042578, 'contact_cost_weight': 0.00032588775735957714, 'healthy_z_lower': 0.11415467213996665, 'healthy_z_upper': 1.1591309556758596, 'contact_force_min': -0.8587445528426116, 'contact_force_max': 0.6836297017303263, 'learning_rate': 0.0009639437906473453, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9950188086336075, 'gae_lambda': 0.8953745528555077, 'clip_range': 0.2421411547900281, 'ent_coef': 0.09681104942573365, 'variance_penalty_weight': 0.4575936183756125}. Best is trial 0 with value: -0.40184521675109863.


Mean is: -4.033448219299316, Variance is: 67.12409210205078



[I 2025-02-11 14:38:24,728] Trial 3 finished with value: -72.65753936767578 and parameters: {'reset_noise_scale': 0.1904233029576568, 'forward_reward_weight': 0.563087052861689, 'ctrl_cost_weight': 0.6582240083326354, 'healthy_reward': 0.6363738573248237, 'contact_cost_weight': 0.0003672120173078603, 'healthy_z_lower': 0.21790712058963174, 'healthy_z_upper': 1.1023284308346457, 'contact_force_min': -0.6901925827206246, 'contact_force_max': 0.7662047764620082, 'learning_rate': 1.0963424324436151e-05, 'n_steps': 8192, 'batch_size': 256, 'gamma': 0.9926768833590529, 'gae_lambda': 0.9675394164482849, 'clip_range': 0.15891952184038521, 'ent_coef': 0.07602690122245867, 'variance_penalty_weight': 0.44004350361634237}. Best is trial 0 with value: -0.40184521675109863.


Mean is: -7.874885082244873, Variance is: 147.21875



[I 2025-02-11 14:39:43,311] Trial 4 finished with value: -14.495904922485352 and parameters: {'reset_noise_scale': 0.17651228011678444, 'forward_reward_weight': 0.8492747473373999, 'ctrl_cost_weight': 0.7769213131954907, 'healthy_reward': 1.4787183557427297, 'contact_cost_weight': 0.000754775624732145, 'healthy_z_lower': 0.19691142003048873, 'healthy_z_upper': 1.1436714574561322, 'contact_force_min': -0.8400298834889037, 'contact_force_max': 0.5088081588994127, 'learning_rate': 0.00020344883713459695, 'n_steps': 2048, 'batch_size': 128, 'gamma': 0.9912114414926481, 'gae_lambda': 0.8981754913395311, 'clip_range': 0.14021708149057546, 'ent_coef': 0.09257386493097884, 'variance_penalty_weight': 0.40898988481233256}. Best is trial 0 with value: -0.40184521675109863.


Mean is: -2.186119318008423, Variance is: 30.098020553588867



[I 2025-02-11 14:41:12,425] Trial 5 finished with value: 0.7956998348236084 and parameters: {'reset_noise_scale': 0.0593261150539349, 'forward_reward_weight': 1.1733826699836758, 'ctrl_cost_weight': 0.10703358547254146, 'healthy_reward': 0.6694904136113967, 'contact_cost_weight': 0.0005387885057306396, 'healthy_z_lower': 0.16382329499218154, 'healthy_z_upper': 0.92422095006662, 'contact_force_min': -0.5397568710239219, 'contact_force_max': 0.9026151334641952, 'learning_rate': 0.0008352800532578966, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9956046256860938, 'gae_lambda': 0.813547535524508, 'clip_range': 0.21744779016571314, 'ent_coef': 0.004847346816009169, 'variance_penalty_weight': 0.2341583680120119}. Best is trial 5 with value: 0.7956998348236084.


Mean is: 3.594954490661621, Variance is: 11.954536437988281



[I 2025-02-11 14:42:24,216] Trial 6 finished with value: -3.961867332458496 and parameters: {'reset_noise_scale': 0.15195704785399983, 'forward_reward_weight': 0.6531678654078539, 'ctrl_cost_weight': 0.8607916977783684, 'healthy_reward': 0.8926341629783984, 'contact_cost_weight': 0.00014571485564291445, 'healthy_z_lower': 0.1866005865301333, 'healthy_z_upper': 0.9138424111463235, 'contact_force_min': -0.6394743795364903, 'contact_force_max': 0.9467747702422606, 'learning_rate': 2.2482327142840208e-05, 'n_steps': 6144, 'batch_size': 256, 'gamma': 0.9925948834363578, 'gae_lambda': 0.8796872374594148, 'clip_range': 0.24205477173278367, 'ent_coef': 0.0291362108035684, 'variance_penalty_weight': 0.4896564766380902}. Best is trial 5 with value: 0.7956998348236084.


Mean is: -1.1278890371322632, Variance is: 5.787686824798584



[I 2025-02-11 14:43:32,619] Trial 7 finished with value: -0.924700915813446 and parameters: {'reset_noise_scale': 0.14664603079416472, 'forward_reward_weight': 1.0820317850588936, 'ctrl_cost_weight': 0.6495693414447441, 'healthy_reward': 0.5471493068863464, 'contact_cost_weight': 0.00017001557875945662, 'healthy_z_lower': 0.25265217658579886, 'healthy_z_upper': 0.9294607969934955, 'contact_force_min': -0.61842719997662, 'contact_force_max': 0.9783103678051797, 'learning_rate': 0.00011697423759591318, 'n_steps': 2048, 'batch_size': 256, 'gamma': 0.9921883619955038, 'gae_lambda': 0.8983069902414027, 'clip_range': 0.15828114267699162, 'ent_coef': 0.0015845737751152722, 'variance_penalty_weight': 0.48449030816042393}. Best is trial 5 with value: 0.7956998348236084.


Mean is: -0.7986928224563599, Variance is: 0.2600838243961334



[I 2025-02-11 14:44:48,769] Trial 8 finished with value: -0.7374086380004883 and parameters: {'reset_noise_scale': 0.1931761683594473, 'forward_reward_weight': 1.2184383999491226, 'ctrl_cost_weight': 0.8671201482205122, 'healthy_reward': 1.4144891888937592, 'contact_cost_weight': 0.0004215776835203815, 'healthy_z_lower': 0.13286468269419058, 'healthy_z_upper': 0.8704359028854282, 'contact_force_min': -0.8822097391804096, 'contact_force_max': 0.6431222393744704, 'learning_rate': 1.0609209447243814e-05, 'n_steps': 8192, 'batch_size': 128, 'gamma': 0.9943812794435435, 'gae_lambda': 0.8230000807170763, 'clip_range': 0.23627746455574175, 'ent_coef': 0.004212342749768417, 'variance_penalty_weight': 0.12540786773288598}. Best is trial 5 with value: 0.7956998348236084.


Mean is: -0.6795865297317505, Variance is: 0.46107217669487



[I 2025-02-11 14:46:00,212] Trial 9 finished with value: -1.2454981803894043 and parameters: {'reset_noise_scale': 0.14261902143101204, 'forward_reward_weight': 0.6455921081973452, 'ctrl_cost_weight': 0.4563750982337444, 'healthy_reward': 1.4441654264312715, 'contact_cost_weight': 0.0008415914624248005, 'healthy_z_lower': 0.2725394126358904, 'healthy_z_upper': 0.9689162530936697, 'contact_force_min': -0.9237266281477159, 'contact_force_max': 0.91781938464034, 'learning_rate': 8.929460824582704e-05, 'n_steps': 6144, 'batch_size': 256, 'gamma': 0.9987259119040848, 'gae_lambda': 0.961376848957147, 'clip_range': 0.16388150702708398, 'ent_coef': 0.018611235771152236, 'variance_penalty_weight': 0.2410539347590191}. Best is trial 5 with value: 0.7956998348236084.


Mean is: -0.9003593921661377, Variance is: 1.4317905902862549



[I 2025-02-11 14:47:26,948] Trial 10 finished with value: 0.32437756657600403 and parameters: {'reset_noise_scale': 0.05034661479400332, 'forward_reward_weight': 1.4065591704406546, 'ctrl_cost_weight': 0.11226342837835551, 'healthy_reward': 0.7152930529149085, 'contact_cost_weight': 0.000603485157203732, 'healthy_z_lower': 0.15136460531769183, 'healthy_z_upper': 1.037211941475428, 'contact_force_min': -0.5409541810124172, 'contact_force_max': 0.8407606025600058, 'learning_rate': 0.0007780599430320511, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9966859462484027, 'gae_lambda': 0.8051621829466491, 'clip_range': 0.10951796021675454, 'ent_coef': 0.0577232290673, 'variance_penalty_weight': 0.01847985822233922}. Best is trial 5 with value: 0.7956998348236084.


Mean is: 0.37593552470207214, Variance is: 2.7899537086486816



[I 2025-02-11 14:48:51,958] Trial 11 finished with value: -0.019970256835222244 and parameters: {'reset_noise_scale': 0.05581940654100805, 'forward_reward_weight': 1.4970463804352077, 'ctrl_cost_weight': 0.11560140259463789, 'healthy_reward': 0.682089956012442, 'contact_cost_weight': 0.0006081209590778885, 'healthy_z_lower': 0.14740184561046335, 'healthy_z_upper': 1.0421905253396808, 'contact_force_min': -0.5004604189578925, 'contact_force_max': 0.8433855915053747, 'learning_rate': 0.0009404686124780197, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9966230720579664, 'gae_lambda': 0.8028825814559507, 'clip_range': 0.10556682483510971, 'ent_coef': 0.06300406907323851, 'variance_penalty_weight': 0.0015548199851837252}. Best is trial 5 with value: 0.7956998348236084.


Mean is: -0.012539888732135296, Variance is: 4.778924942016602



[I 2025-02-11 14:50:17,801] Trial 12 finished with value: 0.35466718673706055 and parameters: {'reset_noise_scale': 0.052514717404541764, 'forward_reward_weight': 1.4874922210708887, 'ctrl_cost_weight': 0.10950612695045299, 'healthy_reward': 0.7772623627263485, 'contact_cost_weight': 0.0005886614731472145, 'healthy_z_lower': 0.15611625013556837, 'healthy_z_upper': 1.0336212730185927, 'contact_force_min': -0.5188219073057422, 'contact_force_max': 0.8310442308621917, 'learning_rate': 0.00045635539262302137, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9968048972662475, 'gae_lambda': 0.8470909564459465, 'clip_range': 0.2087443684942592, 'ent_coef': 0.055395735650562765, 'variance_penalty_weight': 0.007263987090447344}. Best is trial 5 with value: 0.7956998348236084.


Mean is: 0.3818242847919464, Variance is: 3.738593339920044



[I 2025-02-11 14:51:44,343] Trial 13 finished with value: -2.8803598880767822 and parameters: {'reset_noise_scale': 0.08348602928152087, 'forward_reward_weight': 1.34833232213463, 'ctrl_cost_weight': 0.2961587484962852, 'healthy_reward': 0.8245983846806295, 'contact_cost_weight': 0.0009873867806992447, 'healthy_z_lower': 0.1669254365031691, 'healthy_z_upper': 0.8151602115817592, 'contact_force_min': -0.5701840610632738, 'contact_force_max': 0.859937016189685, 'learning_rate': 0.00036589699981441, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9964549173808976, 'gae_lambda': 0.845840384697651, 'clip_range': 0.19717118851805138, 'ent_coef': 0.04362911131113855, 'variance_penalty_weight': 0.24144730597162173}. Best is trial 5 with value: 0.7956998348236084.


Mean is: -0.28820210695266724, Variance is: 10.735915184020996



[I 2025-02-11 14:53:10,665] Trial 14 finished with value: -2.359555721282959 and parameters: {'reset_noise_scale': 0.09211592686797554, 'forward_reward_weight': 1.1038949560115847, 'ctrl_cost_weight': 0.29231681368235013, 'healthy_reward': 1.0526105268110622, 'contact_cost_weight': 0.0005075844698602804, 'healthy_z_lower': 0.10588663663371299, 'healthy_z_upper': 1.004950338679169, 'contact_force_min': -0.7046212439205792, 'contact_force_max': 0.7860034751282772, 'learning_rate': 5.0060320604016884e-05, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9984747555845591, 'gae_lambda': 0.8419580527663166, 'clip_range': 0.20480041559940193, 'ent_coef': 0.07625395348031483, 'variance_penalty_weight': 0.14255326547809696}. Best is trial 5 with value: 0.7956998348236084.


Mean is: -0.7026627659797668, Variance is: 11.622973442077637



[W 2025-02-11 14:54:01,567] Trial 15 failed with parameters: {'reset_noise_scale': 0.08141438982798507, 'forward_reward_weight': 1.2672251093423883, 'ctrl_cost_weight': 0.2904860914684766, 'healthy_reward': 0.5290929428345446, 'contact_cost_weight': 0.0007063858031248777, 'healthy_z_lower': 0.22485444184857956, 'healthy_z_upper': 1.0659064133405933, 'contact_force_min': -0.5933017073539808, 'contact_force_max': 0.9004982797402342, 'learning_rate': 0.0004601703506883517, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9958292438961996, 'gae_lambda': 0.9995679087193103, 'clip_range': 0.203176645089613, 'ent_coef': 0.035897374727136125, 'variance_penalty_weight': 0.1021124680051293} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/ignazioemanuelepicciche/Documents/Ignazio PC/ucbm/deep_learning/Reinforcement_Learning_MuJoCu/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(tr

KeyboardInterrupt: 