In [5]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, SubprocVecEnv
import numpy as np
import tensorboard
import optuna
import gc

# Install tqdm if not already installed
from tqdm.autonotebook import tqdm as notebook_tqdm


from stable_baselines3.common.evaluation import evaluate_policy

In [6]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight, healthy_reward, contact_cost_weight, healthy_z_range, contact_force_range):
    """
    Crea e restituisce l'ambiente Ant-v5 dalla libreria Gymnasium con i parametri specificati.
    """
    # Ant-v5 è l’ambiente più recente in Gymnasium.
    return gym.make("Ant-v5", 
                    reset_noise_scale=reset_noise_scale, 
                    forward_reward_weight=forward_reward_weight, 
                    ctrl_cost_weight=ctrl_cost_weight, 
                    healthy_reward=healthy_reward, 
                    contact_cost_weight = contact_cost_weight,
                    healthy_z_range=healthy_z_range,
                    contact_force_range=contact_force_range)
                   # render_mode='none')

In [7]:
# V1 max 30

# reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.05, 0.2)           # Default circa 0.1; esploriamo da 0.05 a 0.2
    # forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.5, 1.5)     # Default tipico è 1; esploriamo da 0.5 a 1.5
    # ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.1, 1.0)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
    # healthy_reward = trial.suggest_float('healthy_reward', 0.5, 1.5)                   # Default tipico 1; esploriamo da 0.5 a 1.5
    
    # # Parametri aggiuntivi per Ant-v5
    # contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-4, 1e-3)  # Es. range intorno a 5e-4 come default
    # healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.1, 0.3)             # Per definire l'intervallo di altezze "sane"
    # healthy_z_upper = trial.suggest_float('healthy_z_upper', 0.8, 1.2)
    # contact_force_min = trial.suggest_float('contact_force_min', -1.0, -0.5)         # Modificabile se usi forze di contatto
    # contact_force_max = trial.suggest_float('contact_force_max', 0.5, 1.0)


    # learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
    # n_steps = trial.suggest_int('n_steps', 2048, 8192, step=2048)
    # batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])  
    # # Per ambienti complessi come Ant, molti esperimenti usano gamma intorno a 0.99-0.995
    # gamma = trial.suggest_float('gamma', 0.99, 0.999)
    # gae_lambda = trial.suggest_float('gae_lambda', 0.8, 1.0)
    # clip_range = trial.suggest_float('clip_range', 0.1, 0.3) 
    # ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)



# V2 max 1600
 # Parametri dell'environment
# reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.05, 0.2)           # Default circa 0.1; esploriamo da 0.05 a 0.2
# forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.5, 1.5)     # Default tipico è 1; esploriamo da 0.5 a 1.5
# ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.1, 1.0)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
# healthy_reward = trial.suggest_float('healthy_reward', 0.5, 1.5)                   # Default tipico 1; esploriamo da 0.5 a 1.5

# # Parametri aggiuntivi per Ant-v5
# contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-4, 1e-3)  # Es. range intorno a 5e-4 come default
# healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.1, 0.3)             # Per definire l'intervallo di altezze "sane"
# healthy_z_upper = trial.suggest_float('healthy_z_upper', 0.8, 1.2)
# contact_force_min = trial.suggest_float('contact_force_min', -1.0, -0.5)         # Modificabile se usi forze di contatto
# contact_force_max = trial.suggest_float('contact_force_max', 0.5, 1.0)

# # Iperparametri per il modello PPO
# learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
# n_steps = trial.suggest_int('n_steps', 2048, 8192, step=2048)
# batch_size = trial.suggest_categorical('batch_size', [512, 1024, 2048, 4096])  
# # Per ambienti complessi come Ant, molti esperimenti usano gamma intorno a 0.99-0.995
# gamma = trial.suggest_float('gamma', 0.965, 0.98)
# gae_lambda = trial.suggest_float('gae_lambda', 0.9, 1.0)
# clip_range = trial.suggest_float('clip_range', 0.3, 0.5) 
# ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)



# V3 max 2130
 # Parametri dell'environment
# reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.05, 0.2)           # Default circa 0.1; esploriamo da 0.05 a 0.2
# forward_reward_weight = trial.suggest_float('forward_reward_weight', 1, 1.6)     # Default tipico è 1; esploriamo da 0.5 a 1.5
# ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.5, 1.2)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
# healthy_reward = trial.suggest_float('healthy_reward', 1.4, 1.9)                   # Default tipico 1; esploriamo da 0.5 a 1.5

# # Parametri aggiuntivi per Ant-v5
# contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-4, 1e-3)  # Es. range intorno a 5e-4 come default
# healthy_z_lower = trial.suggest_float('healthy_z_lower', 0, 0.2)             # Per definire l'intervallo di altezze "sane"
# healthy_z_upper = trial.suggest_float('healthy_z_upper', 0.9, 1.1)
# contact_force_min = trial.suggest_float('contact_force_min', -1.0, -0.5)         # Modificabile se usi forze di contatto
# contact_force_max = trial.suggest_float('contact_force_max', 0.5, 1.0)


# # Iperparametri per il modello PPO
# learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
# n_steps = trial.suggest_int('n_steps', 2048, 8192, step=2048)
# batch_size = trial.suggest_categorical('batch_size', [512, 1024, 2048, 4096])  
# # Per ambienti complessi come Ant, molti esperimenti usano gamma intorno a 0.99-0.995
# gamma = trial.suggest_float('gamma', 0.96, 0.98)
# gae_lambda = trial.suggest_float('gae_lambda', 0.88, 0.99)
# clip_range = trial.suggest_float('clip_range', 0.1, 0.3) 
# ent_coef = trial.suggest_float('ent_coef', 0.0, 0.2)


# V4 max 2681 (BEST)
# # Parametri dell'environment
# reset_noise_scale = trial.suggest_float('reset_noise_scale', 0, 0.8)           # Default circa 0.1; esploriamo da 0.05 a 0.2
# forward_reward_weight = trial.suggest_float('forward_reward_weight', 1.4, 1.8)     # Default tipico è 1; esploriamo da 0.5 a 1.5
# ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 1.1, 1.5)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
# healthy_reward = trial.suggest_float('healthy_reward', 2, 2.4)                   # Default tipico 1; esploriamo da 0.5 a 1.5

# # Parametri aggiuntivi per Ant-v5
# contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-5, 1e-4)  # Es. range intorno a 5e-4 come default
# healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.25, 0.5)             # Per definire l'intervallo di altezze "sane"
# healthy_z_upper = trial.suggest_float('healthy_z_upper', 1, 1.3)
# contact_force_min = trial.suggest_float('contact_force_min', -1.2, -0.9)         # Modificabile se usi forze di contatto
# contact_force_max = trial.suggest_float('contact_force_max', 0.9, 1.2)


# # Iperparametri per il modello PPO
# learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
# n_steps = trial.suggest_int('n_steps', 4096, 12288, step=2048)
# batch_size = trial.suggest_categorical('batch_size', [256, 512, 1024, 2048])  
# # Per ambienti complessi come Ant, molti esperimenti usano gamma intorno a 0.99-0.995
# gamma = trial.suggest_float('gamma', 0.93, 0.96)
# gae_lambda = trial.suggest_float('gae_lambda', 0.95, 0.98)
# clip_range = trial.suggest_float('clip_range', 0, 0.2) 
# ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)


# V4
# # Parametri dell'environment
# reset_noise_scale = trial.suggest_float('reset_noise_scale', 0, 0.3)           # Default circa 0.1; esploriamo da 0.05 a 0.2
# forward_reward_weight = trial.suggest_float('forward_reward_weight', 1.6, 1.9)     # Default tipico è 1; esploriamo da 0.5 a 1.5
# ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 1.2, 1.6)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
# healthy_reward = trial.suggest_float('healthy_reward', 2.1, 2.5)                   # Default tipico 1; esploriamo da 0.5 a 1.5

# # Parametri aggiuntivi per Ant-v5
# contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-6, 1e-4)  # Es. range intorno a 5e-4 come default
# healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.1, 0.4)             # Per definire l'intervallo di altezze "sane"
# healthy_z_upper = trial.suggest_float('healthy_z_upper', 1.1, 1.4)
# contact_force_min = trial.suggest_float('contact_force_min', -1.3, -1)         # Modificabile se usi forze di contatto
# contact_force_max = trial.suggest_float('contact_force_max', 0.8, 1.1)

# # Iperparametri per il modello PPO
# learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
# n_steps = trial.suggest_int('n_steps', 4096, 12288, step=2048)
# batch_size = trial.suggest_categorical('batch_size', [256, 512, 1024, 2048])  
# # Per ambienti complessi come Ant, molti esperimenti usano gamma intorno a 0.99-0.995
# gamma = trial.suggest_float('gamma', 0.93, 0.96)
# gae_lambda = trial.suggest_float('gae_lambda', 0.95, 0.98)
# clip_range = trial.suggest_float('clip_range', 0, 0.2) 
# ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)

In [8]:
# Hyperparameter tuning con Optuna

def objective(trial):
    # Parametri dell'environment
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0, 0.8)           # Default circa 0.1; esploriamo da 0.05 a 0.2
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 1.4, 1.8)     # Default tipico è 1; esploriamo da 0.5 a 1.5
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 1.1, 1.5)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
    healthy_reward = trial.suggest_float('healthy_reward', 2, 2.4)                   # Default tipico 1; esploriamo da 0.5 a 1.5

    # Parametri aggiuntivi per Ant-v5
    contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-5, 1e-4)  # Es. range intorno a 5e-4 come default
    healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.25, 0.5)             # Per definire l'intervallo di altezze "sane"
    healthy_z_upper = trial.suggest_float('healthy_z_upper', 1, 1.3)
    contact_force_min = trial.suggest_float('contact_force_min', -1.2, -0.9)         # Modificabile se usi forze di contatto
    contact_force_max = trial.suggest_float('contact_force_max', 0.9, 1.2)

    # Crea l'ambiente passando tutti i parametri
    # env = make_env(
    #     reset_noise_scale,
    #     forward_reward_weight,
    #     ctrl_cost_weight,
    #     healthy_reward,
    #     contact_cost_weight=contact_cost_weight,
    #     healthy_z_range=(healthy_z_lower, healthy_z_upper),
    #     contact_force_range=(contact_force_min, contact_force_max)
    # )
    #env = DummyVecEnv([lambda: env])

    # MULTIPROCESSING (MULTIENVIRONMENTS) 
    NUM_ENVS=4
    env = SubprocVecEnv([
        lambda: make_env(
            reset_noise_scale,
            forward_reward_weight,
            ctrl_cost_weight,
            healthy_reward,
            contact_cost_weight=contact_cost_weight,
            healthy_z_range=(healthy_z_lower, healthy_z_upper),
            contact_force_range=(contact_force_min, contact_force_max)
        ) for _ in range(NUM_ENVS)
    ])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    

    env.training = False # Setta l'environment in modalità di valutazione
    env.norm_reward = False # Disabilita la normalizzazione della reward. Questo è importante per valutare correttamente il modello.
    

    # Iperparametri per il modello PPO
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-5, log=True)
    n_steps = trial.suggest_int('n_steps', 4096, 12288, step=2048)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 128, 256 ])  
    # Per ambienti complessi come Ant, molti esperimenti usano gamma intorno a 0.99-0.995
    gamma = trial.suggest_float('gamma', 0.93, 0.96)
    gae_lambda = trial.suggest_float('gae_lambda', 0.95, 0.98)
    clip_range = trial.suggest_float('clip_range', 0, 0.2) 
    ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)
    
    # Nuovo iperparametro per la penalizzazione della varianza
    # std_penalty_weight = trial.suggest_float('std_penalty_weight', 0.0, 0.5)



    # Crea ed allena il modello PPO
    model = PPO("MlpPolicy", env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                batch_size=batch_size,
                gamma=gamma,
                gae_lambda=gae_lambda,
                clip_range=clip_range,
                ent_coef=ent_coef,
                seed=42,
                verbose=0)
    model.learn(total_timesteps=200000)

    # Valuta il modello su 200 episodi (200 è ottimale)
    episodes = 200

    # episode_rewards = []
    # for episode in range(episodes):
    #     obs = env.reset()
    #     done = False
    #     episode_reward = 0
    #     while not done:
    #         action, _states = model.predict(obs)
    #         obs, reward, done, info = env.step(action)
    #         episode_reward += reward
    #     episode_rewards.append(episode_reward)

    # # Calcola reward media e varianza
    # mean_reward = np.mean(episode_rewards)
    # reward_std = np.std(episode_rewards)

    # # Definisce l'obiettivo: massimizzare la reward media penalizzando la varianza
    # score = mean_reward - std_penalty_weight * reward_std

    # print(f'Mean is: {mean_reward}, Std is: {reward_std}\n')



    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=episodes)

    # Chiudi e rilascia le risorse
    env.close()
    del model, env
    gc.collect()

    return mean_reward

# Crea uno studio Optuna e ottimizza l'obiettivo
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

# Stampa i migliori iperparametri trovati
print("Best hyperparameters: ", study.best_params)

[I 2025-02-20 19:14:25,331] A new study created in memory with name: no-name-1b93dcd5-99d3-4797-9441-e605c47522f2
[I 2025-02-20 19:16:01,225] Trial 0 finished with value: 2140.2242607528237 and parameters: {'reset_noise_scale': 0.16799371315604486, 'forward_reward_weight': 1.751842416738293, 'ctrl_cost_weight': 1.2259718573283096, 'healthy_reward': 2.127319115760623, 'contact_cost_weight': 4.224653743255615e-05, 'healthy_z_lower': 0.28082572647138015, 'healthy_z_upper': 1.0521980251917054, 'contact_force_min': -0.9990275797378833, 'contact_force_max': 1.100349317954894, 'learning_rate': 1.547836525186356e-06, 'n_steps': 6144, 'batch_size': 32, 'gamma': 0.9579964965076564, 'gae_lambda': 0.9715001079534059, 'clip_range': 0.1376091429260501, 'ent_coef': 0.004799121542302898}. Best is trial 0 with value: 2140.2242607528237.
[I 2025-02-20 19:16:47,373] Trial 1 finished with value: 673.8053440191014 and parameters: {'reset_noise_scale': 0.6324186245714162, 'forward_reward_weight': 1.49910255

Best hyperparameters:  {'reset_noise_scale': 0.036384281716755174, 'forward_reward_weight': 1.6734584377802377, 'ctrl_cost_weight': 1.4114977503409765, 'healthy_reward': 2.3778485263135485, 'contact_cost_weight': 5.2035838720379083e-05, 'healthy_z_lower': 0.28072846666427437, 'healthy_z_upper': 1.0352095932018308, 'contact_force_min': -1.1644341511298375, 'contact_force_max': 1.0234607887152494, 'learning_rate': 3.1894181556364527e-06, 'n_steps': 10240, 'batch_size': 16, 'gamma': 0.9594058727562937, 'gae_lambda': 0.9620126823049164, 'clip_range': 0.14604633707952877, 'ent_coef': 0.06672931495998022}
