In [12]:
import gymnasium as gym
from stable_baselines3 import SAC, PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
import tensorboard
import optuna
import gc

# from stable_baselines3_jax import SAC
# from stable_baselines3_jax.common.evaluation import evaluate_policy
# from stable_baselines3_jax.common.vec_env import SubprocVecEnv, VecNormalize


In [13]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight, healthy_reward, contact_cost_weight, healthy_z_range, contact_force_range):
    """
    Crea e restituisce l'ambiente Ant-v5 dalla libreria Gymnasium con i parametri specificati.
    """
    # Crea l'ambiente Ant-v5 con i parametri forniti
    return gym.make("Ant-v5", 
                    reset_noise_scale=reset_noise_scale, 
                    forward_reward_weight=forward_reward_weight, 
                    ctrl_cost_weight=ctrl_cost_weight, 
                    healthy_reward=healthy_reward, 
                    contact_cost_weight=contact_cost_weight,
                    healthy_z_range=healthy_z_range,
                    contact_force_range=contact_force_range)
                   # render_mode='none')  # Opzionalmente, puoi specificare il render mode

In [14]:
# Hyperparameter tuning con Optuna

def objective(trial):
    # Parametri dell'environment
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0, 0.3)           # Default circa 0.1; esploriamo da 0.05 a 0.2
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 1.6, 1.9)     # Default tipico è 1; esploriamo da 0.5 a 1.5
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 1.2, 1.6)               # Default tipico 0.5; esploriamo da 0.1 a 1.0
    healthy_reward = trial.suggest_float('healthy_reward', 2.1, 2.5)                   # Default tipico 1; esploriamo da 0.5 a 1.5

    # Parametri aggiuntivi per Ant-v5
    contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-6, 1e-4)  # Es. range intorno a 5e-4 come default
    healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.1, 0.4)             # Per definire l'intervallo di altezze "sane"
    healthy_z_upper = trial.suggest_float('healthy_z_upper', 1.1, 1.4)
    contact_force_min = trial.suggest_float('contact_force_min', -1.3, -1)         # Modificabile se usi forze di contatto
    contact_force_max = trial.suggest_float('contact_force_max', 0.8, 1.1)

    # Crea l'ambiente passando tutti i parametri
    NUM_ENVS=6
    env = SubprocVecEnv([
        lambda: make_env(
            reset_noise_scale,
            forward_reward_weight,
            ctrl_cost_weight,
            healthy_reward,
            contact_cost_weight=contact_cost_weight,
            healthy_z_range=(healthy_z_lower, healthy_z_upper),
            contact_force_range=(contact_force_min, contact_force_max)
        ) for _ in range(NUM_ENVS)
    ])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    env.training = False # Setta l'environment in modalità di valutazione
    env.norm_reward = False # Disabilita la normalizzazione della reward. Questo è importante per valutare correttamente il modello.

    # ---------------------------
    # Iperparametri per il modello SAC
    # ---------------------------
    # Parametri di ottimizzazione
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    learning_starts = trial.suggest_int('learning_starts', 1000, 10000, step=1000)
    batch_size = trial.suggest_categorical('batch_size', [256, 512, 1024, 2048])  
    gamma = trial.suggest_float('gamma', 0.93, 0.96)
    tau = trial.suggest_float('tau', 0.005, 0.05)
    ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)
    train_freq = trial.suggest_int('train_freq', 1, 10)
    gradient_steps = trial.suggest_int('gradient_steps', 1, 10)

    # Crea ed allena il modello SAC con i parametri ottimizzati
    model = SAC("MlpPolicy", env,
                learning_rate=learning_rate,
                buffer_size=50000,
                learning_starts=learning_starts,
                batch_size=batch_size,
                gamma=gamma,
                tau=tau,
                ent_coef=ent_coef,
                train_freq=train_freq,
                gradient_steps=gradient_steps,
                seed=42,
                verbose=0)
    model.learn(total_timesteps=50000)

    # Valuta il modello su 200 episodi
    episodes = 200
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=episodes)

    # Chiudi e rilascia le risorse
    env.close()
    del model, env
    gc.collect()

    return mean_reward

# Crea uno studio Optuna e ottimizza l'obiettivo
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

# Stampa i migliori iperparametri trovati
print("Best hyperparameters: ", study.best_params)


[I 2025-02-17 10:35:47,241] A new study created in memory with name: no-name-da3b0d48-4966-4a80-9778-ccb6838a9742
[I 2025-02-17 10:36:33,526] Trial 0 finished with value: 601.6952262351025 and parameters: {'reset_noise_scale': 0.19119718607806552, 'forward_reward_weight': 1.6804491245737976, 'ctrl_cost_weight': 1.207769020927843, 'healthy_reward': 2.272478700295021, 'contact_cost_weight': 8.624552952429674e-05, 'healthy_z_lower': 0.38279248859253384, 'healthy_z_upper': 1.2171170680345178, 'contact_force_min': -1.0198298269633561, 'contact_force_max': 0.9574785640914556, 'learning_rate': 0.00024459622520117355, 'learning_starts': 7000, 'batch_size': 512, 'gamma': 0.9543705145678568, 'tau': 0.015441338990150507, 'ent_coef': 0.06876165247531794, 'train_freq': 8, 'gradient_steps': 2}. Best is trial 0 with value: 601.6952262351025.
[I 2025-02-17 10:37:51,214] Trial 1 finished with value: 396.30643822649336 and parameters: {'reset_noise_scale': 0.1181019218488185, 'forward_reward_weight': 1.

Best hyperparameters:  {'reset_noise_scale': 0.20141630762026747, 'forward_reward_weight': 1.6278491287471968, 'ctrl_cost_weight': 1.2763028508139123, 'healthy_reward': 2.399103414226093, 'contact_cost_weight': 5.2247028604018e-06, 'healthy_z_lower': 0.16940418341206293, 'healthy_z_upper': 1.3272275689129294, 'contact_force_min': -1.2110815310071597, 'contact_force_max': 0.8982646857233315, 'learning_rate': 0.0006839314025125863, 'learning_starts': 4000, 'batch_size': 512, 'gamma': 0.9508951442412019, 'tau': 0.019742481119711143, 'ent_coef': 0.012500206557457209, 'train_freq': 3, 'gradient_steps': 7}
