In [7]:
import gymnasium as gym
from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
import tensorboard
import optuna
import gc
from stable_baselines3.common.noise import NormalActionNoise


In [8]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight, healthy_reward, contact_cost_weight, healthy_z_range, contact_force_range):
    """
    Crea e restituisce l'ambiente Ant-v5 dalla libreria Gymnasium con i parametri specificati.
    """
    # Ant-v5 è l’ambiente più recente in Gymnasium.
    return gym.make("Ant-v5", 
                    reset_noise_scale=reset_noise_scale, 
                    forward_reward_weight=forward_reward_weight, 
                    ctrl_cost_weight=ctrl_cost_weight, 
                    healthy_reward=healthy_reward, 
                    contact_cost_weight = contact_cost_weight,
                    healthy_z_range=healthy_z_range,
                    contact_force_range=contact_force_range)
                   # render_mode='none')

In [9]:
def objective(trial):
    # Parametri dell'environment
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0, 0.3)
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 1.6, 1.9)
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 1.2, 1.6)
    healthy_reward = trial.suggest_float('healthy_reward', 2.1, 2.5)
    contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-6, 1e-4)
    healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.1, 0.4)
    healthy_z_upper = trial.suggest_float('healthy_z_upper', 1.1, 1.4)
    contact_force_min = trial.suggest_float('contact_force_min', -1.3, -1)
    contact_force_max = trial.suggest_float('contact_force_max', 0.8, 1.1)

    NUM_ENVS = 6
    env = SubprocVecEnv([
        lambda: make_env(
            reset_noise_scale,
            forward_reward_weight,
            ctrl_cost_weight,
            healthy_reward,
            contact_cost_weight, 
            healthy_z_range=(healthy_z_lower, healthy_z_upper),
            contact_force_range=(contact_force_min, contact_force_max)
        ) for _ in range(NUM_ENVS)
    ])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    env.training = False
    env.norm_reward = False

    # Parametri di ottimizzazione per TD3
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    learning_starts = trial.suggest_int('learning_starts', 1000, 10000, step=1000)
    batch_size = trial.suggest_categorical('batch_size', [256, 512, 1024, 2048])  
    gamma = trial.suggest_float('gamma', 0.93, 0.96)
    tau = trial.suggest_float('tau', 0.005, 0.05)
    noise_std = trial.suggest_float('noise_std', 0.1, 0.5)  # Stddev per NormalActionNoise
    noise_clip = trial.suggest_float('noise_clip', 0.2, 0.5)
    policy_delay = trial.suggest_int('policy_delay', 1, 3)
    train_freq = trial.suggest_int('train_freq', 1, 10)
    gradient_steps = trial.suggest_int('gradient_steps', 1, 10)

    # Definiamo il noise per l'azione
    action_noise = NormalActionNoise(mean=np.zeros(8), sigma=noise_std * np.ones(8))

    model = TD3("MlpPolicy", env,
                learning_rate=learning_rate,
                buffer_size=50000,
                learning_starts=learning_starts,
                batch_size=batch_size,
                gamma=gamma,
                tau=tau,
                action_noise=action_noise,
                policy_delay=policy_delay,
                train_freq=train_freq,
                gradient_steps=gradient_steps,
                seed=42,
                verbose=0)
    model.learn(total_timesteps=50000)

    episodes = 200
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=episodes)

    env.close()
    del model, env
    gc.collect()

    return mean_reward

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

print("Best hyperparameters: ", study.best_params)

[I 2025-02-18 09:34:26,458] A new study created in memory with name: no-name-d41f3e7c-b192-488e-92c1-47df582b5a34
[I 2025-02-18 09:35:56,316] Trial 0 finished with value: 1908.7595740269455 and parameters: {'reset_noise_scale': 0.00897325206209999, 'forward_reward_weight': 1.742233833714926, 'ctrl_cost_weight': 1.4779971438910726, 'healthy_reward': 2.1232070898906956, 'contact_cost_weight': 9.672108021212856e-06, 'healthy_z_lower': 0.30548411186006724, 'healthy_z_upper': 1.2978014501390338, 'contact_force_min': -1.2030516632763952, 'contact_force_max': 1.0144279949808082, 'learning_rate': 0.0006235893581706442, 'learning_starts': 6000, 'batch_size': 1024, 'gamma': 0.9477150142917476, 'tau': 0.04045128932455767, 'noise_std': 0.4441567435302099, 'noise_clip': 0.24980649346041434, 'policy_delay': 3, 'train_freq': 6, 'gradient_steps': 4}. Best is trial 0 with value: 1908.7595740269455.
[I 2025-02-18 09:36:34,045] Trial 1 finished with value: -1805.9762292411515 and parameters: {'reset_nois

KeyboardInterrupt: 