In [7]:
import gymnasium as gym
from stable_baselines3 import SAC, PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import numpy as np
import tensorboard
import optuna

In [8]:
def make_env(
    reset_noise_scale,
    forward_reward_weight,
    ctrl_cost_weight,
    healthy_reward,
    contact_cost_weight,
    healthy_z_range,
    contact_force_range,
):
    """
    Build a Gymnasium ``Ant-v5`` environment configured with the given settings.

    Every argument is forwarded unchanged to ``gym.make`` as the keyword
    argument of the same name; no validation is performed here.

    Args:
        reset_noise_scale: Forwarded as ``reset_noise_scale``.
        forward_reward_weight: Forwarded as ``forward_reward_weight``.
        ctrl_cost_weight: Forwarded as ``ctrl_cost_weight``.
        healthy_reward: Forwarded as ``healthy_reward``.
        contact_cost_weight: Forwarded as ``contact_cost_weight``.
        healthy_z_range: Forwarded as ``healthy_z_range``.
        contact_force_range: Forwarded as ``contact_force_range``.

    Returns:
        The object returned by ``gym.make("Ant-v5", ...)``.
    """
    ant_settings = {
        "reset_noise_scale": reset_noise_scale,
        "forward_reward_weight": forward_reward_weight,
        "ctrl_cost_weight": ctrl_cost_weight,
        "healthy_reward": healthy_reward,
        "contact_cost_weight": contact_cost_weight,
        "healthy_z_range": healthy_z_range,
        "contact_force_range": contact_force_range,
    }
    # No render_mode is passed, so gym.make falls back to its own default.
    return gym.make("Ant-v5", **ant_settings)

In [9]:
# Hyperparameter tuning with Optuna

def _collect_episode_returns(model, vec_env, n_episodes):
    """
    Roll out ``model`` on ``vec_env`` and return one summed reward per episode.

    ``vec_env`` is read at index 0 for rewards and dones, i.e. it is treated as
    a vectorized environment wrapping a single sub-environment.

    Args:
        model: Object exposing ``predict(obs, deterministic=...)``.
        vec_env: Object exposing ``reset()`` and ``step(action)`` with the
            4-tuple ``(obs, rewards, dones, infos)`` return used below.
        n_episodes: Number of episodes to run.

    Returns:
        list[float]: The undiscounted return of each episode.
    """
    episode_returns = []
    for _ in range(n_episodes):
        obs = vec_env.reset()
        episode_return = 0.0
        done = False
        while not done:
            # Deterministic actions: evaluate the learned policy itself rather
            # than the exploration noise of the stochastic actor.
            action, _state = model.predict(obs, deterministic=True)
            obs, rewards, dones, _infos = vec_env.step(action)
            # Pull scalars out of the length-1 batch so the results are plain
            # floats/bools instead of 1-element arrays.
            episode_return += float(rewards[0])
            done = bool(dones[0])
        episode_returns.append(episode_return)
    return episode_returns


def objective(trial):
    """
    Optuna objective: train SAC on a configurable Ant-v5 and score the result.

    Samples environment settings and SAC hyperparameters from ``trial``, trains
    a SAC agent for 200000 timesteps, evaluates it for 150 episodes and returns
    ``mean(returns) - variance_penalty_weight * var(returns)``.

    Args:
        trial: Optuna trial used to sample every parameter.

    Returns:
        float: The score to maximise.
    """
    # --- Environment parameters (ranges bracket the usual Ant defaults) ---
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.05, 0.2)           # around the 0.1 default
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.5, 1.5)     # around the 1.0 default
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.1, 1.0)               # around the 0.5 default
    healthy_reward = trial.suggest_float('healthy_reward', 0.5, 1.5)                   # around the 1.0 default

    # Additional Ant-v5 parameters
    contact_cost_weight = trial.suggest_float('contact_cost_weight', 1e-4, 1e-3)      # around the 5e-4 default
    healthy_z_lower = trial.suggest_float('healthy_z_lower', 0.1, 0.3)                # bounds of the "healthy" height interval
    healthy_z_upper = trial.suggest_float('healthy_z_upper', 0.8, 1.2)
    contact_force_min = trial.suggest_float('contact_force_min', -1.0, -0.5)          # only relevant when contact forces are used
    contact_force_max = trial.suggest_float('contact_force_max', 0.5, 1.0)

    # --- SAC hyperparameters ---
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])
    gamma = trial.suggest_float('gamma', 0.99, 0.999)
    # Soft-update coefficient of the target network.
    tau = trial.suggest_float('tau', 0.005, 0.05)
    ent_coef = trial.suggest_float('ent_coef', 0.0, 0.1)
    # How often to train and how many gradient steps to take each time.
    train_freq = trial.suggest_int('train_freq', 1, 10)
    gradient_steps = trial.suggest_int('gradient_steps', 1, 10)

    # Weight of the return-variance penalty in the final score.
    # NOTE(review): this weight is part of the search space *and* part of the
    # score, so the sampler can raise the score simply by lowering it; consider
    # fixing it to a constant.
    variance_penalty_weight = trial.suggest_float('variance_penalty_weight', 0.0, 0.5)

    # Build the environment from the sampled settings. The raw env gets its own
    # name so the factory lambda does not depend on a variable that is rebound
    # on the very next line.
    base_env = make_env(
        reset_noise_scale,
        forward_reward_weight,
        ctrl_cost_weight,
        healthy_reward,
        contact_cost_weight=contact_cost_weight,
        healthy_z_range=(healthy_z_lower, healthy_z_upper),
        contact_force_range=(contact_force_min, contact_force_max)
    )
    env = DummyVecEnv([lambda: base_env])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    try:
        # The replay buffer size is fixed at 5000 here rather than tuned.
        model = SAC("MlpPolicy", env,
                    learning_rate=learning_rate,
                    buffer_size=5000,
                    batch_size=batch_size,
                    gamma=gamma,
                    tau=tau,
                    ent_coef=ent_coef,
                    train_freq=train_freq,
                    gradient_steps=gradient_steps,
                    verbose=0)
        model.learn(total_timesteps=200000)

        # Freeze the normalisation statistics and turn reward normalisation
        # off so evaluation measures the environment's own rewards instead of
        # normalised values whose scale differs from trial to trial.
        env.training = False
        env.norm_reward = False

        # Evaluate the trained policy over 150 episodes.
        episode_rewards = _collect_episode_returns(model, env, n_episodes=150)
    finally:
        # Always release the environment, even if training or evaluation fails
        # or the run is interrupted; otherwise every trial leaks one env.
        env.close()

    # Mean and variance of the per-episode returns.
    mean_reward = np.mean(episode_rewards)
    reward_variance = np.var(episode_rewards)

    # Objective: maximise the mean return while penalising its variance.
    # NOTE(review): the reward weights sampled above change the reward scale
    # itself, so scores from different trials may not be directly comparable;
    # the variance is in squared reward units, so the penalty-weight range may
    # need rescaling — confirm both are intended.
    score = mean_reward - variance_penalty_weight * reward_variance

    print(f'Mean is: {mean_reward}, Variance is: {reward_variance}\n')

    return float(score)

# Set up an Optuna study that maximises the value returned by `objective`,
# then run the search for 300 trials.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=300)

# Report the best hyperparameters found by the search.
print("Best hyperparameters: ", study.best_params)

[I 2025-02-11 15:42:55,379] A new study created in memory with name: no-name-d95f78e6-1d10-4b08-b40a-523eead60d32
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
[W 2025-02-11 15:48:38,564] Trial 0 failed with parameters: {'reset_noise_scale': 0.11533401269087494, 'forward_reward_weight': 0.5400519325083708, 'ctrl_cost_weight': 0.5339821578257153, 'healthy_reward': 1.0252369037734363, 'contact_cost_weight': 0.0009749863319379205, 'healthy_z_lower': 0.11786367250103073, 'healthy_z_upper': 0.934977885900501, 'contact_force_min': -0.7268142870869859, 'contact_force_max': 0.5723674962536733, 'learning_rate': 0.00027561669101076636, 'batch_size': 64, 'gamma': 0.9903695671923175, 'tau': 0.011297184075772721, 'ent_coef': 0.08025750126406664, 'train_freq': 2, 'gradient_steps': 10, 'variance_penalty_weight': 0.04058657056713022} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/ignazioemanuelepicciche/Documents/Ignazio

KeyboardInterrupt: 