In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import numpy as np
import tensorboard
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight, healthy_reward):
    """
    Crea e restituisce l'ambiente Ant-v5 dalla libreria Gymnasium con i parametri specificati.
    """
    # Ant-v5 è l’ambiente più recente in Gymnasium.
    return gym.make("Ant-v5", 
                    reset_noise_scale=reset_noise_scale, 
                    forward_reward_weight=forward_reward_weight, 
                    ctrl_cost_weight=ctrl_cost_weight, 
                    healthy_reward=healthy_reward, 
                    render_mode='None')

In [3]:
# Hyperparameter tuning con Optuna

def objective(trial):
    # Definisci lo spazio di ricerca degli iperparametri
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.0, 1.0)
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.0, 1.0)
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.0, 1.0)
    healthy_reward = trial.suggest_float('healthy_reward', 0.0, 1.0)
    
    # Crea l'ambiente con gli iperparametri suggeriti
    env = make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight, healthy_reward)
    env = DummyVecEnv([lambda: env])
    
    # Definisci altri iperparametri del modello
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    n_steps = trial.suggest_int('n_steps', 2048, 8192, step=2048)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])
    gamma = trial.suggest_float('gamma', 0.9, 0.9999)
    
    # Crea il modello PPO
    model = PPO("MlpPolicy", env, learning_rate=learning_rate, n_steps=n_steps, batch_size=batch_size, gamma=gamma, verbose=0)
    
    # Addestra il modello
    model.learn(total_timesteps=100000)
    
    # Valuta il modello
    episodes = 100  # Aumenta il numero di episodi per una valutazione più accurata
    episode_rewards = []
    for episode in range(episodes):
        obs = env.reset()
        done = False
        episode_reward = 0
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            episode_reward += reward
        episode_rewards.append(episode_reward)
    
    # Calcola la media e la varianza delle ricompense
    mean_reward = np.mean(episode_rewards)
    reward_variance = np.var(episode_rewards)
    
    # Ritorna una combinazione della media e della varianza delle ricompense
    return mean_reward / (1 + reward_variance)

# Crea uno studio Optuna e ottimizza l'obiettivo
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Stampa i migliori iperparametri trovati
print("Best hyperparameters: ", study.best_params)

[I 2025-02-11 12:30:41,293] A new study created in memory with name: no-name-dce1941a-a680-46a9-9b98-54e8e95f1e3e
  logger.warn(
[I 2025-02-11 12:31:23,033] Trial 0 finished with value: -0.0005831763904471871 and parameters: {'reset_noise_scale': 0.007573013223979852, 'forward_reward_weight': 0.25072123296798066, 'ctrl_cost_weight': 0.9870528948091156, 'healthy_reward': 0.44155366131043206, 'learning_rate': 7.23223076932237e-05, 'n_steps': 2048, 'batch_size': 128, 'gamma': 0.9419710212204195}. Best is trial 0 with value: -0.0005831763904471871.
  logger.warn(
[I 2025-02-11 12:31:58,253] Trial 1 finished with value: -0.0012623290231790513 and parameters: {'reset_noise_scale': 0.6170235170302988, 'forward_reward_weight': 0.35792571906138904, 'ctrl_cost_weight': 0.5877793653802755, 'healthy_reward': 0.07396913381156855, 'learning_rate': 0.0004307334101658675, 'n_steps': 2048, 'batch_size': 256, 'gamma': 0.9676706294610165}. Best is trial 0 with value: -0.0005831763904471871.
[I 2025-02-11

Best hyperparameters:  {'reset_noise_scale': 0.07467935946049155, 'forward_reward_weight': 0.07619282247649807, 'ctrl_cost_weight': 0.9714884866781529, 'healthy_reward': 0.7882423009290584, 'learning_rate': 0.0005596010992131322, 'n_steps': 2048, 'batch_size': 256, 'gamma': 0.91848669540811}
