In [75]:
import gymnasium as gym
import numpy as np
import os
import torch
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, CallbackList
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


In [76]:

# Use SubprocVecEnv to take advantage of multiprocessing (faster than DummyVecEnv)
NUM_ENVS = 4  # Number of parallel environments used to speed up training

# Factory used by the vectorised wrappers: each call builds one monitored Ant env.
def make_env(
    healthy_z_lower=0.26133111370542855,
    healthy_z_upper=1.0967413845523089,
    contact_force_min=-0.5801381783852236,
    contact_force_max=0.7870366711904808,
):
    """Create a single ``Ant-v5`` environment wrapped in ``Monitor``.

    Args:
        healthy_z_lower: Lower bound passed as the first element of
            ``healthy_z_range``.
        healthy_z_upper: Upper bound passed as the second element of
            ``healthy_z_range``.
        contact_force_min: Lower bound passed as the first element of
            ``contact_force_range``.
        contact_force_max: Upper bound passed as the second element of
            ``contact_force_range``.

    Returns:
        The ``Monitor``-wrapped environment returned by ``gym.make``.
    """
    # NOTE(review): in Ant-v5 the contact-cost settings are believed to only
    # take effect when ``use_contact_forces=True`` is also passed -- confirm
    # whether leaving it at its default is intended.
    ant_env = gym.make(
        "Ant-v5",
        reset_noise_scale=0.08325455885769968,
        forward_reward_weight=1.199616714651314,
        ctrl_cost_weight=0.6668668001731599,
        healthy_reward=1.0,
        contact_cost_weight=5e-4,
        healthy_z_range=(healthy_z_lower, healthy_z_upper),
        contact_force_range=(contact_force_min, contact_force_max),
        # Gymnasium disables rendering with ``None``; the string 'none' is not
        # a supported render mode and makes ``gym.make`` emit a warning.
        render_mode=None,
    )
    return Monitor(ant_env)

# Build NUM_ENVS Ant environments through SubprocVecEnv, then wrap the result
# in VecNormalize so observations and rewards are normalised.
env_factories = [make_env] * NUM_ENVS
env = VecNormalize(
    SubprocVecEnv(env_factories),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.0,
)


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


In [77]:
# PPO hyperparameters (candidates for tuning with Optuna).
model_params = {
    "policy": "MlpPolicy",
    "env": env,
    "learning_rate": 1.3481946009485854e-05,  # Tune with Optuna to find the best value
    "n_steps": 6144,
    "batch_size": 128,
    "n_epochs": 10,
    "gamma": 0.9955878438529644,
    "gae_lambda": 0.9445556509709931,
    "clip_range": 0.13789333358719322,
    "ent_coef": 0.03301914704109688,
    "verbose": 1,
    "tensorboard_log": "./ppo_Ant_tensorboard/",
    # Use the Apple-silicon GPU when present; otherwise let SB3 pick the device
    # so the notebook still runs on machines without MPS.
    "device": "mps" if torch.backends.mps.is_available() else "auto",
    # The comma after the "device" entry was missing, which made this dict
    # literal a SyntaxError once "policy_kwargs" was added.
    "policy_kwargs": dict(net_arch=[256, 256, 128]),
}

# Callbacks for periodic evaluation and checkpointing.
#
# The evaluation env must not learn its own normalisation statistics:
# ``training=False`` freezes the running averages (EvalCallback syncs them from
# the training env before each evaluation) and ``norm_reward=False`` keeps the
# evaluation rewards on their original scale.
eval_env = SubprocVecEnv([make_env for _ in range(NUM_ENVS)])
eval_env = VecNormalize(
    eval_env,
    training=False,
    norm_obs=True,
    norm_reward=False,
    clip_obs=10.,
)

# NOTE: SB3 counts ``eval_freq`` / ``save_freq`` in callback calls (one per
# vectorised step), so with NUM_ENVS parallel envs these trigger every
# ``freq * NUM_ENVS`` environment timesteps. Use ``max(freq // NUM_ENVS, 1)``
# if the values are meant as raw timesteps.
eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/best_model",
                             log_path="./logs/", eval_freq=70000, deterministic=True, render=False)

checkpoint_callback = CheckpointCallback(save_freq=10000, save_path="./logs/checkpoints/",
                                         name_prefix="ppo_ant_checkpoint")

  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


In [78]:
# Train the model: build the PPO agent from the hyperparameter dict and run
# learning with both the evaluation and checkpoint callbacks attached.
model = PPO(**model_params)
training_callbacks = CallbackList([eval_callback, checkpoint_callback])
model.learn(
    total_timesteps=1_000_000,
    callback=training_callbacks,
)

# Persist the trained policy together with the VecNormalize wrapper state.
model.save("ppo_Ant_model")
env.save("vecnormalize_Ant.pkl")

# Evaluation helper that does not disturb the normalisation statistics.
def evaluate_agent(model, env, episodes=100):
    """Evaluate ``model`` on ``env`` and print the mean and std of the reward.

    When ``env`` is a ``VecNormalize`` wrapper, its ``training`` and
    ``norm_reward`` flags are switched off for the duration of the evaluation
    and restored afterwards, so the evaluation rollouts neither update the
    running normalisation statistics nor use normalised rewards.

    Args:
        model: Trained agent passed to ``evaluate_policy``.
        env: Environment to evaluate on.
        episodes: Number of evaluation episodes.

    Returns:
        Tuple ``(mean_reward, std_reward)`` as returned by ``evaluate_policy``.
    """
    freeze_stats = isinstance(env, VecNormalize)
    if freeze_stats:
        previous_flags = (env.training, env.norm_reward)
        env.training = False
        env.norm_reward = False
    try:
        mean_reward, std_reward = evaluate_policy(
            model, env, n_eval_episodes=episodes, deterministic=True
        )
    finally:
        # Always restore the wrapper so later training or saving is unaffected.
        if freeze_stats:
            env.training, env.norm_reward = previous_flags
    print(f"Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")
    return mean_reward, std_reward

# Evaluate the trained model
mean_reward_trained, std_reward_trained = evaluate_agent(model, env, episodes=100)

Using mps device
Logging to ./ppo_Ant_tensorboard/PPO_7


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60.1     |
|    ep_rew_mean     | -105     |
| time/              |          |
|    fps             | 1359     |
|    iterations      | 1        |
|    time_elapsed    | 18       |
|    total_timesteps | 24576    |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 63.1          |
|    ep_rew_mean          | -110          |
| time/                   |               |
|    fps                  | 914           |
|    iterations           | 2             |
|    time_elapsed         | 53            |
|    total_timesteps      | 49152         |
| train/                  |               |
|    approx_kl            | 0.00083612435 |
|    clip_fraction        | 0.00144       |
|    clip_range           | 0.138         |
|    entropy_loss         | -11.4         |
|    explained_variance   | -1.4          |


In [None]:
# 5. Save the model
# NOTE(review): these are the same two save calls already made right after
# training in the previous cell -- confirm whether this cell is still needed.
model.save("ppo_Ant_model")
env.save("vecnormalize_Ant.pkl")  # also save the normalisation parameters
