In [13]:
import gymnasium as gym
import numpy as np
import os
import torch
import matplotlib.pyplot as plt
import imageio
import time

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize, DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, CallbackList
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [14]:
# Number of parallel environments used for training
NUM_ENVS = 4

def make_env():
    """Return a zero-argument factory that builds one HalfCheetah-v5 training env.

    The factory (not an environment) is returned so that a vectorized wrapper
    such as ``DummyVecEnv`` can call it to create its own environment instance.

    Returns:
        A callable taking no arguments that returns a new ``HalfCheetah-v5``
        environment created with the noise scale and reward weights below and
        with rendering disabled.
    """
    def _init():
        return gym.make(
            "HalfCheetah-v5",
            reset_noise_scale=0.18925327466415615,
            forward_reward_weight=1.158890288504633,
            ctrl_cost_weight=0.05108108521573771,
            # Gymnasium disables rendering with None; the string 'none' is not a
            # declared render mode and only produces a warning at creation time.
            render_mode=None,
        )

    return _init

# Training environments: NUM_ENVS copies stepped in-process by DummyVecEnv,
# wrapped so that observations and rewards are normalized with running
# statistics (normalized observations are clipped to [-10, 10]).
env = VecNormalize(
    DummyVecEnv([make_env() for _ in range(NUM_ENVS)]),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.,
)

# Pick the best available torch device: CUDA first, then Apple MPS, else CPU.
# Training below is pinned to "cpu"; this value is used when the saved model
# is reloaded later in this cell.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Keyword arguments forwarded to the PPO constructor.
model_params = dict(
    policy="MlpPolicy",
    env=env,
    learning_rate=8.272618650819588e-05,
    n_steps=2048,  # rollout length per environment
    batch_size=256,  # evenly divides the rollout buffer (n_steps * NUM_ENVS)
    n_epochs=10,
    gamma=0.9808272185952741,
    gae_lambda=0.9080997013001573,
    clip_range=0.2,
    ent_coef=0.05,  # entropy bonus to encourage exploration
    verbose=1,
    tensorboard_log="./ppo_HalfCheetah_tensorboard/",
    device="cpu",
    policy_kwargs={"net_arch": [256, 256, 128]},
)

# Factory for the evaluation environment
def make_eval_env():
    """Return a zero-argument factory that builds one monitored HalfCheetah-v5 env.

    The environment uses the same noise scale and reward weights as the
    training factory and is wrapped in ``Monitor``.

    Returns:
        A callable taking no arguments that returns a new ``Monitor``-wrapped
        ``HalfCheetah-v5`` environment with rendering disabled.
    """
    def _init():
        base_env = gym.make(
            "HalfCheetah-v5",
            reset_noise_scale=0.18925327466415615,
            forward_reward_weight=1.158890288504633,
            ctrl_cost_weight=0.05108108521573771,
            # Gymnasium disables rendering with None; the string 'none' is not a
            # declared render mode and only produces a warning at creation time.
            render_mode=None,
        )
        return Monitor(base_env)

    return _init

# Evaluation environment used by EvalCallback while training.
# It gets its own VecNormalize wrapper instead of loading
# "vecnormalize_HalfCheetah.pkl": that file is only written after training
# (further down), so loading it here fails on a fresh run or silently picks up
# statistics left over from an earlier run. EvalCallback synchronizes the
# training env's running statistics into this wrapper before evaluating, so the
# wrapper must not update them itself (training=False) and should expose
# unnormalized rewards (norm_reward=False).
eval_env = DummyVecEnv([make_eval_env()])
eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10.)

# Callbacks: periodic evaluation (keeps the best model) and periodic checkpoints.
# NOTE(review): eval_freq / save_freq count calls to the vectorized env's
# step(), each of which advances NUM_ENVS timesteps, so these fire every
# 20000 * NUM_ENVS and 10000 * NUM_ENVS timesteps respectively — confirm this
# is the intended cadence.
eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/best_model",
                             log_path="./logs/", eval_freq=20000, deterministic=True, render=False)
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path="./logs/checkpoints/",
                                         name_prefix="ppo_halfcheetah_checkpoint")

# Build and train the model
model = PPO(**model_params)
model.learn(total_timesteps=1_000_000, callback=CallbackList([eval_callback, checkpoint_callback]))

# Save the trained model together with the normalization statistics
model.save("ppo_HalfCheetah_model")
env.save("vecnormalize_HalfCheetah.pkl")

# Reload the model and the saved normalization statistics for the final evaluation.
# The statistics are attached to a fresh, un-normalized vec env: handing the
# existing eval_env (already a VecNormalize) to VecNormalize.load would stack
# two normalization layers and normalize every observation twice.
model = PPO.load("ppo_HalfCheetah_model", device=device)
eval_env.close()  # release the env that was used by EvalCallback
eval_env = VecNormalize.load("vecnormalize_HalfCheetah.pkl", DummyVecEnv([make_eval_env()]))
eval_env.training = False  # freeze the running statistics during evaluation
eval_env.norm_reward = False  # evaluate on the environment's true rewards

def evaluate_agent(model, env, episodes=100):
    """Run a deterministic evaluation of ``model`` on ``env`` and print the result.

    Args:
        model: Policy/agent passed through to ``evaluate_policy``.
        env: Environment passed through to ``evaluate_policy``.
        episodes: Number of evaluation episodes (default 100).

    Returns:
        tuple: ``(mean_reward, std_reward)`` as returned by ``evaluate_policy``.
    """
    avg_return, return_spread = evaluate_policy(
        model,
        env,
        n_eval_episodes=episodes,
        deterministic=True,
    )
    print(f"Mean Reward: {avg_return:.2f} ± {return_spread:.2f}")
    return avg_return, return_spread

# Evaluate the trained model over 100 episodes and keep the mean / std reward
mean_reward_trained, std_reward_trained = evaluate_agent(model, eval_env, episodes=100)


Using cpu device
Logging to ./ppo_HalfCheetah_tensorboard/PPO_12
------------------------------
| time/              |       |
|    fps             | 15666 |
|    iterations      | 1     |
|    time_elapsed    | 1     |
|    total_timesteps | 16384 |
------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 8889        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.008369913 |
|    clip_fraction        | 0.0599      |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.57       |
|    explained_variance   | -1.09       |
|    learning_rate        | 8.27e-05    |
|    loss                 | -0.385      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00546    |
|    std                  | 1.02        |
|   

In [15]:
# 5. Save the model (repeats the save already done at the end of the training cell)
model.save("ppo_HalfCheetah_model")
env.save("vecnormalize_HalfCheetah.pkl")    # also save the normalization parameters
