In [7]:
import gymnasium as gym
import numpy as np
import os
import torch
import matplotlib.pyplot as plt
import imageio

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize, DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, CallbackList
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [8]:
# Usa SubprocVecEnv per sfruttare il multiprocessing (più veloce di DummyVecEnv)
NUM_ENVS = 4  # Numero di ambienti paralleli per accelerare il training

# Definiamo la funzione per creare un ambiente vettorializzato
def make_env():
    return Monitor(gym.make("HalfCheetah-v5",
                            reset_noise_scale=0.0719410443033492,
                            forward_reward_weight=0.8079894174326131,
                            ctrl_cost_weight=0.47961956759514446,
                            render_mode='none'))

# Creiamo gli ambienti paralleli
env = SubprocVecEnv([make_env for _ in range(NUM_ENVS)])
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

# Parametri del modello (puoi ottimizzarli con Optuna)
model_params = {
    "policy": "MlpPolicy",
    "env": env,
    "learning_rate": 1.582554022730496e-05,  # Valore tipico per HalfCheetah
    "n_steps": 4096,
    "batch_size": 64,
    "n_epochs": 10,
    "gamma": 0.9895868337827359,
    "gae_lambda": 0.8258089470360688,
    "clip_range": 0.2961352072414478,
    "ent_coef": 0.043318253089964606,
    "verbose": 1,
    "tensorboard_log": "./ppo_HalfCheetah_tensorboard/",
    "device": "mps",  # Usa GPU se disponibile
    "policy_kwargs": dict(net_arch=[256, 256, 128])
}

# Definiamo i callback per salvataggio e valutazione
eval_env = SubprocVecEnv([make_env for _ in range(NUM_ENVS)])
eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)

eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/best_model",
                             log_path="./logs/", eval_freq=5000, deterministic=True, render=False)

checkpoint_callback = CheckpointCallback(save_freq=10000, save_path="./logs/checkpoints/",
                                         name_prefix="ppo_halfcheetah_checkpoint")

# Training del modello
model = PPO(**model_params)
model.learn(total_timesteps=1_000_000, callback=CallbackList([eval_callback, checkpoint_callback]))

# Salvataggio del modello e della normalizzazione
model.save("ppo_HalfCheetah_model")
env.save("vecnormalize_HalfCheetah.pkl")

# Funzione di valutazione migliorata
def evaluate_agent(model, env, episodes=100):
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=episodes, deterministic=True)
    print(f"Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")
    return mean_reward, std_reward

# Valutiamo il modello addestrato
mean_reward_trained, std_reward_trained = evaluate_agent(model, env, episodes=100)


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Using mps device
Logging to ./ppo_HalfCheetah_tensorboard/PPO_3


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -1.52e+03 |
| time/              |           |
|    fps             | 1428      |
|    iterations      | 1         |
|    time_elapsed    | 11        |
|    total_timesteps | 16384     |
----------------------------------
Eval num_timesteps=20000, episode_reward=-2.75 +/- 0.51
Episode length: 1000.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 1e+03      |
|    mean_reward          | -2.75      |
| time/                   |            |
|    total_timesteps      | 20000      |
| train/                  |            |
|    approx_kl            | 0.00652959 |
|    clip_fraction        | 0.00558    |
|    clip_range           | 0.296      |
|    entropy_loss         | -8.53      |
|    explained_variance   | -0.777     |
|    learning_rate        | 1.58e-05   |
|    loss                 | 

In [9]:
#5. Salviamo il modello
model.save("ppo_Ant_model")
env.save("vecnormalize_Ant.pkl")  # salviamo anche i parametri di normalizzazione


In [5]:
import gymnasium as gym
import numpy as np
import os
import torch
import matplotlib.pyplot as plt
import imageio

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize, DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, CallbackList
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
NUM_ENVS=2
def make_envv():
    return Monitor(gym.make("HalfCheetah-v5",
                            reset_noise_scale=0.0719410443033492,
                            forward_reward_weight=0.8079894174326131,
                            ctrl_cost_weight=0.47961956759514446,
                            render_mode='human'))

# Creiamo gli ambienti paralleli
env = SubprocVecEnv([make_envv for _ in range(NUM_ENVS)])
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
# Funzione per visualizzare la policy in tempo reale e registrare il video
def render_and_record_policy(model_path, output_filename="videos/halfcheetah_best_policy.mp4", episodes=1):
    os.makedirs("videos", exist_ok=True)
    env = make_envv()
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load("vecnormalize_HalfCheetah.pkl", env)
    env.training = False  # Disabilita la normalizzazione della reward per la valutazione
    env.norm_reward = False
    
    model = PPO.load(model_path, env=env)
    obs = env.reset()
    frames = []
    
    for _ in range(episodes * 1000):  # Esegui abbastanza step per registrare un episodio completo
        action, _ = model.predict(obs, deterministic=True)
        obs, _, done, _ = env.step(action)
        frames.append(env.render(mode='rgb_array'))
        if done:
            obs = env.reset()
    
    imageio.mimsave(output_filename, frames, fps=30)
    print(f"Video salvato in {output_filename}")
    env.close()

# Registra un video della policy ottimale
render_and_record_policy("ppo_HalfCheetah_model")

2025-02-13 22:56:18.810 Python[18346:583589] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-13 22:56:18.810 Python[18346:583589] +[IMKInputSession subclass]: chose IMKInputSession_Modern
                We allow to pass a mode argument to maintain a backwards compatible VecEnv API, but the mode (rgb_array)
                has to be the same as the environment render mode (human) which is not the case.


ValueError: The image must have at least two spatial dimensions.

: 