In [13]:
import gymnasium as gym
import numpy as np
import os
import torch
import matplotlib.pyplot as plt
import imageio
import time

from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize, DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, CallbackList
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [14]:
# Number of parallel environments used for training
NUM_ENVS = 4

# Custom wrapper that adds a flip penalty to the reward
class CustomRewardWrapper(gym.Wrapper):
    """Penalise the agent for as long as the torso stays flipped over.

    While the torso angle (``qpos[2]`` of the unwrapped MuJoCo environment) is
    below ``flip_threshold``, each step's reward is reduced by
    ``penalty_rate * elapsed``, where ``elapsed`` is the simulated time that has
    passed since the angle first dropped below the threshold. The penalty is
    therefore zero on the first flipped step and grows linearly afterwards.

    The duration is measured with the simulator clock (``data.time``) rather
    than the wall clock, so the shaped reward depends only on the trajectory
    and not on how fast the host machine happens to step the environment.

    Args:
        env: The environment to wrap. Its unwrapped form must expose
            ``data.qpos`` and ``data.time``.
        flip_threshold: Torso angle below which the agent counts as flipped.
        penalty_rate: Penalty per unit of simulated time spent flipped.
    """

    def __init__(self, env, flip_threshold=-0.7, penalty_rate=50.0):
        super().__init__(env)
        self.flip_threshold = flip_threshold
        self.penalty_rate = penalty_rate
        # Simulated time at which the current flip began; None while upright.
        self.cappottato_start_time = None

    def reset(self, **kwargs):
        """Reset the wrapped environment and forget any flip in progress."""
        # The simulator clock restarts with the episode, so a start time kept
        # from the previous episode would yield a meaningless duration.
        self.cappottato_start_time = None
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped environment and subtract the flip penalty.

        Returns:
            The usual ``(obs, reward, terminated, truncated, info)`` tuple, with
            ``reward`` reduced while the torso angle is below the threshold.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)

        sim_data = self.env.unwrapped.data
        torso_angle = sim_data.qpos[2]

        if torso_angle < self.flip_threshold:
            if self.cappottato_start_time is None:
                self.cappottato_start_time = sim_data.time
            tempo_cappottato = sim_data.time - self.cappottato_start_time
            reward -= self.penalty_rate * tempo_cappottato
        else:
            self.cappottato_start_time = None

        return obs, reward, terminated, truncated, info

# Environment factory
def make_env():
    """Return a zero-argument callable that builds one wrapped HalfCheetah env.

    The callable creates ``HalfCheetah-v5`` with the tuned noise and reward
    weights, wraps it in ``Monitor`` and then in ``CustomRewardWrapper``.
    Because ``Monitor`` sits inside the reward wrapper, it sees the rewards
    before the wrapper's penalty is applied.
    """
    cheetah_kwargs = dict(
        reset_noise_scale=0.13635555699602933,
        forward_reward_weight=0.7151140526343989,
        ctrl_cost_weight=0.19342622590821706,
    )

    def _init():
        base_env = gym.make("HalfCheetah-v5", **cheetah_kwargs)
        return CustomRewardWrapper(Monitor(base_env))

    return _init

# Build the vectorised training environment (one subprocess per env);
# observations are normalised and clipped, rewards are left untouched
train_env_fns = [make_env() for _ in range(NUM_ENVS)]
env = VecNormalize(
    SubprocVecEnv(train_env_fns),
    norm_obs=True,
    norm_reward=False,
    clip_obs=10.,
)

# Pick the device automatically: GPU when available, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# SAC hyperparameters
model_params = dict(
    policy="MlpPolicy",
    env=env,
    learning_rate=4.3539588088977104e-05,
    buffer_size=500000,
    batch_size=256,
    tau=0.013929154106819306,
    gamma=0.9843911115842067,
    train_freq=1,
    gradient_steps=8,
    ent_coef=0.001,
    verbose=1,
    tensorboard_log="./sac_HalfCheetah_tensorboard/",
    device=device,
    policy_kwargs={"net_arch": [256, 256, 128]},
)

# Evaluation environment: a single env with normalisation statistics frozen
eval_env = VecNormalize(
    DummyVecEnv([make_env()]),
    norm_obs=True,
    norm_reward=False,
    clip_obs=10.,
    training=False,
)

# Periodic evaluation (keeps the best model) and periodic checkpoints.
# NOTE(review): both frequencies are counted in callback calls, i.e. per
# vectorised step, so with NUM_ENVS parallel envs they presumably fire every
# 5000 * NUM_ENVS timesteps — confirm this is the intended cadence.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/best_model",
    log_path="./logs/",
    eval_freq=5000,
    deterministic=True,
    render=False,
)
checkpoint_callback = CheckpointCallback(
    save_freq=5000,
    save_path="./logs/checkpoints/",
    name_prefix="sac_halfcheetah_checkpoint",
)

# Create the model and train it
model = SAC(**model_params)
training_callbacks = CallbackList([eval_callback, checkpoint_callback])
model.learn(total_timesteps=1_500_000, callback=training_callbacks)

# Persist the trained model together with the normalisation statistics
model.save("sac_HalfCheetah_model")
env.save("vecnormalize_HalfCheetah.pkl")

# Reload the model and the normalisation statistics for evaluation
model = SAC.load("sac_HalfCheetah_model", device=device)
# eval_env is already a VecNormalize here; handing it to VecNormalize.load
# directly would stack a second normaliser on top and normalise observations
# twice. Load the saved statistics around the underlying vectorised env instead.
base_eval_env = eval_env.venv if isinstance(eval_env, VecNormalize) else eval_env
eval_env = VecNormalize.load("vecnormalize_HalfCheetah.pkl", base_eval_env)
# Freeze the running statistics so evaluation does not update them
eval_env.training = False
eval_env.reset()

# Evaluation helper
def evaluate_agent(model, env, episodes=100):
    """Evaluate ``model`` on ``env`` deterministically and print the result.

    Args:
        model: The trained agent passed to ``evaluate_policy``.
        env: The environment to evaluate on.
        episodes: Number of evaluation episodes.

    Returns:
        A ``(mean_reward, std_reward)`` tuple as reported by ``evaluate_policy``.
    """
    reward_stats = evaluate_policy(
        model,
        env,
        n_eval_episodes=episodes,
        deterministic=True,
    )
    mean_reward, std_reward = reward_stats
    print(f"Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")
    return mean_reward, std_reward

# Evaluate the trained model
mean_reward_trained, std_reward_trained = evaluate_agent(model, eval_env, episodes=100)


Using cpu device
Logging to ./sac_HalfCheetah_tensorboard/SAC_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -256     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 82       |
|    time_elapsed    | 48       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | 4.5      |
|    critic_loss     | 0.03     |
|    ent_coef        | 0.001    |
|    learning_rate   | 4.35e-05 |
|    n_updates       | 7792     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -78.8    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 85       |
|    time_elapsed    | 93       |
|    total_timesteps | 8000     |
| train/             |          |
|    actor_loss      | -5.09    |
|    critic_loss  

In [15]:
# 5. Save the model
model.save("sac_HalfCheetah_model")
env.save("vecnormalize_HalfCheetah.pkl")    # also save the normalisation parameters
