In [1]:
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np
import os

class MultiSeedEvalCallback(BaseCallback):
    """
    Custom callback that evaluates the model on multiple seeds and averages the results.
    """
    def __init__(
        self,
        eval_env,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        n_eval_seeds: int = 3,
        log_path: str = None,
        best_model_save_path: str = None,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(verbose=verbose)
        self.eval_env = eval_env
        self.n_eval_episodes = n_eval_episodes
        self.eval_freq = eval_freq
        self.n_eval_seeds = n_eval_seeds
        self.best_model_save_path = best_model_save_path
        self.log_path = log_path
        self.deterministic = deterministic
        self.best_mean_reward = -np.inf
        
        if self.log_path is not None:
            os.makedirs(log_path, exist_ok=True)
            self.log_file = open(os.path.join(log_path, "eval_logs.txt"), "w")
        
        if self.best_model_save_path is not None:
            os.makedirs(best_model_save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Evaluate on multiple seeds
            seed_rewards = []
            for seed in range(self.n_eval_seeds):
                rewards = []
                for _ in range(self.n_eval_episodes):
                    obs = self.eval_env.reset()
                    self.eval_env.seed(seed)
                    done = False
                    episode_reward = 0.0
                    while not done:
                        action, _ = self.model.predict(obs, deterministic=self.deterministic)
                        obs, reward, done, info = self.eval_env.step(action)
                        episode_reward += reward
                    rewards.append(episode_reward)
                
                seed_mean_reward = np.mean(rewards)
                seed_rewards.append(seed_mean_reward)
            
            # Calculate average across seeds
            mean_reward = np.mean(seed_rewards)
            std_reward = np.std(seed_rewards)
            
            if self.verbose > 0:
                print(f"Eval num_timesteps={self.num_timesteps}, mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            
            # Log to file if path provided
            if self.log_path is not None:
                self.log_file.write(f"{self.num_timesteps},{mean_reward},{std_reward}\n")
                self.log_file.flush()
            
            # Save best model
            if mean_reward > self.best_mean_reward and self.best_model_save_path is not None:
                self.best_mean_reward = mean_reward
                if self.verbose > 0:
                    print(f"New best mean reward: {mean_reward:.2f} - Saving model")
                self.model.save(os.path.join(self.best_model_save_path, "best_model"), include = ["env"])
        
        return True

    def _on_training_end(self) -> None:
        if self.log_path is not None:
            self.log_file.close()

In [None]:
import gymnasium as gym
import numpy as np
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
import matplotlib.pyplot as plt
from stable_baselines3 import SAC
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
import os
import torch
import warnings
from torch.utils.tensorboard import SummaryWriter

warnings.filterwarnings('ignore')

# Define the environment names
ENV_NAMES = {
    'pendulum': 'Pendulum-v1',
    'mountaincar': 'MountainCarContinuous-v0'
}

# 1. Define the evaluation function
def evaluate_model(model, env, n_episodes=10):
    """
    Evaluate a RL model
    :param model: (BaseAlgorithm) the RL agent
    :param env: (gym.Env) the gym environment
    :param n_episodes: (int) number of episodes to evaluate
    :return: (float) mean reward
    """
    rewards = []
    for i in range(3):
        obs = env.reset()
        env.seed(i)
        episode_rewards = []

        for j in range(n_episodes):
            obs = env.reset()
            done = False
            episode_reward = 0
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated = env.step(action)
                done = terminated or truncated
                episode_reward += reward
            episode_rewards.append(episode_reward)
        rewards.append(np.mean(episode_rewards))
    return np.mean(rewards)


# Objective function for optimization
def objective(trial):
    # Sample hyperparameters
    hyperparams = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]),
        'buffer_size': trial.suggest_categorical('buffer_size', [int(1e3), int(1e4), int(1e5), int(1e6), int(1e7)]),
        'learning_starts': trial.suggest_categorical('learning_starts', [100, 500, 1000, 5000]),
        'train_freq': trial.suggest_categorical('train_freq', [1, 2, 5, 10]),
        'gradient_steps': trial.suggest_int('gradient_steps', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999),
        'tau': trial.suggest_float('tau', 0.001, 0.02), 
        'ent_coef':  trial.suggest_float('learning_rate', 1e-3, 1e2, log=True),
        'policy_kwargs': {
            'net_arch': trial.suggest_categorical('net_arch', [
                [64, 64], 
                [256, 256],
                [400, 300]
            ])
        }
    }
    env = DummyVecEnv([lambda: Monitor(gym.make(env_name))])

    # Add noise configuration
    noise_type = trial.suggest_categorical('noise_type', ['normal', 'ou'])
    if noise_type != 'none':
        noise_std = trial.suggest_float('noise_std', 0.2, 0.7)
        if noise_type == 'normal':
            action_noise = NormalActionNoise(mean=np.zeros(env.action_space.shape), 
                                           sigma=noise_std * np.ones(env.action_space.shape))
        else:
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.action_space.shape), 
                                                       sigma=noise_std * np.ones(env.action_space.shape))
    else:
        action_noise = None

    # Create environment
    print(f"Environment: {env_name}")
    # Create model
    model = SAC(
        "MlpPolicy", 
        env, 
        verbose=0, 
        action_noise=action_noise,
        tensorboard_log=f"./logs/SAC_{env_name}/trial_{trial.number}",
        **hyperparams
    )

    # Train with evaluation callback
    eval_callback = MultiSeedEvalCallback(
        eval_env=env,
        n_eval_episodes=5,          # Number of episodes per seed
        eval_freq=10000,             
        n_eval_seeds=3,             # Number of different seeds to test
        log_path=f"./logs/SAC_{env_name}/trial_{trial.number}/",
        best_model_save_path=f"./logs/SAC_{env_name}/trial_{trial.number}/",
        deterministic=True,
        verbose=1
    )
    
    try:
        if env_name == 'Pendulum-v1':
            total_timesteps = 30000
        else:
            total_timesteps = 300000
        model.learn(total_timesteps=total_timesteps, callback=eval_callback)
        mean_reward = evaluate_model(model, env, n_episodes=10)
    except Exception as e:
        print(f"Trial failed: {e}")
        mean_reward = -float('inf')
    
    del model
    env.close()
    return mean_reward

# Run the optimization
def optimize_hyperparams(env_name, n_trials=50):
    
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner()
    )
    
    study.optimize(objective, n_trials=n_trials, n_jobs=4) 
    
    print("Best trial:")
    trial = study.best_trial
    print(f"Value: {trial.value}")
    print("Params:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Save best parameters
    with open(f"best_params_sac_{env_name}.txt", "w") as f:
        f.write(str(trial.params))
    
    # Visualizations
    fig1 = plot_optimization_history(study)
    fig1.show()
    fig1.write_image(f"sac_{env_name}_optimization_history.png")
    
    fig2 = plot_param_importances(study)
    fig2.show()
    fig2.write_image(f"sac_{env_name}_param_importances.png")
    
    return study

if __name__ == "__main__":
    # Select environment ('pendulum' or 'mountaincar')
    env_keys = ['pendulum']
    for i, key in enumerate(env_keys):
        global env_name
        env_name = ENV_NAMES[key]
        # Create directories
        os.makedirs(f"./logs/sac_{env_name}", exist_ok=True)
        
        # Run hyperparameter optimization
        print(f"Optimizing SAC for {env_name}...")
        study = optimize_hyperparams(env_name, n_trials=5)
        print("Optimization completed.")

[I 2025-05-06 10:03:11,144] A new study created in memory with name: no-name-9deb915d-3f6f-4fa3-bf3c-d2a9b8cf9867


Optimizing TD3 for Pendulum-v1...
Environment: Pendulum-v1
Environment: Pendulum-v1
Environment: Pendulum-v1
Environment: Pendulum-v1
Eval num_timesteps=10000, mean_reward=-88.85 +/- 44.61
New best mean reward: -88.85 - Saving model
Eval num_timesteps=10000, mean_reward=-95.37 +/- 40.75
New best mean reward: -95.37 - Saving model
Eval num_timesteps=20000, mean_reward=-87.46 +/- 44.28
New best mean reward: -87.46 - Saving model
Eval num_timesteps=20000, mean_reward=-83.15 +/- 41.75
New best mean reward: -83.15 - Saving model
Eval num_timesteps=30000, mean_reward=-87.33 +/- 43.94
New best mean reward: -87.33 - Saving model


[I 2025-05-06 11:13:00,766] Trial 2 finished with value: -2.6438817977905273 and parameters: {'learning_rate': 0.002039513527229315, 'batch_size': 128, 'buffer_size': 10000, 'learning_starts': 5000, 'train_freq': 2, 'gradient_steps': 4, 'gamma': 0.9136305378395405, 'tau': 0.01414503777558778, 'net_arch': [400, 300], 'noise_type': 'normal', 'noise_std': 0.40907046476778053}. Best is trial 2 with value: -2.6438817977905273.


Environment: Pendulum-v1
Eval num_timesteps=30000, mean_reward=-81.67 +/- 41.57
New best mean reward: -81.67 - Saving model


[I 2025-05-06 11:15:58,703] Trial 0 finished with value: -2.6438796520233154 and parameters: {'learning_rate': 0.005415860094845779, 'batch_size': 64, 'buffer_size': 1000000, 'learning_starts': 500, 'train_freq': 2, 'gradient_steps': 4, 'gamma': 0.9798185428981724, 'tau': 0.014798043341036942, 'net_arch': [256, 256], 'noise_type': 'normal', 'noise_std': 0.6593995681697091}. Best is trial 0 with value: -2.6438796520233154.


Eval num_timesteps=10000, mean_reward=-92.04 +/- 51.48
New best mean reward: -92.04 - Saving model
Eval num_timesteps=10000, mean_reward=-90.50 +/- 51.27
New best mean reward: -90.50 - Saving model
Eval num_timesteps=20000, mean_reward=-82.91 +/- 42.01
New best mean reward: -82.91 - Saving model
Eval num_timesteps=30000, mean_reward=-83.43 +/- 42.17


[I 2025-05-06 11:26:32,773] Trial 4 finished with value: -2.6438519954681396 and parameters: {'learning_rate': 0.0015011425163337368, 'batch_size': 64, 'buffer_size': 1000, 'learning_starts': 1000, 'train_freq': 10, 'gradient_steps': 4, 'gamma': 0.9377957909309438, 'tau': 0.01146343586128969, 'net_arch': [400, 300], 'noise_type': 'ou', 'noise_std': 0.2478839960493613}. Best is trial 4 with value: -2.6438519954681396.


Eval num_timesteps=10000, mean_reward=-94.38 +/- 44.08
New best mean reward: -94.38 - Saving model
Eval num_timesteps=20000, mean_reward=-81.81 +/- 41.30
New best mean reward: -81.81 - Saving model
Eval num_timesteps=20000, mean_reward=-88.70 +/- 43.83
New best mean reward: -88.70 - Saving model
Eval num_timesteps=30000, mean_reward=-82.18 +/- 41.52


[I 2025-05-06 12:21:01,959] Trial 1 finished with value: -2.6434738636016846 and parameters: {'learning_rate': 0.000826917816406134, 'batch_size': 128, 'buffer_size': 10000000, 'learning_starts': 1000, 'train_freq': 1, 'gradient_steps': 7, 'gamma': 0.9819490449687415, 'tau': 0.010093815997082409, 'net_arch': [256, 256], 'noise_type': 'normal', 'noise_std': 0.4909612653800597}. Best is trial 1 with value: -2.6434738636016846.
[I 2025-05-06 12:29:08,403] Trial 3 finished with value: -2.643876791000366 and parameters: {'learning_rate': 0.00010218979577825981, 'batch_size': 256, 'buffer_size': 10000, 'learning_starts': 500, 'train_freq': 1, 'gradient_steps': 10, 'gamma': 0.9046742020585959, 'tau': 0.0021434351417135744, 'net_arch': [64, 64], 'noise_type': 'normal', 'noise_std': 0.4610734653203645}. Best is trial 1 with value: -2.6434738636016846.


Eval num_timesteps=30000, mean_reward=-92.96 +/- 43.38
Best trial:
Value: -2.6434738636016846
Params:
    learning_rate: 0.000826917816406134
    batch_size: 128
    buffer_size: 10000000
    learning_starts: 1000
    train_freq: 1
    gradient_steps: 7
    gamma: 0.9819490449687415
    tau: 0.010093815997082409
    net_arch: [256, 256]
    noise_type: normal
    noise_std: 0.4909612653800597


Optimization completed.


In [5]:
algo = "SAC"

In [6]:
def plot_reward_over_steps(study, algo):
    """
    Plot reward over optimization steps
    """
    fig = plot_optimization_history(study)
    fig.update_layout(title=f'{algo} Optimization History')
    fig.show()

In [7]:
import json

# Load the best hyperparameters

best_hyperparams = study.best_trial.params

# Create the environment
env = DummyVecEnv([lambda: Monitor(gym.make("Pendulum-v1"))])

# Add noise configuration
noise_type = best_hyperparams.pop('noise_type')
noise_std = best_hyperparams.pop('noise_std')
if noise_type == 'normal':
    action_noise = NormalActionNoise(mean=np.zeros(1), sigma=noise_std * np.ones(1))
else:
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1), sigma=noise_std * np.ones(1))

# Train the model
policy_kwargs = {"net_arch": best_hyperparams.pop('net_arch')}
best_model = SAC("MlpPolicy", env, action_noise=action_noise, policy_kwargs=policy_kwargs, **best_hyperparams, verbose=1)
best_model.learn(total_timesteps=100000)

# Save the model
best_model.save(f"./logs/{algo}/best_model.zip")
print("Best model trained and saved.")

Using cpu device
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.09e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 3265      |
|    time_elapsed    | 0         |
|    total_timesteps | 800       |
----------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -949     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 21       |
|    time_elapsed    | 75       |
|    total_timesteps | 1600     |
| train/             |          |
|    actor_loss      | 83.6     |
|    critic_loss     | 2.25     |
|    ent_coef        | 0.256    |
|    ent_coef_loss   | -0.00648 |
|    learning_rate   | 0.000827 |
|    n_updates       | 4193     |
---------------------------------
---------------------------------
| rollout/           

In [12]:
import gymnasium as gym
from stable_baselines3 import SAC

# 1. Load the best model
algo = 'SAC'  # Make sure this matches your algorithm name
best_model = SAC.load(f"./logs/{algo}/best_model.zip")

# 2. Create the environment
env = gym.make("Pendulum-v1", render_mode="human")  # Add render_mode here

# 3. Run the model
try:
    obs, _ = env.reset()  # New Gymnasium API returns (obs, info)
    done = False
    total_reward = 0
    
    while not done:
        action, _ = best_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(action)  # 5 returns in Gymnasium
        done = terminated or truncated
        total_reward += reward
        
    print(f"Total reward: {total_reward:.2f}")

finally:
    # 4. Clean up
    env.close()
    

Total reward: -231.86


In [14]:
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

def make_env(env_id, render_mode=None):
    def _init():
        env = gym.make(env_id, render_mode=render_mode)
        env = Monitor(env)
        return env
    return _init

# Create environment
env = DummyVecEnv([make_env("Pendulum-v1", render_mode="human")])

try:
    # Option 2: Load without verification then set env
    model = SAC.load(f"./logs/{algo}/best_model.zip")
    model.set_env(env)
    
    # Run the model
    obs = env.reset()
    done = False
    total_reward = 0
    
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        env.render()
        
    print(f"Total reward: {total_reward}")

finally:
    env.close()

Total reward: [-129.43578]
