In [1]:

# Optional setup: uncomment the lines below to install the dependencies.
# Reference: https://github.com/DLR-RM/stable-baselines3/pull/780
# !pip install gymnasium
# !pip install 'gymnasium[mujoco]'
# !pip install matplotlib
# !pip3 install torch torchvision torchaudio
# !pip install "sb3_contrib>=2.0.0a1" --upgrade

In [2]:
from stable_baselines3 import TD3
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback
import gymnasium as gym

import os

import numpy as np
import time
# Timestamp taken once, when this cell runs (local time, YYYYMMDD-HHMMSS).
# It is used below to give each saved model a distinct file name.
_TIMESTAMP_FORMAT = "%Y%m%d-%H%M%S"
timestr = time.strftime(_TIMESTAMP_FORMAT)
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that saves the model whenever the mean training reward improves.

    Every ``check_freq`` calls, the results found in ``log_dir`` are loaded and
    the rewards of the last 100 episodes are averaged. When that mean exceeds
    the best value seen so far, the model is saved
    (in practice, we recommend using ``EvalCallback``).

    :param env_name: (str) Environment name. Combined with the module-level
      ``timestr`` timestamp, it forms the file name of the saved model.
    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Progress messages are printed when greater than 0.
    """

    def __init__(self, env_name: str, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        # Path without extension; the messages printed in ``_on_step`` report it as "<save_path>.zip".
        self.save_path = os.path.join(log_dir, f"{timestr}_{env_name}")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # The model is stored as a file next to the logs, so only the parent
        # folder has to exist. Creating a directory at ``save_path`` itself
        # would just leave an empty folder named after the model.
        parent_dir = os.path.dirname(self.save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def _on_step(self) -> bool:
        # Only look at the results every ``check_freq`` calls.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), "timesteps")
        if len(x) == 0:
            # Nothing has been logged yet, so there is nothing to compare.
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print(f"Num timesteps: {self.num_timesteps}")
            print(
                f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}"
            )

        # New best model: remember the score and save the agent.
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print(f"Saving new best model to {self.save_path}.zip")
            self.model.save(self.save_path)

        # Always return True so that this callback never stops training.
        return True

In [3]:

log_dir = "./tmp/gym/"
os.makedirs(log_dir, exist_ok=True)
env_name = "Ant-v4"

# Rendering a window on every environment step slows a long training run
# down considerably, so it is disabled by default.
# Set this to "human" to watch the agent while it trains.
render_mode = None
env = gym.make(env_name, render_mode=render_mode)
# Monitor is given log_dir, the same folder the callback below loads results from.
env = Monitor(env, log_dir)


from stable_baselines3 import PPO
from torch import nn

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(env_name=env_name, check_freq=1000, log_dir=log_dir)

# Hyperparameters copied from the following configuration:
# Ant-v3:
#   normalize: true
#   n_envs: 1
#   policy: 'MlpPolicy'
#   n_timesteps: !!float 1e7
#   batch_size: 32
#   n_steps: 512
#   gamma: 0.98
#   learning_rate: 1.90609e-05
#   ent_coef: 4.9646e-07
#   clip_range: 0.1
#   n_epochs: 10
#   gae_lambda: 0.8
#   max_grad_norm: 0.6
#   vf_coef: 0.677239
# NOTE(review): the configuration lists ``normalize: true`` but no
# normalization wrapper is applied to ``env`` here. Confirm this is intended.
ppo_hyperparams = dict(
    batch_size=32,
    n_steps=512,
    gamma=0.98,
    learning_rate=1.90609e-05,
    ent_coef=4.9646e-07,
    clip_range=0.1,
    n_epochs=10,
    gae_lambda=0.8,
    max_grad_norm=0.6,
    vf_coef=0.677239,
)
# ``learn`` expects an integer number of timesteps (1e7 is a float).
total_timesteps = 10_000_000

model = PPO("MlpPolicy", env, **ppo_hyperparams)
model.learn(total_timesteps=total_timesteps, callback=callback)

Num timesteps: 1000
Best mean reward: -inf - Last mean reward per episode: -14.55
Saving new best model to ./tmp/gym/20230417-224635_Ant-v4.zip
Num timesteps: 2000
Best mean reward: -14.55 - Last mean reward per episode: -198.07
Num timesteps: 3000
Best mean reward: -14.55 - Last mean reward per episode: -120.43
Num timesteps: 4000
Best mean reward: -14.55 - Last mean reward per episode: -98.88
Num timesteps: 5000
Best mean reward: -14.55 - Last mean reward per episode: -86.56
Num timesteps: 6000
Best mean reward: -14.55 - Last mean reward per episode: -104.01
Num timesteps: 7000
Best mean reward: -14.55 - Last mean reward per episode: -112.26
Num timesteps: 8000
Best mean reward: -14.55 - Last mean reward per episode: -103.50
Num timesteps: 9000
Best mean reward: -14.55 - Last mean reward per episode: -113.41
Num timesteps: 10000
Best mean reward: -14.55 - Last mean reward per episode: -119.85
Num timesteps: 11000
Best mean reward: -14.55 - Last mean reward per episode: -111.31
Num ti

: 

: 

: 

: 

: 

: 

: 

: 