# Stable Baselines

Stable Baselines is a set of high-quality implementations of reinforcement learning algorithms in Python. It is built on top of OpenAI Gym (Gymnasium) and provides a user-friendly interface for training, evaluating, and deploying RL agents. Stable Baselines offers a wide range of popular RL algorithms, including Proximal Policy Optimization (PPO), Deep Q-Networks (DQN), and Trust Region Policy Optimization (TRPO), among others. It aims to provide reliable and stable implementations that are extensively tested and documented, making it easier for researchers and practitioners to work with reinforcement learning.


In [None]:
# Print the versions of the installed libraries
import gym as gym
import stable_baselines3 as sb3
import torch
import numpy as np

# Report the version of every library this notebook relies on, one per line.
print(
    f"Gymnasium version: {gym.__version__}",
    f"Stable Baselines version : {sb3.__version__}",
    f"Torch version : {torch.__version__}",
    f"Numpy version : {np.__version__}",
    sep="\n",
)

Gymnasium version: 0.25.2
Stable Baselines version : 2.2.1
Torch version : 2.1.0+cu121
Numpy version : 1.23.5


In [None]:
# Mount Google Drive into the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import files

# Video management imports
import cv2

# Helper functions to save videos and images
def save_video(img_array, path='/content/video/test.mp4'):
    """Encode a sequence of RGB frames into an mp4 file at 15 fps.

    :param img_array: sequence of RGB frames (3-D arrays: height x width x
        channels). The video size is taken from the first frame.
    :param path: destination file. Missing parent directories are created.
    :return: None. The outcome is reported on stdout.
    """
    import os  # local import: the surrounding cell does not import os

    if len(img_array) == 0:
        print('No frames to save.')
        return

    # cv2.VideoWriter does not create directories; it just fails to open.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    height, width, _ = img_array[0].shape
    out = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*'mp4v'), 15, (width, height))
    if not out.isOpened():
        # Without this check every write below is a silent no-op and the
        # function would still claim success.
        out.release()
        print(f'Could not open a video writer for {path!r}; nothing was saved.')
        return

    try:
        for frame in img_array:
            # Frames are RGB, OpenCV expects BGR.
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        out.release()
    print('Video saved.')



def save_images(img_array, path='./images'):
    """Write every RGB frame as ``img_<index>.jpg`` inside ``path``.

    :param img_array: iterable of RGB frames.
    :param path: destination directory. It is created if it does not exist,
        because cv2.imwrite does not create directories itself.
    """
    import os  # local import: the surrounding cell does not import os

    if path:
        os.makedirs(path, exist_ok=True)
    for i, image in enumerate(img_array):
        # Frames are RGB, OpenCV expects BGR.
        bgr_img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(path, f'img_{i}.jpg'), bgr_img)

In [None]:
# Video management imports
import cv2

# Helper functions to save videos and images


def save_video(img_array, path='./video/test.mp4'):
    """Encode a sequence of RGB frames into an mp4 file at 15 fps.

    :param img_array: sequence of RGB frames (3-D arrays: height x width x
        channels). The video size is taken from the first frame.
    :param path: destination file. Missing parent directories are created.
    :return: None. The outcome is reported on stdout.
    """
    import os  # local import: the surrounding cell does not import os

    if len(img_array) == 0:
        print('No frames to save.')
        return

    # cv2.VideoWriter does not create directories; it just fails to open.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    height, width, _ = img_array[0].shape
    out = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*'mp4v'), 15, (width, height))
    if not out.isOpened():
        # Without this check every write below is a silent no-op and the
        # function would still claim success.
        out.release()
        print(f'Could not open a video writer for {path!r}; nothing was saved.')
        return

    try:
        for frame in img_array:
            # Frames are RGB, OpenCV expects BGR.
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        out.release()
    print('Video saved.')


def save_images(img_array, path='./images'):
    """Write every RGB frame as ``img_<index>.jpg`` inside ``path``.

    :param img_array: iterable of RGB frames.
    :param path: destination directory. It is created if it does not exist,
        because cv2.imwrite does not create directories itself.
    """
    import os  # local import: the surrounding cell does not import os

    if path:
        os.makedirs(path, exist_ok=True)
    for i, image in enumerate(img_array):
        # Frames are RGB, OpenCV expects BGR.
        bgr_img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(path, f'img_{i}.jpg'), bgr_img)

## Basic Training

To train an agent, we only need an algorithm object, an environment, and a call to the `learn` function.

In [None]:
from stable_baselines3 import A2C

# CartPole environment that renders frames as RGB arrays.
env = gym.make("CartPole-v1", render_mode="rgb_array")

# A2C agent with the default MLP policy, trained for 10k environment steps.
model = A2C(policy="MlpPolicy", env=env, learning_rate=1e-3, verbose=1)
model.learn(total_timesteps=10_000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 15.6     |
|    ep_rew_mean        | 15.6     |
| time/                 |          |
|    fps                | 154      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.669   |
|    explained_variance | 0.22     |
|    learning_rate      | 0.001    |
|    n_updates          | 99       |
|    policy_loss        | 2.15     |
|    value_loss         | 9.45     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 19.3     |
|    ep_rew_mean        | 19.3     |
| time/                 |          |
|    fps                | 230      |
|    iterations         | 200      |
|    time_elapsed

<stable_baselines3.a2c.a2c.A2C at 0x7aae2da43340>

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_a2c_cartpole.mp4')

Video saved.


## Custom network

In [None]:
from stable_baselines3 import PPO

# Separate actor (pi) and critic (vf) networks, each with two hidden layers
# of 64 units and ReLU activations.
# Note: SB3 adds one extra linear layer on top of each of the two networks.
policy_kwargs = {
    "activation_fn": torch.nn.ReLU,
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
}
# Build the PPO agent from the environment id
model = PPO(
    policy="MlpPolicy",
    env="CartPole-v1",
    policy_kwargs=policy_kwargs,
    verbose=1,
)
# Keep a handle on the environment held by the agent
env = model.get_env()
# Optimize for 30k environment steps
model.learn(total_timesteps=30_000)

Stable Baselines also allows saving the model to a file and reloading it later.

In [None]:
# Save the agent
# model.save("ppo_cartpole")
# del model
# the policy_kwargs are automatically loaded
# model = PPO.load("ppo_cartpole", env=env)

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_ppo_cartpole.mp4')

Video saved.


Let's try another environment: Acrobot.

In [None]:
# Separate actor (pi) and critic (vf) networks, each with two hidden layers
# of 64 units and ReLU activations.
# Note: SB3 adds one extra linear layer on top of each of the two networks.
policy_kwargs = {
    "activation_fn": torch.nn.ReLU,
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
}
# Build the PPO agent from the environment id
model = PPO(
    policy="MlpPolicy",
    env="Acrobot-v1",
    policy_kwargs=policy_kwargs,
    verbose=1,
)
# Keep a handle on the environment held by the agent
env = model.get_env()
# Optimize for 30k environment steps
model.learn(total_timesteps=30_000)

Let's see the results:

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    # SB3 agents expose `predict`; it is fed the latest observation.
    action, _state = model.predict(obs, deterministic=True)
    # A VecEnv step returns four values (obs, reward, done, info); there is
    # no separate `truncated` flag as in the Gymnasium API.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)
env.close()

# Make sure the target directory exists before the video is written.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_ppo_acrobot.mp4')

## Using a custom network with Pytorch

We can train our agent with a custom network. In this case we will create a simple module for an actor-critic network. We need an adapter class *CustomActorCriticPolicy* to adapt our network to the needs of the algorithm and the environment.

In [None]:
from typing import Callable, Dict, List, Optional, Tuple, Type, Union

from stable_baselines3.common.policies import ActorCriticPolicy
from gymnasium import spaces
import torch
from torch import nn

class CustomNetwork(nn.Module):
    """
    Two independent heads that turn an input vector into a policy latent and a value latent.
    Each head is a single linear layer followed by a ReLU.

    :param obs_space_size: (int) size of the input vector fed to both heads
    :param last_layer_dim_pi: (int) number of output units of the policy head
    :param last_layer_dim_vf: (int) number of output units of the value head
    """

    def __init__(
        self,
        obs_space_size: int,
        last_layer_dim_pi: int = 64,
        last_layer_dim_vf: int = 64,
    ):
        super().__init__()

        # Policy head (built first so parameter creation order is unchanged)
        actor_layers = [nn.Linear(obs_space_size, last_layer_dim_pi), nn.ReLU()]
        self.policy_net = nn.Sequential(*actor_layers)

        # Value head
        critic_layers = [nn.Linear(obs_space_size, last_layer_dim_vf), nn.ReLU()]
        self.value_net = nn.Sequential(*critic_layers)

        # IMPORTANT:
        # Output sizes of both heads, stored on the module for later use
        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf

    def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :return: (torch.Tensor, torch.Tensor) latent_policy, latent_value,
            computed by the policy head and the value head respectively
        """
        latent_pi = self.forward_actor(features)
        latent_vf = self.forward_critic(features)
        return latent_pi, latent_vf

    def forward_actor(self, features: torch.Tensor) -> torch.Tensor:
        """Run only the policy head."""
        return self.policy_net(features)

    def forward_critic(self, features: torch.Tensor) -> torch.Tensor:
        """Run only the value head."""
        return self.value_net(features)



In [None]:
import os

# Play one episode with the current agent and keep every rendered frame.
# NOTE(review): `model` is whichever agent was trained last before this cell
# ran; rename the output file to match it.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_latest_agent.mp4')

Video saved.


In [None]:
# The algorithm instantiates this class itself, passing in the adequate
# spaces and learning-rate schedule.
class CustomActorCriticPolicy(ActorCriticPolicy):
    """Actor-critic policy that uses ``CustomNetwork`` as its ``mlp_extractor``."""

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        lr_schedule: Callable[[float], float],
        *args,
        **kwargs,
    ):
        # Orthogonal initialization is always switched off, whatever was passed in.
        base_kwargs = {**kwargs, "ortho_init": False}
        # Everything else is forwarded untouched to the base class.
        super().__init__(observation_space, action_space, lr_schedule, *args, **base_kwargs)

    def _build_mlp_extractor(self) -> None:
        """Install a ``CustomNetwork`` sized by ``self.features_dim``."""
        extractor = CustomNetwork(self.features_dim)
        self.mlp_extractor = extractor


# PPO on CartPole, using the custom policy class instead of a policy name.
model = PPO(policy=CustomActorCriticPolicy, env="CartPole-v1", verbose=0)
# With verbose=0 the training progress is shown through a progress bar instead.
model.learn(total_timesteps=30_000, progress_bar=True)

# Exercises

1. Select an environment with a continuous action space from Gymnasium (not atari) and try to train it.


In [None]:
# The algorithm instantiates this class itself, passing in the adequate
# spaces and learning-rate schedule.
class CustomActorCriticPolicy(ActorCriticPolicy):
    """Actor-critic policy that uses ``CustomNetwork`` as its ``mlp_extractor``."""

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        lr_schedule: Callable[[float], float],
        *args,
        **kwargs,
    ):
        # Orthogonal initialization is always switched off, whatever was passed in.
        base_kwargs = {**kwargs, "ortho_init": False}
        # Everything else is forwarded untouched to the base class.
        super().__init__(observation_space, action_space, lr_schedule, *args, **base_kwargs)

    def _build_mlp_extractor(self) -> None:
        """Install a ``CustomNetwork`` sized by ``self.features_dim``."""
        extractor = CustomNetwork(self.features_dim)
        self.mlp_extractor = extractor


# PPO on the continuous MountainCar task, using the custom policy class.
model = PPO(policy=CustomActorCriticPolicy, env="MountainCarContinuous-v0", verbose=0)
# With verbose=0 the training progress is shown through a progress bar instead.
model.learn(total_timesteps=30_000, progress_bar=True)

Output()

<stable_baselines3.ppo.ppo.PPO at 0x7aad04fd3820>

In [None]:
from stable_baselines3 import PPO
import torch

# Separate actor (pi) and critic (vf) networks, each with two hidden layers
# of 64 units and ReLU activations.
# Note: SB3 adds one extra linear layer on top of each of the two networks.
policy_kwargs = {
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
    "activation_fn": torch.nn.ReLU,
}

# Hyperparameters under test in this run
ent_coef = 0.7
learning_rate = 3e-4

# Build the PPO agent from the environment id
model = PPO(
    policy="MlpPolicy",
    env="MountainCarContinuous-v0",
    learning_rate=learning_rate,
    ent_coef=ent_coef,
    policy_kwargs=policy_kwargs,
    verbose=1,
)

# Keep a handle on the environment held by the agent
env = model.get_env()

# Optimize for 200k environment steps
model.learn(total_timesteps=200_000)

Using cuda device
Creating environment from the given name 'MountainCarContinuous-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 999      |
|    ep_rew_mean     | -53.5    |
| time/              |          |
|    fps             | 832      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 999          |
|    ep_rew_mean          | -54.4        |
| time/                   |              |
|    fps                  | 647          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0116092395 |
|    clip_fraction      

<stable_baselines3.ppo.ppo.PPO at 0x7aad04fd14b0>

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_ppo_mountaincar_ent_0_7.mp4')

Video saved.


I've been struggling with this code for several days and haven't been able to solve the task. Sometimes the car does go backward to gain momentum, but it doesn't make it all the way up. When I increase the entropy coefficient, it does go up, but since it doesn't build up momentum, it doesn't reach the top. I understand that the goal is for it to go backward to gain speed and then climb up, but I haven't managed to find that balance. I've tried several architectures, even a custom one, but haven't gotten decent results. In the current simulation, it goes upward but never moves backward.

In [None]:
from stable_baselines3 import PPO
import torch

# Separate actor (pi) and critic (vf) networks, each with two hidden layers
# of 64 units and ReLU activations.
# Note: SB3 adds one extra linear layer on top of each of the two networks.
policy_kwargs = {
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
    "activation_fn": torch.nn.ReLU,
}

# Hyperparameters under test in this run
ent_coef = 0.8
learning_rate = 2e-4
clip_range = 0.5

# Build the PPO agent from the environment id
model = PPO(
    policy="MlpPolicy",
    env="MountainCarContinuous-v0",
    learning_rate=learning_rate,
    clip_range=clip_range,
    ent_coef=ent_coef,
    policy_kwargs=policy_kwargs,
    verbose=1,
)

# Keep a handle on the environment held by the agent
env = model.get_env()

# Optimize for 200k environment steps
model.learn(total_timesteps=200_000)


Using cuda device
Creating environment from the given name 'MountainCarContinuous-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 999      |
|    ep_rew_mean     | -53.4    |
| time/              |          |
|    fps             | 849      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 999         |
|    ep_rew_mean          | -53.8       |
| time/                   |             |
|    fps                  | 618         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006054082 |
|    clip_fraction        | 0.00308

<stable_baselines3.ppo.ppo.PPO at 0x7aad04fd0e80>

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_ppo_mountaincar_clip_0_5.mp4')

Video saved.


In [None]:
from stable_baselines3 import PPO
import torch

# Separate actor (pi) and critic (vf) networks, each with two hidden layers
# of 64 units and ReLU activations.
# Note: SB3 adds one extra linear layer on top of each of the two networks.
policy_kwargs = {
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
    "activation_fn": torch.nn.ReLU,
}

# Hyperparameters under test in this run
ent_coef = 0.8
learning_rate = 2e-4
clip_range = 0.2

# Build the PPO agent from the environment id
model = PPO(
    policy="MlpPolicy",
    env="MountainCarContinuous-v0",
    learning_rate=learning_rate,
    clip_range=clip_range,
    ent_coef=ent_coef,
    policy_kwargs=policy_kwargs,
    verbose=1,
)

# Keep a handle on the environment held by the agent
env = model.get_env()

# Optimize for 200k environment steps
model.learn(total_timesteps=200_000)

Using cuda device
Creating environment from the given name 'MountainCarContinuous-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 999      |
|    ep_rew_mean     | -51.4    |
| time/              |          |
|    fps             | 842      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 999         |
|    ep_rew_mean          | -53.3       |
| time/                   |             |
|    fps                  | 645         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004764479 |
|    clip_fraction        | 0.0176 

<stable_baselines3.ppo.ppo.PPO at 0x7aad04b90d00>

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_ppo_mountaincar_clip_0_2.mp4')

Video saved.


In [None]:
from stable_baselines3 import PPO
import torch

# Separate actor (pi) and critic (vf) networks, each with two hidden layers
# of 64 units and ReLU activations.
# Note: SB3 adds one extra linear layer on top of each of the two networks.
policy_kwargs = {
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
    "activation_fn": torch.nn.ReLU,
}

# Hyperparameters under test in this run
ent_coef = 0.8
learning_rate = 2e-4
clip_range = 0.8

# Build the PPO agent from the environment id
model = PPO(
    policy="MlpPolicy",
    env="MountainCarContinuous-v0",
    learning_rate=learning_rate,
    clip_range=clip_range,
    ent_coef=ent_coef,
    policy_kwargs=policy_kwargs,
    verbose=1,
)

# Keep a handle on the environment held by the agent
env = model.get_env()

# Optimize for 200k environment steps
model.learn(total_timesteps=200_000)

Using cuda device
Creating environment from the given name 'MountainCarContinuous-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 999      |
|    ep_rew_mean     | -50.9    |
| time/              |          |
|    fps             | 845      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 999        |
|    ep_rew_mean          | -53        |
| time/                   |            |
|    fps                  | 636        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00519888 |
|    clip_fraction        | 0.000391   |
|    

<stable_baselines3.ppo.ppo.PPO at 0x7aad04b91d20>

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_ppo_mountaincar_clip_0_8.mp4')

Video saved.


In [None]:
from stable_baselines3 import PPO
import torch

# Separate actor (pi) and critic (vf) networks, each with two hidden layers
# of 64 units and ReLU activations.
# Note: SB3 adds one extra linear layer on top of each of the two networks.
policy_kwargs = {
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
    "activation_fn": torch.nn.ReLU,
}

# Hyperparameters under test in this run
ent_coef = 0.8
learning_rate = 2e-4
clip_range = 2

# Build the PPO agent from the environment id
model = PPO(
    policy="MlpPolicy",
    env="MountainCarContinuous-v0",
    learning_rate=learning_rate,
    clip_range=clip_range,
    ent_coef=ent_coef,
    policy_kwargs=policy_kwargs,
    verbose=1,
)

# Keep a handle on the environment held by the agent
env = model.get_env()

# Optimize for 200k environment steps
model.learn(total_timesteps=200_000)

Using cuda device
Creating environment from the given name 'MountainCarContinuous-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 999      |
|    ep_rew_mean     | -52.7    |
| time/              |          |
|    fps             | 838      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 999          |
|    ep_rew_mean          | -53.7        |
| time/                   |              |
|    fps                  | 620          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0047132466 |
|    clip_fraction      

<stable_baselines3.ppo.ppo.PPO at 0x7aad04b93f40>

In [None]:
import os

# Play one episode with the trained agent and keep every rendered frame.
vec_env = model.get_env()
obs = vec_env.reset()
imgs = []
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    # Notice the difference with the Gymnasium API: a VecEnv step returns
    # four values (obs, reward, done, info) instead of five.
    obs, reward, done, info = vec_env.step(action)
    img = vec_env.render("rgb_array")
    imgs.append(img)

env.close()

# An empty path gives cv2.VideoWriter no file to write to, so use a real one.
os.makedirs('video', exist_ok=True)
save_video(imgs, path='video/sb3_ppo_mountaincar_clip_2.mp4')

Video saved.
