---
# Bonus Part 🔥 Reinforcement Learning with a framework.
---

So far, we implemented two widely used Reinforcement Learning algorithms from scratch using JAX and Haiku.
In this tutorial, we will provide a more convenient way to train RL agents on various tasks, using a popular framework.
We will be using stable-baselines3, a set of reliable implementations of reinforcement learning algorithms in PyTorch.

In [None]:
# Installation
!apt-get update && apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
!pip install "stable-baselines3[extra]>=2.0.0a4"

In [None]:
# Imports
import warnings
warnings.filterwarnings('ignore')
import gymnasium as gym
import numpy as np
import os
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.ppo import MlpPolicy, CnnPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

import torchvision
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
import torch



# Some helper functions for visualization

In [3]:
# Rendering needs a display: launch a virtual framebuffer (Xvfb) in the
# background as display :1, then point DISPLAY at it. Without this, rendering will fail.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ["DISPLAY"] = ":1"

In [4]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay


def show_videos(video_path="", prefix=""):
    """
    Embed every matching ``.mp4`` file inline in the notebook output.

    Taken from https://github.com/eleurent/highway-env

    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Filter the videos, showing only the ones whose file name starts with this prefix
    """
    video_tag = """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>"""
    pattern = "{}*.mp4".format(prefix)
    # Each clip is inlined as a base64 data URI, so the rendered HTML carries
    # the video bytes itself instead of linking to the file.
    tags = [
        video_tag.format(clip, base64.b64encode(clip.read_bytes()).decode("ascii"))
        for clip in Path(video_path).glob(pattern)
    ]
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))

In [5]:
def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    Roll out ``model`` in a freshly created environment and save the run as a video.

    :param env_id: (str) Id of the environment to create for the rollout
    :param model: (RL model) Agent whose ``predict`` method chooses the actions
    :param video_length: (int) Number of environment steps to record
    :param prefix: (str) Name prefix of the saved video file
    :param video_folder: (str) Folder the video is written to
    """
    # Build the environment from ``env_id`` itself, not from a notebook-level
    # global, so the function records whichever environment the caller asks for.
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    # Start the video at step=0 and record ``video_length`` steps
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )

    try:
        obs = eval_env.reset()
        for _ in range(video_length):
            action, _state = model.predict(obs)
            obs, _, _, _ = eval_env.step(action)
    finally:
        # Close the video recorder even if the rollout raised, so the
        # environment and any partially written video are released.
        eval_env.close()

# Create a Gymnasium environment and instantiate our agent.
Stable-Baselines3 provides several RL algorithms for experimenting.
We will use PPO on the classic CartPole task in our first experiment, but feel free to uncomment another algorithm and observe the differences between them.

In [6]:
rom = "CartPole-v1"
env = gym.make(rom, render_mode="rgb_array")

# Every PPO hyperparameter is spelled out explicitly so that each one is
# easy to find and tweak when experimenting.
model = PPO(
    "MlpPolicy",
    env,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=None,
    normalize_advantage=True,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    target_kl=None,
    stats_window_size=100,
    tensorboard_log=None,
    policy_kwargs=None,
    verbose=0,
    seed=None,
    device="auto",
    _init_setup_model=True,
)
# Alternatives to try (A2C has no batch_size/clip_range arguments, and CartPole's
# vector observations need 'MlpPolicy' rather than 'CnnPolicy'):
#model = A2C(policy='MlpPolicy', env=env, verbose=2, ent_coef=0.1)
#model = DQN(policy='MlpPolicy', env=env, verbose=2, batch_size=256)

Print the model's architecture.

In [None]:
# Leaving the policy as the cell's last expression makes the notebook display it.
model.policy

You can easily change the model's architecture. Feel free to experiment with different architectures and observe the performance!

In [8]:
# Custom network description: ReLU activations, with two hidden layers of
# 128 units each under both the "pi" and the "vf" entries of net_arch.
policy_kwargs = {
    "activation_fn": torch.nn.ReLU,
    "net_arch": {"pi": [128, 128], "vf": [128, 128]},
}

# model = PPO(policy='MlpPolicy', env=env, verbose=2, batch_size=256, clip_range=0.1, ent_coef=0.1, policy_kwargs=policy_kwargs)
# print(model.policy)


In [None]:
# Use a separate environment for evaluation
eval_env = gym.make(rom, render_mode="rgb_array")

# Baseline: mean reward of the agent before any training, over 10 episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [10]:
# Train the agent for 50,000 timesteps
model.learn(total_timesteps=50_000)

<stable_baselines3.ppo.ppo.PPO at 0x7bdf9c257430>

In [11]:
# Evaluate the trained agent over 10 episodes
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:500.00 +/- 0.00


Visualize the trained agent in action!

In [12]:
# Record 500 steps of the trained agent; the file name starts with "ppo".
record_video(env_id=rom, model=model, video_length=500, prefix="ppo")

Saving video to /content/videos/ppo-step-0-to-step-500.mp4
Moviepy - Building video /content/videos/ppo-step-0-to-step-500.mp4.
Moviepy - Writing video /content/videos/ppo-step-0-to-step-500.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/ppo-step-0-to-step-500.mp4


In [13]:
# Display the recorded "ppo" videos from the videos folder inline.
show_videos(video_path="videos", prefix="ppo")

You can also specify more complex architectures. Here is an example of how to use a pretrained ResNet as the feature extractor:
(Note that CnnPolicy is the policy used for image inputs by default.)

In [14]:
# Load a pretrained ResNet-18, then rebuild it without its last child module
# so that it can serve as a feature extractor.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` - confirm against the installed version before changing it.
fe = torchvision.models.resnet18(pretrained=True, progress=True)
backbone_layers = list(fe.children())
fe = nn.Sequential(*backbone_layers[:-1])

class CustomCNN(BaseFeaturesExtractor):
    """
    Features extractor that feeds observations through the module-level ``fe``
    backbone and projects the result to ``features_dim`` features.

    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)
        self.cnn = fe
        # Size the projection from ``features_dim`` so the output width always
        # matches the value declared to the base class (512 is the input width
        # this layer expects from the backbone).
        self.linear = nn.Sequential(nn.Linear(512, features_dim), nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """
        :param observations: (torch.Tensor) Batch of observations
        :return: (torch.Tensor) One row of ``features_dim`` features per observation
        """
        # Flatten everything except the batch axis. A bare squeeze() followed by
        # unsqueeze(0) only gives the right shape when the batch holds a single
        # observation; larger batches would come out as (1, batch, features).
        backbone_out = self.cnn(observations)
        return self.linear(torch.flatten(backbone_out, start_dim=1))

In [15]:
# Plug the custom extractor into the policy and ask it for 128 features.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": 128},
}

Feel free to experiment with different algorithms and hyperparameters, as well as with the various environments available through Gymnasium, such as the Atari games.