In [1]:
from IPython.display import clear_output

## Set up a virtual display

MuJoCo rendering requires a display (even if we only want to generate `rgb_array` frames). A Colab notebook does not have one, so we create a virtual display.

In [2]:
%%capture
!apt install -y python3-opengl
!apt install -y ffmpeg
!apt install -y xvfb
!pip3 install pyvirtualdisplay

clear_output()

In [3]:
from pyvirtualdisplay import Display

# Start a 1400x900 virtual display that stays hidden (visible=0).
# The handle is called `virtual_display` so that it does not shadow IPython's
# built-in `display()` helper, which notebooks get in their namespace for free.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7c5786441780>

In [4]:
# Install Gymnasium (with its MuJoCo extra) and Stable-Baselines3 into the kernel's environment.
%pip install gymnasium[mujoco] stable-baselines3

# Remove the installer log from the cell output.
clear_output()

In [None]:
# %pip install numpy matplotlib

# Content

In this demo we will use the Soft Actor-Critic (SAC) algorithm to solve the **"Pusher"** environment. We will use Stable-Baselines3's implementation of SAC.

In the Pusher environment, a robotic arm needs to learn to push an object on the table from its starting position towards a goal position. More info can be found [here](https://gymnasium.farama.org/environments/mujoco/pusher/).

![Pusher Image](https://gymnasium.farama.org/_images/pusher.gif)

In [5]:
import numpy as np

import gymnasium as gym

from stable_baselines3 import PPO, SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import NormalActionNoise

from IPython.display import clear_output

import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

In [6]:
def frames_to_video(frames, fps=24):
    """Render a sequence of image frames as an inline HTML5 video.

    Args:
        frames: Non-empty sequence of image arrays. Frames whose ``shape`` has
            two entries are drawn with the gray colormap; anything else is
            passed to ``imshow`` as-is. The figure size (in pixels) is taken
            from the first frame.
        fps: Playback speed in frames per second; must be positive.

    Returns:
        An ``IPython.display.HTML`` object wrapping the encoded animation.

    Raises:
        ValueError: If ``frames`` is empty or ``fps`` is not positive.
    """
    # Validate up front so callers get a clear message instead of an
    # IndexError / ZeroDivisionError from deeper in the function.
    if len(frames) == 0:
        raise ValueError("frames_to_video() needs at least one frame")
    if fps <= 0:
        raise ValueError("fps must be positive")

    first = frames[0]
    height, width = first.shape[0], first.shape[1]
    # With dpi=100, a (width/100, height/100) inch figure matches the frame size.
    fig = plt.figure(figsize=(width / 100, height / 100), dpi=100)
    ax = fig.add_subplot()
    ax.set_axis_off()

    # The colormap is chosen once, when the image artist is created.
    # AxesImage.set_data() only takes the array, so passing cmap= to it
    # (as the per-frame updates used to do for grayscale) raises TypeError.
    is_grayscale = len(first.shape) == 2
    im = ax.imshow(first, cmap='gray') if is_grayscale else ax.imshow(first)

    def init():
        im.set_data(first)
        return im,

    def update(frame):
        # `frame` is the integer index supplied by FuncAnimation(frames=len(frames)).
        im.set_data(frames[frame])
        return im,

    interval = 1000 / fps  # delay between frames in milliseconds
    anim = FuncAnimation(fig, update, frames=len(frames), init_func=init, blit=True, interval=interval)
    # Close this specific figure (not just "the current one") so the notebook
    # does not also display it as a static image next to the video.
    plt.close(fig)
    return HTML(anim.to_html5_video())

  and should_run_async(code)


## Creating the environment

We'll use vectorized environments for faster training

In [7]:
def make_env(render_mode=None):
    """Build a single ``Pusher-v4`` Gymnasium environment.

    ``render_mode`` is forwarded unchanged to ``gym.make``.
    """
    pusher_env = gym.make('Pusher-v4', render_mode=render_mode)
    return pusher_env

In [8]:
# Build the vectorized training environment from four copies of the factory.
# Adjust the number of copies according to the available RAM.
env = DummyVecEnv([make_env] * 4)
num_actions = env.action_space.shape[0]

In [9]:
# Show the action space exposed by the vectorized environment.
env.action_space

Box(-2.0, 2.0, (7,), float32)

In [10]:
# Show the observation space exposed by the vectorized environment.
env.observation_space

Box(-inf, inf, (23,), float64)

## Training the model

In [11]:
# Mean and standard deviation of the exploration noise,
# with one entry per action dimension.
noise_mean = np.zeros(num_actions)
noise_std = np.full(num_actions, 0.1)

In [12]:
# SAC agent with an MLP policy, trained on `env` with additive action noise.
model = SAC(
    policy="MlpPolicy",
    env=env,
    action_noise=NormalActionNoise(mean=noise_mean, sigma=noise_std),
    verbose=0,
)

In [None]:
# Wipe this cell's previous output, then train for one million timesteps.
clear_output()
model.learn(total_timesteps=1_000_000, progress_bar=True)

## Testing the trained model's performance

In [None]:
# Evaluate the trained policy over 10 episodes on `env`.
# The second return value is given a real name instead of `_`: assigning `_`
# at notebook top level stops IPython from updating its last-output variable.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f}")

In [15]:
# Roll out one episode with the trained policy and record the rendered frames.
t_env = DummyVecEnv([lambda: make_env(render_mode="rgb_array")])
frames = []

try:
    state = t_env.reset()
    while True:
        # Render *before* stepping. A vectorized env resets itself as soon as an
        # episode ends, so rendering after the final step would capture the
        # first frame of the next episode instead of this one.
        frames.append(t_env.render())
        # Deterministic actions, consistent with evaluate_policy's default.
        # `_policy_state` avoids clobbering IPython's `_` last-output variable.
        action, _policy_state = model.predict(state, deterministic=True)
        state, reward, done, info = t_env.step(action)
        # `done` is an array with one entry per sub-environment; there is one here.
        if done[0]:
            break
finally:
    # Release the environment even if the rollout is interrupted or fails.
    t_env.close()

In [None]:
# Encode the recorded frames and display them as an inline video.
frames_to_video(frames)