In [1]:
!apt install swig cmake ffmpeg xvfb python3-opengl
!pip install stable-baselines3==2.0.0a5 gymnasium[box2d] huggingface_sb3 pyvirtualdisplay imageio[ffmpeg]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-opengl is already the newest version (3.1.5+dfsg-1).
swig is already the newest version (4.0.2-1ubuntu1).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
xvfb is already the newest version (2:21.1.4-2ubuntu1.7~22.04.8).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
import os
os.kill(os.getpid(), 9)

In [1]:
from pyvirtualdisplay import Display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7e7ab4876bc0>

In [2]:
import gymnasium as gym
env = gym.make("BipedalWalker-v3", hardcore=True)
env.reset()

(array([ 2.7475406e-03, -2.0650282e-06,  1.6065684e-04, -1.5999984e-02,
         9.2122972e-02, -2.1201676e-04,  8.6017501e-01,  1.5599747e-03,
         1.0000000e+00,  3.2527719e-02, -2.1199802e-04,  8.5372543e-01,
         1.4993631e-04,  1.0000000e+00,  4.4081393e-01,  4.4582003e-01,
         4.6142268e-01,  4.8955008e-01,  5.3410268e-01,  6.0246092e-01,
         7.0914876e-01,  8.8593167e-01,  1.0000000e+00,  1.0000000e+00],
       dtype=float32),
 {})

In [3]:
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", env.observation_space.shape)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

Observation Space Shape (24,)
Sample observation [-2.7890325  -0.36153117 -3.5301142  -0.3197522   2.9382448   1.3352026
 -1.7200413  -1.4780458   2.2080905  -1.9223825   1.6878897  -0.856083
 -3.0281463   0.09583899 -0.48850304  0.20496143  0.00905617 -0.13050555
 -0.09322756  0.869291   -0.29302204  0.12171899 -0.79148537  0.7153006 ]


In [4]:
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.shape)
print("Action Space Sample", env.action_space.sample())


 _____ACTION SPACE_____ 

Action Space Shape (4,)
Action Space Sample [ 0.33917123 -0.9469362   0.5420178   0.23178801]


In [5]:
from stable_baselines3.common.env_util import make_vec_env
env = make_vec_env('BipedalWalker-v3', n_envs=1)

In [6]:
from stable_baselines3 import DDPG  # Change this line
model = DDPG(  # Change this line
    policy = 'MlpPolicy',
    env = env,
    learning_rate=1e-3,  # Example hyperparameters for DDPG
    buffer_size=100000,
    batch_size=64,
    learning_starts=10000,
    tau=0.005,
    gamma=0.99,
    verbose=1
)

  and should_run_async(code)


Using cuda device


In [7]:
from wasabi import Printer
import numpy as np
from stable_baselines3.common.base_class import BaseAlgorithm
from pathlib import Path
import tempfile
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import (
    DummyVecEnv,
    VecEnv,
    VecVideoRecorder,
)

In [8]:
msg = Printer()

In [9]:
def generate_replay(
    model: BaseAlgorithm,
    eval_env: VecEnv,
    video_length: int,
    is_deterministic: bool,
    local_path: Path,
):
    """
    Generate a replay video of the agent
    :param model: trained model
    :param eval_env: environment used to evaluate the agent
    :param video_length: length of the video (in timesteps)
    :param is_deterministic: use deterministic or stochastic actions
    :param local_path: path of the local repository
    """
    # This is another temporary directory for video outputs
    # SB3 created a -step-0-to-... meta files as well as other
    # artifacts which we don't want in the repo.
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Step 1: Create the VecVideoRecorder
        env = VecVideoRecorder(
            eval_env,
            tmpdirname,
            record_video_trigger=lambda x: x == 0,
            video_length=video_length,
            name_prefix="",
        )

        obs = env.reset()
        lstm_states = None
        episode_starts = np.ones((env.num_envs,), dtype=bool)

        try:
            for _ in range(video_length):
                action, lstm_states = model.predict(
                    obs,
                    state=lstm_states,
                    episode_start=episode_starts,
                    deterministic=is_deterministic,
                )
                obs, _, episode_starts, _ = env.step(action)

            # Save the video
            env.close()

            # Convert the video with x264 codec
            inp = env.video_recorder.path
            out = local_path
            os.system(f"ffmpeg -y -i {inp} -vcodec h264 {out}".format(inp, out))
            print(f"Video saved to: {out}")
        except KeyboardInterrupt:
            pass
        except Exception as e:
            msg.fail(str(e))
            # Add a message for video
            msg.fail(
                "We are unable to generate a replay of your agent"
            )

In [10]:
model.learn(total_timesteps=580000)

# Save the model
model_name = "ddpg-BipedalWalker-v3"  # Adjust model name if needed
model.save(model_name)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 837      |
|    ep_rew_mean     | -96.2    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 2327     |
|    time_elapsed    | 1        |
|    total_timesteps | 3347     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 839      |
|    ep_rew_mean     | -97.1    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 2299     |
|    time_elapsed    | 2        |
|    total_timesteps | 6714     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 846      |
|    ep_rew_mean     | -96      |
| time/              |          |
|    episodes        | 12       |
|    fps             | 1549     |
|    time_elapsed    | 6        |
|    total_timesteps | 10148    |
| train/      

In [11]:
import os
video_dir = "."
video_name = "replay.mp4"
env_id = "BipedalWalker-v3"

generate_replay(
    model = model,
    eval_env=DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode="rgb_array"))]),
    video_length=1000,
    is_deterministic=True,
    local_path=os.path.join(video_dir, video_name)
)

Saving video to /tmp/tmpqkrtilmu/-step-0-to-step-1000.mp4
Moviepy - Building video /tmp/tmpqkrtilmu/-step-0-to-step-1000.mp4.
Moviepy - Writing video /tmp/tmpqkrtilmu/-step-0-to-step-1000.mp4





Moviepy - Done !
Moviepy - video ready /tmp/tmpqkrtilmu/-step-0-to-step-1000.mp4
Video saved to: ./replay.mp4


In [12]:
from IPython.display import HTML
from base64 import b64encode
mp4 = open('replay.mp4','rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=600 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

  and should_run_async(code)


In [13]:
from stable_baselines3.common.evaluation import evaluate_policy

eval_env = Monitor(gym.make("BipedalWalker-v3"))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=-101.28 +/- 6.132709528811361
