In [1]:
# Pin the release this notebook was written against: it depends on gym 0.21
# (4-tuple `step` API), which newer stable-baselines3 releases no longer install.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install "stable-baselines3[extra]==1.5.0"

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.5.0-py3-none-any.whl (177 kB)
[K     |████████████████████████████████| 177 kB 8.1 MB/s 
[?25hCollecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 32.9 MB/s 
Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting ale-py~=0.7.4
  Downloading ale_py-0.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 44.2 MB/s 
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gym, AutoROM.accept-rom-license
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.

In [2]:
# -y auto-confirms, so the install cannot stall or abort on apt's yes/no prompt
# (a notebook cell has no interactive stdin to answer it).
!apt-get install -y ffmpeg freeglut3-dev xvfb

Reading package lists... Done
Building dependency tree       
Reading state information... Done
freeglut3-dev is already the newest version (2.8.1-3).
freeglut3-dev set to manually installed.
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following packages were automatically installed and are no longer required:
  libnvidia-common-460 nsight-compute-2020.2.0
Use 'apt autoremove' to remove them.
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 42 not upgraded.
Need to get 784 kB of archives.
After this operation, 2,271 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.10 [784 kB]
Fetched 784 kB in 1s (1,092 kB/s)
Selecting previously unselected package xvfb.
(Reading database ... 155202 files and directories currently installed.)
Preparing to unpack .../xvfb_2%3a1.19.6-1ubuntu4.10_amd64.deb ...
Unpacking xvfb (2:1.19.6-1ubuntu4.10)

In [3]:
import gym
import numpy as np
from stable_baselines3 import PPO

In [4]:
from stable_baselines3.ppo.policies import MlpPolicy

In [5]:
# Create the gym environment and instantiate the PPO agent on it.
# A fixed seed is passed so that training and evaluation runs can be reproduced.
env = gym.make('CartPole-v1')
model = PPO(MlpPolicy, env, verbose=0, seed=0)

In [6]:
def evaluate(model, num_episodes=100):
  """
  Play a number of episodes with the model and report the average return.

  The environment is the one returned by ``model.get_env()``. Each episode
  keeps stepping until ``step`` reports a truthy ``done``.

  :param model: object providing ``get_env()`` and ``predict(obs)``
  :param num_episodes: (int) how many episodes to play
  :return: mean of the per-episode reward sums (the value is also printed)
  """
  env = model.get_env()
  episode_returns = []
  for _ in range(num_episodes):
    obs = env.reset()
    episode_return = 0
    while True:
      # predict() also returns a state value, which is not needed here
      action, _state = model.predict(obs)
      obs, reward, done, _info = env.step(action)
      episode_return = episode_return + reward
      if done:
        break
    episode_returns.append(episode_return)

  mean_episode_reward = np.mean(episode_returns)
  print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
  return mean_episode_reward


In [7]:
# Baseline: score the model before any training has happened, so there is a
# reference point for the evaluation that follows training.
mean_reward_before_train = evaluate(model=model, num_episodes=100)

Mean reward: 21.26 Num episodes: 100


In [8]:
from stable_baselines3.common.evaluation import evaluate_policy

In [9]:
# Score the still-untrained model with the library's own evaluation helper.
# NOTE(review): evaluate_policy presumably picks actions deterministically by
# default while evaluate() samples them, which would explain the gap between
# the two pre-training scores - confirm against the stable-baselines3 docs.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print("mean_reward:{:.2f} +/- {:.2f}".format(mean_reward, std_reward))



mean_reward:9.32 +/- 0.68


In [10]:
# Train the agent with a budget of 10,000 timesteps
model.learn(total_timesteps=10_000)


<stable_baselines3.ppo.ppo.PPO at 0x7efc7644ee90>

In [11]:
# Re-run the library evaluation helper now that the model has been trained
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print("mean_reward:{:.2f} +/- {:.2f}".format(mean_reward, std_reward))



mean_reward:374.69 +/- 114.00


In [12]:
# Video recording needs a display to render to; without one rendering fails.
import os

# Launch a virtual X server on display :1 in the background (trailing "&") ...
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# ... and point this process at that display.
os.environ["DISPLAY"] = ":1"

In [13]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Display every matching MP4 file of a folder inline in the notebook.

  Each file is read completely, base64-encoded and embedded in a
  ``<video>`` tag, so the rendered output carries the video data itself.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the videos, showing only the ones whose
    file name starts with this prefix
  """
  # Local import keeps this block self-contained.
  from html import escape

  video_tags = []
  # glob() yields files in arbitrary order; sort so the display order is stable.
  for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
      video_b64 = base64.b64encode(mp4.read_bytes())
      # The path goes into an HTML attribute, so escape quotes and markup in it.
      video_tags.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(escape(str(mp4)), video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(video_tags)))

In [14]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Run the model in a freshly created environment and record the run as a video.

  :param env_id: (str) Id passed to ``gym.make`` to build the environment
  :param model: (RL model) Object whose ``predict(obs)`` returns the action
    as the first element of a pair
  :param video_length: (int) Number of environment steps to run and record
  :param prefix: (str) Name prefix handed to the video recorder
  :param video_folder: (str) Output folder handed to the video recorder
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start recording at step 0 and capture `video_length` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _ = model.predict(obs)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Close the video recorder even if the rollout raises, so the recorder
    # and the wrapped environment are not left open.
    eval_env.close()

In [15]:
# Record 500 steps of the model on CartPole; the clip lands in record_video's
# default output folder.
record_video(env_id='CartPole-v1', model=model, video_length=500, prefix='ppo2-cartpole')

Saving video to /content/videos/ppo2-cartpole-step-0-to-step-500.mp4


In [16]:
# Embed every recorded clip in 'videos' whose file name starts with 'ppo2'
show_videos(video_path='videos', prefix='ppo2')