In [4]:
#pip install stable-baselines3[extra]
import gym
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
import os
import time

In [5]:
# Saving logs to visulise in Tensorboard, saving models
models_dir = f"models/Mountain-{time.time()}"
logdir = f"logs/Mountain-{time.time()}"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
if not os.path.exists(logdir):
    os.makedirs(logdir)

In [6]:
# Parallel environments
env = make_vec_env("MountainCarContinuous-v0", n_envs=1)

In [7]:
# The learning agent and hyperparameters
model = PPO(
    policy=MlpPolicy,
    env=env,
    seed=0,
    batch_size=256,
    ent_coef=0.00429,
    learning_rate=7.77e-05,
    n_epochs=10,
    n_steps=8,
    gae_lambda=0.9,
    gamma=0.9999,
    clip_range=0.1,
    max_grad_norm =5,
    vf_coef=0.19,
    use_sde=True,
    policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
    verbose=1,
    tensorboard_log=logdir
    )

Using cuda device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)


In [8]:
#Training and saving models along the way
TIMESTEPS = 20000
for i in range(10): 
    model.learn(total_timesteps=TIMESTEPS,reset_num_timesteps=False, tb_log_name="PPO")
    model.save(f"{models_dir}/{TIMESTEPS*i}")

Logging to logs/Mountain-1679298863.798316\PPO_0
--------------------------
| time/              |   |
|    fps             | 2 |
|    iterations      | 1 |
|    time_elapsed    | 3 |
|    total_timesteps | 8 |
--------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 4            |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 16           |
| train/                  |              |
|    approx_kl            | 0.0011566132 |
|    clip_fraction        | 0.1          |
|    clip_range           | 0.1          |
|    entropy_loss         | 1.3          |
|    explained_variance   | -0.138       |
|    learning_rate        | 7.77e-05     |
|    loss                 | 0.00433      |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.000291    |
|    std                  | 0.0373       |
|    value_loss           | 3.5

In [None]:
# Check model performance
# load the best model you observed from tensorboard - the one reach the goal/ obtaining highest return
models_dir = "models/Mountain-1653282767.3143597"
model_path = f"{models_dir}/80000"
best_model = PPO.load(model_path, env=env)
obs = env.reset()
while True:
    action, _states = best_model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    # env.render()  use Python IDE to check, I havn't figure out how to render in Notebook