# 1. Import dependencies

In [14]:
import gymnasium as gym
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load environment

In [20]:
# Name of the Gymnasium environment; reused later when the training env is built.
environment_name = 'CartPole-v1'
env = gym.make(environment_name)
# Start a first episode; as the cell's last expression, the return value of
# reset() (an observation array plus an info dict) is shown as the output.
env.reset()

(array([-0.02960015, -0.00293233, -0.03210034, -0.01111461], dtype=float32),
 {})

In [21]:
# Roll out a purely random policy for a few episodes to get a baseline score.
episodes = 10
for episode in range(1, episodes+1):
    # Gymnasium's reset() returns (observation, info); keep only the observation.
    state, _ = env.reset()
    done = False
    score = 0

    while not done:
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        # step() reports two separate end-of-episode flags: `terminated`
        # (the task reached a terminal state) and `truncated` (e.g. the time
        # limit was hit). The episode is over as soon as either one is set.
        done = terminated or truncated
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))

Episode: 1 Score: 12.0
Episode: 2 Score: 24.0
Episode: 3 Score: 14.0
Episode: 4 Score: 20.0
Episode: 5 Score: 13.0
Episode: 6 Score: 14.0
Episode: 7 Score: 23.0
Episode: 8 Score: 13.0
Episode: 9 Score: 18.0
Episode: 10 Score: 10.0


## Understanding the environment

In [22]:
# The action space has two discrete actions:
# 0 - push cart to the left, 1 - push cart to the right.
# sample() picks one of them; the value is displayed as the cell output.
env.action_space.sample()

np.int64(0)

In [23]:
# Each observation is a 4-element float32 array:
# [cart position, cart velocity, pole angle, pole angular velocity]
# sample() draws from the space's declared bounds, not from real dynamics, so
# the two velocity entries can come out astronomically large (see the output).
env.observation_space.sample()

array([ 2.4286697e+00,  2.6898864e+38,  2.4621819e-01, -2.9737029e+37],
      dtype=float32)

# 3. Training an RL Model

In [24]:
# All TensorBoard logs for the training runs below go under Training/Logs.
log_path = os.path.join('Training', 'Logs')
# Create the directory up front so the notebook runs from a fresh checkout
# without manual setup; exist_ok=True makes re-running this cell harmless.
os.makedirs(log_path, exist_ok=True)
log_path

'Training/Logs'

In [25]:
# Build the vectorized training environment. The factory constructs its own
# environment rather than closing over the global `env`: that name is rebound
# to the DummyVecEnv by this very statement, so a `lambda: env` factory would
# return the wrapper itself if it were ever invoked again.
env = DummyVecEnv([lambda: gym.make(environment_name)])
# PPO with the default MLP policy; training metrics are written under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [27]:
# Train for 20,000 environment steps. With verbose=1 the per-iteration stats
# are printed below, and the run is logged under log_path for TensorBoard.
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 6952 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 4705         |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.010147203  |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.685       |
|    explained_variance   | -0.016705036 |
|    learning_rate        | 0.0003       |
|    loss                 | 6.65         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0187      |
|    value_loss           | 47.6         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x151d31430>

# 4. Save and Reload Model

In [28]:
# Location of the saved model: Training/Saved Models/PPO_Model_CartPole
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_Model_CartPole')
# Persist the trained model so it can be restored later with PPO.load().
model.save(PPO_path)

In [37]:
# Drop the in-memory model to demonstrate that it can be restored from disk.
del model

In [38]:
# `model` was deleted in the previous cell, so this call is expected to fail.
# The failure is the point of the demonstration, so catch it and show the
# message instead of leaving an uncaught exception that would halt
# "Restart Kernel -> Run All".
try:
    model.learn(total_timesteps=20000)
except NameError as err:
    print('NameError: {}'.format(err))

NameError: name 'model' is not defined

In [40]:
# Restore the PPO model saved earlier and attach the current vectorized env to it.
model = PPO.load(PPO_path, env=env) # load the model from the saved path

# 5. Evaluation

In [44]:
# Run the loaded policy for 10 evaluation episodes without rendering.
# The displayed result is a pair of floats - per the SB3 docs, the mean episode
# reward and its standard deviation.
evaluate_policy(model, env, n_eval_episodes=10, render=False)

(np.float64(500.0), np.float64(0.0))

In [46]:
# NOTE(review): the cells below keep calling env.reset()/env.step() and pass
# this same `env` to new models after it is closed here - confirm this is
# intended, or move the close to the end of the notebook.
env.close() # close the environment

# 6. Test

In [51]:
# Roll out the trained policy for a few episodes and report each episode's score.
episodes = 10
for episode in range(1, episodes+1):
    # `env` is the DummyVecEnv here: reset() returns only the batched observation.
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        action, _ = model.predict(obs)
        # The vectorized env returns a 4-tuple whose reward and done entries
        # are arrays with one element per wrapped env (a single env here).
        obs, reward, dones, info = env.step(action)
        # Unwrap element 0 so loop control does not rely on the truthiness of
        # a 1-element array and the score prints as a plain number, matching
        # the random-policy loop earlier in the notebook.
        done = bool(dones[0])
        score += float(reward[0])
    print('Episode: {} Score: {}'.format(episode, score))

Episode: 1 Score: [500.]
Episode: 2 Score: [500.]
Episode: 3 Score: [268.]
Episode: 4 Score: [481.]
Episode: 5 Score: [500.]
Episode: 6 Score: [500.]
Episode: 7 Score: [469.]
Episode: 8 Score: [500.]
Episode: 9 Score: [419.]
Episode: 10 Score: [500.]


In [52]:
# Start a fresh episode and keep the initial observation for one manual step.
obs = env.reset()

In [55]:
# Ask the trained model for an action for this observation; the second
# return value is not needed here and is discarded.
action, _ = model.predict(obs)

In [54]:
# For comparison with the model's choice: a randomly sampled action.
env.action_space.sample()

np.int64(1)

In [56]:
# Apply the model's action for one step. The output is the vectorized env's
# 4-tuple: batched observation, reward array, done array and a list of info dicts.
env.step(action)

(array([[ 0.03981717, -0.20893186,  0.04524281,  0.3453281 ]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{'TimeLimit.truncated': False}])

# 7. View Logs in Tensorboard

-> TensorBoard is usually launched from a terminal rather than from inside a notebook.

In [57]:
# Folder of the specific training run to inspect in TensorBoard.
# NOTE(review): 'PPO_2' is hard-coded, while the runs logged in this notebook
# report other folder names (e.g. PPO_1) - confirm it points at the intended run.
training_log_path = os.path.join(log_path, 'PPO_2')

In [58]:
# Launch TensorBoard on the selected run. The command keeps serving (and keeps
# this cell busy) until it is interrupted, as the ^C in the output shows.
!tensorboard --logdir={training_log_path}

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.17.1 at http://localhost:6006/ (Press CTRL+C to quit)
^C


# 8. Add a Callback Stage

In [59]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [73]:
# Path handed to EvalCallback as best_model_save_path.
# NOTE(review): this uses 'Saved_Models' (underscore) while the manual save
# earlier uses 'Saved Models' (space) - confirm the two locations are meant to differ.
save_path = os.path.join('Training', 'Saved_Models', 'PPO_Model_CartPole')

In [74]:
# Stop training as soon as an evaluation reports a mean reward of at least 300.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=300, verbose=1)
# Evaluate on a dedicated environment instead of the training `env`:
# evaluation episodes would otherwise reset and step the very environment the
# agent is collecting rollouts from (and `env` was already closed above).
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])
# Every 10,000 steps: evaluate, save the best model so far under save_path,
# and run stop_callback whenever a new best mean reward is reached.
eval_callback = EvalCallback(eval_env, callback_on_new_best=stop_callback, eval_freq=10000, best_model_save_path=save_path, verbose=1)

In [75]:
# Fresh, untrained PPO model for the callback-driven training run.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [76]:
# Train with the evaluation callback attached: it evaluates every 10,000 steps
# and may end training before the 20,000-step budget if the threshold is met.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_7
-----------------------------
| time/              |      |
|    fps             | 7064 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 4970         |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.009247511  |
|    clip_fraction        | 0.122        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.685       |
|    explained_variance   | -0.007834554 |
|    learning_rate        | 0.0003       |
|    loss                 | 7.23         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0189      |
|    value_loss           | 51.9         |
----------------------------



<stable_baselines3.ppo.ppo.PPO at 0x16ba01070>

# 9. Changing Policies

In [78]:
# Custom network layout: four hidden layers of 128 units each, specified
# separately for the `pi` and `vf` entries of net_arch.
new_arch = {'pi': [128] * 4, 'vf': [128] * 4}
model = PPO(
    'MlpPolicy',
    env,
    policy_kwargs=dict(net_arch=new_arch),
    tensorboard_log=log_path,
    verbose=1,
)

Using cpu device


In [79]:
# Train the larger network with the same evaluation / early-stopping callback.
# In the recorded run, training stopped at the 10,000-step evaluation because
# the mean reward was above the threshold of 300.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_8
-----------------------------
| time/              |      |
|    fps             | 5321 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 3480          |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.013313074   |
|    clip_fraction        | 0.163         |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.683        |
|    explained_variance   | 0.00024664402 |
|    learning_rate        | 0.0003        |
|    loss                 | 2.61          |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.0204       |
|    value_loss           | 18.4          |
-----------



Eval num_timesteps=10000, episode_reward=385.80 +/- 99.09
Episode length: 385.80 +/- 99.09
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 386         |
|    mean_reward          | 386         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.010932865 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.578      |
|    explained_variance   | 0.4775414   |
|    learning_rate        | 0.0003      |
|    loss                 | 17.3        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0171     |
|    value_loss           | 38          |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 385.80  is above the threshold 300


<stable_baselines3.ppo.ppo.PPO at 0x16ba9f790>

# 10. Use an alternate Algorithm

In [80]:
from stable_baselines3 import DQN

In [81]:
# Swap the algorithm: DQN with the same policy type, environment and log directory.
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [82]:
# Train DQN for 20,000 steps.
# NOTE(review): this reuses the eval_callback instance already used for the
# PPO runs - confirm that carrying its state across models is intended.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.969    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 12716    |
|    time_elapsed     | 0        |
|    total_timesteps  | 66       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.925    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7214     |
|    time_elapsed     | 0        |
|    total_timesteps  | 157      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.608    |
|    n_updates        | 14       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.891    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 



----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 704      |
|    fps              | 4357     |
|    time_elapsed     | 2        |
|    total_timesteps  | 10811    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0351   |
|    n_updates        | 2677     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 708      |
|    fps              | 4355     |
|    time_elapsed     | 2        |
|    total_timesteps  | 10848    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0289   |
|    n_updates        | 2686     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

<stable_baselines3.dqn.dqn.DQN at 0x16baecc70>