Installing dependencies

In [2]:
!pip install stable-baselines3[extra]

Defaulting to user installation because normal site-packages is not writeable


Setting up enviroment

In [3]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [4]:
enviroment_name = "CartPole-v0"
env = gym.make(enviroment_name)

In [5]:
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print("Episode {} Score {}".format(episode, score))
env.close()

Episode 1 Score 23.0
Episode 2 Score 34.0
Episode 3 Score 29.0
Episode 4 Score 29.0
Episode 5 Score 17.0


Train RL model

In [6]:
tf_log_path = os.path.join('Traning', 'Logs')

In [7]:
tf_log_path

'Traning/Logs'

In [8]:
env = gym.make(enviroment_name) # create gym enviroment
env = DummyVecEnv([lambda: env])    # wrap env into DummyVec
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=tf_log_path)

Using cpu device


In [11]:
model.learn(total_timesteps=20000)

Logging to Traning/Logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 1922 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1292        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009577846 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00446    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.45        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0155     |
|    value_loss           | 50.2        |
-----------------------------------------
----

<stable_baselines3.ppo.ppo.PPO at 0x7f91444ec280>

Save na Reload model

In [12]:
PPO_path = os.path.join('Traning', "Saved_models", "PPO_Model_Cartpole")

In [14]:
model.save(PPO_path)

In [15]:
del model

In [16]:
model = PPO.load(PPO_path)

Evaluation

In [18]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

Test model

In [47]:
episodes = 5
for episode in range(1, episodes + 1):
    observation = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action, _states = model.predict(observation)     # Using model here
        observation, reward, done, info = env.step(action)
        score += reward
    print("Episode {} Score {}".format(episode, score))
env.close()

Episode 1 Score [12.]
Episode 2 Score [10.]
Episode 3 Score [10.]
Episode 4 Score [10.]
Episode 5 Score [10.]


In [None]:
obs = env.reset()


Callback on the traning stage

In [31]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [33]:
save_path = os.path.join("Traning", "Saved_models")

In [34]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(env, 
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [36]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tf_log_path)
model.learn(total_timesteps=20000, callback=eval_callback)

Using cpu device
Logging to Traning/Logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 1774 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1242        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010501511 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | -0.0052     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.02        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0204     |
|    value_loss           | 48.5        |
-----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f9040293130>

Changing Policies

In [39]:
net_architecture = [dict(pi=[128,128,128,128], # custom actor new neural network 4 layers 128 units in each layer
                        vf=[128,128,128,128])] # value function 4 layers with 128 units

In [40]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tf_log_path, policy_kwargs={'net_arch':net_architecture})


Using cpu device




In [42]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Traning/Logs/PPO_5
-----------------------------
| time/              |      |
|    fps             | 1203 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 849          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0058346135 |
|    clip_fraction        | 0.0717       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.508       |
|    explained_variance   | 0.158        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.46         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00287     |
|    value_loss           | 30.7         |
-----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f904029af80>

Using alternate algorithm

In [43]:
from stable_baselines3 import DQN

In [48]:
model = DQN("MlpPolicy", env, verbose=1, tensorboard_log=tf_log_path)
model.learn(total_timesteps=20000)

Using cpu device
Logging to Traning/Logs/DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.965    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6288     |
|    time_elapsed     | 0        |
|    total_timesteps  | 73       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.911    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7824     |
|    time_elapsed     | 0        |
|    total_timesteps  | 188      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.846    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 9086     |
|    time_elapsed     | 0        |
|    total_timesteps  | 324      |
----------------------------------
--------

<stable_baselines3.dqn.dqn.DQN at 0x7f9022772320>