# 1. Import dependencies

In [4]:
#import all the required parts for running the program
import os
import gym
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [5]:
#create the environment name as well as the type of environment
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

# 2. Load Env

In [7]:
# example env that runs for 5 episodes doing random actions 
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

Episode:1 Score:43.0
Episode:2 Score:10.0
Episode:3 Score:13.0
Episode:4 Score:15.0
Episode:5 Score:15.0


In [8]:
episodes = 5
for episode in range(1, episodes+1):
    print(episode)

1
2
3
4
5


# 3. Train an RL Model

In [28]:
#building directory
log_path =  os.path.join('Training', 'Logs')

In [7]:
# recreation of the environment same as the initialization
env = gym.make(environment_name) 
# wrapper for the environment, creates lambda function in the env, like a wrapper for non vectorized env
env = DummyVecEnv([lambda: env]) 
#defining the model which defines the agent, passing through 2 args and 2 kwargs
#mlppolicy = multilayer perceptron policy. using NN with standard NN units
# tensorboard log is where it saves
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [35]:
# here is number of training times and actual learning program. 
model.learn(total_timesteps = 20000)

Logging to Training\Logs\PPO_2
-----------------------------
| time/              |      |
|    fps             | 1736 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1124         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0032464939 |
|    clip_fraction        | 0.0312       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.557       |
|    explained_variance   | 0.491        |
|    learning_rate        | 0.0003       |
|    loss                 | 58.8         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00513     |
|    value_loss           | 101          |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x16c8c939a60>

# 4. Save and Reload Model

In [8]:
# this is the save location for the training above
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')

In [38]:
#actually saves the model
model.save(PPO_Path)

In [79]:
del model

In [9]:
# actually loads the model
model = PPO.load(PPO_Path, env=env)

# 5. Eval

In [43]:
#ep_len_mean == how long a episode lasts before done
#ep_rew_mean == average reward agent accumlated per episode

In [81]:
#this evaluates how well the agent performed over 10 episodes
evaluate_policy(model, env, n_eval_episodes=10, render=True)
# (x, y) x == average reward, y == deviation



KeyboardInterrupt: 

In [82]:
env.close()

# 6. Test
taking observations and passing it to the agent to make its own choices.

In [83]:
action, _ = model.predict(obs)

In [84]:
# change action, 
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _ = model.predict(obs) # now using the model here
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [36]:
env.reset()

array([ 0.03675869,  0.01465941,  0.01882426, -0.02708192], dtype=float32)

In [58]:
obs = env.reset()


In [60]:
#returns a action
action, _ = model.predict(obs)

In [61]:
env.action_space.sample()

0

In [71]:
env.step(action)
# reward is 1.0



(array([-0.19100748, -1.986649  ,  0.3022123 ,  3.2176414 ], dtype=float32),
 0.0,
 True,
 {})

# 7. Viewing Logs in Tensorboard

In [85]:
training_log_path = os.path.join(log_path, 'PPO_2')

In [86]:
training_log_path

'Training\\Logs\\PPO_2'

In [None]:
# using ! is a magic command, can use command line command.
# 
!tensorboard --logdir={training_log_path}

In [1]:
# to improve training either train longer or hyperparameter tuning.

# Adding a callback to the training Stage 

In [10]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [13]:
save_path = os.path.join('Training', 'Saved Models')

In [19]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=100, verbose=1)
eval_callback = EvalCallback(env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [20]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [21]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 557  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 656         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008876718 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.0126     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.75        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0163     |
|    value_loss           | 53.9        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 200         |
|    mean_reward          | 200         |
| time/                   |             |
|    total timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.015406858 |
|    clip_fraction        | 0.0981      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.61       |
|    explained_variance   | 0.363       |
|    learning_rate        | 0.0003      |
|    loss                 | 27.1        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0186     |
|    value_loss           | 58.5        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 200.00  is above the threshold 100


<stable_baselines3.ppo.ppo.PPO at 0x23f89fb38b0>

# Changin Policies

In [23]:
net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])]

In [24]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [25]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 1317 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 712         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014145174 |
|    clip_fraction        | 0.216       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.000749    |
|    learning_rate        | 0.0003      |
|    loss                 | 3.33        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0251     |
|    value_loss           | 20.7        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x23f90a89be0>

# 10. Using an Alternate Algorithm

In [26]:
from stable_baselines3 import DQN

In [30]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [31]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration rate | 0.966    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 934      |
|    time_elapsed     | 0        |
|    total timesteps  | 72       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.925    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1181     |
|    time_elapsed     | 0        |
|    total timesteps  | 158      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.873    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 1371     |
|    time_elapsed     | 0        |
|    total timesteps  | 267      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 2064     |
|    time_elapsed     | 1        |
|    total timesteps  | 2404     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 2077     |
|    time_elapsed     | 1        |
|    total timesteps  | 2509     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 2062     |
|    time_elapsed     | 1        |
|    total timesteps  | 2566     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 2317     |
|    time_elapsed     | 2        |
|    total timesteps  | 4974     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 2319     |
|    time_elapsed     | 2        |
|    total timesteps  | 5049     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 2333     |
|    time_elapsed     | 2        |
|    total timesteps  | 5150     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 2279     |
|    time_elapsed     | 3        |
|    total timesteps  | 7217     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 2280     |
|    time_elapsed     | 3        |
|    total timesteps  | 7312     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 2280     |
|    time_elapsed     | 3        |
|    total timesteps  | 7379     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 2297     |
|    time_elapsed     | 4        |
|    total timesteps  | 9634     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 2297     |
|    time_elapsed     | 4        |
|    total timesteps  | 9716     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 2299     |
|    time_elapsed     | 4        |
|    total timesteps  | 9804     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 2262     |
|    time_elapsed     | 5        |
|    total timesteps  | 11891    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 2264     |
|    time_elapsed     | 5        |
|    total timesteps  | 11979    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 2226     |
|    time_elapsed     | 5        |
|    total timesteps  | 12086    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 2278     |
|    time_elapsed     | 6        |
|    total timesteps  | 14444    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 2279     |
|    time_elapsed     | 6        |
|    total timesteps  | 14517    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 2278     |
|    time_elapsed     | 6        |
|    total timesteps  | 14573    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 2283     |
|    time_elapsed     | 7        |
|    total timesteps  | 16852    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 2287     |
|    time_elapsed     | 7        |
|    total timesteps  | 16960    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 2288     |
|    time_elapsed     | 7        |
|    total timesteps  | 17050    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 2293     |
|    time_elapsed     | 8        |
|    total timesteps  | 19173    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 2297     |
|    time_elapsed     | 8        |
|    total timesteps  | 19285    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 2296     |
|    time_elapsed     | 8        |
|    total timesteps  | 19356    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x23f90bfba90>

In [34]:
model.save(path)

NameError: name 'path' is not defined