In [1]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

### Load the Environment

In [2]:
# Gym environment id; later cells reuse this string to build further environments and models.
environment = 'CartPole-v1'
# The render mode is chosen here, at construction time, for this environment instance.
env = gym.make(environment, render_mode="human")

In [3]:
# Sanity-check the environment by playing a few episodes with random actions
# and printing the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    # reset() returns (observation, info) under the same Gym API that makes
    # step() return five values below.
    state, _ = env.reset()  # Initial set of observations
    done = False
    score = 0

    while not done:
        env.render()  # View the graphical representation of environment
        action = env.action_space.sample()  # Generate random actions
        # Two actions possible -> push left / right

        # env.observation_space
        # Observation_space -> Position, Velocity of Cart ; Pole Angle and Angular Velocity
        # These make 4 info, in other words Box(4)

        # step() yields (obs, reward, terminated, truncated, info). The episode
        # is over when either flag is set; looking only at `terminated` would
        # miss episodes that end by hitting the time limit.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:17.0
Episode:2 Score:11.0
Episode:3 Score:38.0
Episode:4 Score:27.0
Episode:5 Score:16.0


In [4]:
# Build the path of the TensorBoard log directory.
# Note: os.path.join only assembles the path string; it does not create the directory.
log_path = os.path.join('Training', 'Logs')
log_path

'Training/Logs'

In [5]:
# Build a fresh environment (no render mode) for training and wrap it in a
# single-environment DummyVecEnv.
# The factory creates the environment itself instead of closing over the name
# `env`: that name is rebound to the vectorized wrapper by this very
# assignment, so a closure over it would only work as long as the factory is
# invoked before the assignment completes.
env = DummyVecEnv([lambda: gym.make(environment)])
# PPO with a multi-layer perceptron policy; training metrics go to log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device




In [6]:
# The number of timesteps needed depends on the complexity of the environment.
# learn() returns the model itself, which is why the cell output ends with the PPO object repr.
model.learn(total_timesteps=200000)

Logging to Training/Logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 7274 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 4574        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008370907 |
|    clip_fraction        | 0.0875      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.00138     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.71        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0136     |
|    value_loss           | 62.2        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x28e69ae10>

### Save and Reload Model

In [7]:
# Location under which the trained PPO model is saved and later reloaded.
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')

In [8]:
# Persist the trained model to PPO_Path.
model.save(PPO_Path)

In [9]:
# Drop the in-memory model so that the next cell has to restore it from disk.
del model

In [10]:
# Restore the saved model from PPO_Path and attach the existing vectorized environment to it.
model = PPO.load(PPO_Path, env=env)

In [11]:
# Continue training the reloaded model for another 200000 timesteps.
model.learn(total_timesteps=200000)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 7383 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 4876         |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0012849824 |
|    clip_fraction        | 0.0062       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.25        |
|    explained_variance   | -1.8         |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00113      |
|    n_updates            | 990          |
|    policy_gradient_loss | 0.000553     |
|    value_loss           | 2.9e-06      |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x28e5017d0>

### Evaluation

In [12]:
# Evaluate the policy over 10 episodes; the displayed result is a (mean_reward, std_reward) tuple.
# NOTE(review): `env` was built without a render mode in the training cell, so render=True
# may not open a window here — confirm.
evaluate_policy(model, env, n_eval_episodes=10, render=True)
# Observed result: mean reward of 500 with a standard deviation of 0.



(500.0, 0.0)

In [13]:
# Close the environment now; otherwise the render window would stay open until the end.
# NOTE(review): later cells keep using this same `env` object after close() — confirm that is intended.
env.close()

### Test the Model

In [14]:
# Run the trained policy for a few episodes and print the total reward of each.
# `env` is a vectorized environment holding a single sub-environment, so
# reset()/step() return batched arrays with one entry; index 0 is used to get
# plain scalars (otherwise the score is printed as a one-element array).
episodes = 5
for episode in range(1, episodes + 1):
    observation = env.reset()  # Initial (batched) set of observations
    done = False
    score = 0.0

    while not done:
        env.render()  # View the graphical representation of environment
        # Ask the trained model for the action to take (not a random sample).
        action, _ = model.predict(observation)
        # Two actions possible -> push left / right

        # env.observation_space
        # Observation_space -> Position, Velocity of Cart ; Pole Angle and Angular Velocity
        # These make 4 info, in other words Box(4)

        observation, rewards, dones, infos = env.step(action)
        score += float(rewards[0])
        done = bool(dones[0])
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:[500.]
Episode:2 Score:[500.]
Episode:3 Score:[500.]
Episode:4 Score:[500.]
Episode:5 Score:[500.]


### Viewing Logs in Tensorboard

In [15]:
# Directory of one specific training run inside the TensorBoard log folder.
# NOTE(review): 'PPO_2' is hard-coded, while the runs logged by this notebook are
# PPO_3, PPO_4 and PPO_8 — confirm which run should be inspected.
training_log_path = os.path.join(log_path, 'PPO_2')

In [16]:
# Display the resolved run directory.
training_log_path

'Training/Logs/PPO_2'

In [17]:
# Uncomment to launch TensorBoard on the run directory selected above:
#!tensorboard --logdir={training_log_path}

### Adding a Callback to the Training Stage

In [18]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [19]:
# Directory handed to EvalCallback as the place to store the best model found so far.
save_path = os.path.join('Training', 'Saved Models')

In [20]:
# Stop training once the evaluated mean reward reaches the threshold of 500.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)

# Evaluate on a dedicated environment rather than on the training `env`:
# that object has already been closed by an earlier cell, and a later cell
# trains on it while this callback runs, so evaluation resets would
# interleave with rollout collection.
eval_env = DummyVecEnv([lambda: gym.make(environment)])

# Every 10000 calls the policy is evaluated; each new best model is saved to
# save_path and stop_callback is consulted.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [21]:
# Mlp -> multi-layer perceptron policy.
# The environment is passed by its id string here rather than as the `env` object.
model = PPO('MlpPolicy', environment, tensorboard_log=log_path)

In [22]:
# Train with periodic evaluation; as the output shows, the callback can end training
# before total_timesteps is reached.
model.learn(total_timesteps=100000, callback=eval_callback)



Eval num_timesteps=10000, episode_reward=380.80 +/- 150.47
Episode length: 380.80 +/- 150.47
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Stopping training because the mean reward 500.00  is above the threshold 500


<stable_baselines3.ppo.ppo.PPO at 0x2d8c52c50>

### Changing Policies

In [28]:
# Changing the architecture: four hidden layers of 128 units each for the
# policy network (pi) and for the value network (vf).
# The specification is passed as a plain dict; the older form that wrapped the
# dict in a list is deprecated in recent Stable-Baselines3 releases.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])

In [29]:
# PPO with the custom layer sizes forwarded to the policy through policy_kwargs.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device


In [30]:
# Train the model with the evaluation callback attached.
# NOTE(review): this reuses the `eval_callback` instance that was already passed to an
# earlier learn() call; its best-reward state presumably carries over — confirm, or
# create a fresh callback for this run.
model.learn(total_timesteps=100000, callback=eval_callback)

Logging to Training/Logs/PPO_8
-----------------------------
| time/              |      |
|    fps             | 5666 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 3080        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015560007 |
|    clip_fraction        | 0.219       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.00273    |
|    learning_rate        | 0.0003      |
|    loss                 | 1.83        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0245     |
|    value_loss           | 17.8        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2d9e5c1d0>

### Using an Alternate Algorithm

In [31]:
from stable_baselines3 import DQN

In [32]:
# Same policy type, environment id and log directory as before, but using DQN instead of PPO.
model = DQN('MlpPolicy', environment, tensorboard_log=log_path)


In [33]:
# Train the DQN agent for 20000 timesteps.
model.learn(total_timesteps=20000)

<stable_baselines3.dqn.dqn.DQN at 0x2d9e58d90>

In [34]:
# NOTE: this only references the method (the cell output is its bound-method repr);
# nothing is loaded because the method is never called.
DQN.load

<bound method BaseAlgorithm.load of <class 'stable_baselines3.dqn.dqn.DQN'>>