In [1]:
#Import modules 
import os 
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy 
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import time 

In [2]:
# Create the CartPole-v0 environment. The id is kept in `environment_name`
# because later cells re-create the environment from the same name.
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [3]:
# Baseline before training: play a few rendered episodes with randomly sampled
# actions and print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    env.reset()
    score, done = 0, False

    while not done:
        env.render()
        random_action = env.action_space.sample()
        # step() returns (observation, reward, done, info); only reward and done drive this loop
        next_obs, reward, done, info = env.step(random_action)
        score += reward

    print('Episode:{} Score:{}'.format(episode, score))


Episode:1 Score:15.0
Episode:2 Score:39.0
Episode:3 Score:40.0
Episode:4 Score:16.0
Episode:5 Score:15.0


In [4]:
# Close the environment now that the rendered random-action rollouts are done
env.close()

In [5]:
# Directory for training logs; it is handed to PPO as `tensorboard_log`
log_path = os.path.join(
    'Training',
    'Logs',
)

In [6]:
# Build the training environment and the PPO agent.
# The environment is wrapped in a DummyVecEnv (a single-env vectorized wrapper).
# The factory creates the env itself instead of closing over the global `env`:
# that name is rebound to the wrapper on this very line, so a `lambda: env`
# factory would hand back the wrapper if it were ever called again.
env = DummyVecEnv([lambda: gym.make(environment_name)])
# verbose=1 prints training progress; TensorBoard logs are written under log_path
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [7]:
# Train for 20,000 timesteps. PPO gathers experience in rollouts of 2048 steps
# (see the `total_timesteps` column in the log below), so a budget smaller than
# 2048 would stop after a single iteration and never perform the policy updates
# shown in that log.
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 183  |
|    iterations      | 1    |
|    time_elapsed    | 11   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 231         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007437478 |
|    clip_fraction        | 0.0741      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.00115     |
|    learning_rate        | 0.0003      |
|    loss                 | 8.82        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0118     |
|    value_loss           | 56.5        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x23c933d2700>

In [8]:
# Where the trained PPO model is saved to and later loaded from
PPO_path = os.path.join(
    'Training',
    'Saved Models',
    'PPO_Cartpole_1',
)

In [9]:
# Persist the trained model at PPO_path so it can be reloaded without retraining
model.save(PPO_path)

In [10]:
# Reload the model from PPO_path and attach the current vectorized env to it
model = PPO.load(PPO_path, env=env)

In [11]:
# Evaluate the loaded policy over 10 rendered episodes. The tuple shown below is
# (mean episode reward, standard deviation) per the SB3 documentation; a mean of
# 200 means CartPole-v0 is considered solved.
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [12]:
# Close the vectorized environment that was used for training and evaluation
env.close()

In [13]:
# Re-create a plain (non-vectorized) environment for the manual rollouts below
env = gym.make(environment_name)

In [14]:
# Play a few rendered episodes in which the trained model picks every action
# from the current observation, and print the total reward of each episode.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # deterministic=True makes the policy return its preferred action instead
        # of sampling one, matching evaluate_policy's default and keeping the
        # demonstration repeatable. The second return value (recurrent state) is unused.
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward

    print('Episode:{} Score:{}'.format(episode, score))


Episode:1 Score:200.0
Episode:2 Score:200.0
Episode:3 Score:200.0
Episode:4 Score:200.0
Episode:5 Score:200.0


In [15]:
# Start a fresh episode and keep its initial observation for the single-step demo
obs = env.reset()

In [16]:
# Ask the trained model for an action for that observation (second return value is discarded)
action, _ = model.predict(obs)

In [17]:
# Apply the action; the tuple shown below is (observation, reward, done, info)
env.step(action)

(array([ 0.00060077,  0.21491664, -0.04879039, -0.32646572]), 1.0, False, {})

In [18]:
# Directory of one specific training run inside the log folder.
# NOTE(review): the training output above logged to PPO_4, not PPO_2 — confirm
# this is the run that should be inspected.
path = os.path.join(log_path, 'PPO_2')


In [19]:
# Show the log root and the selected run directory, one per line
print(log_path, path, sep='\n')

Training\Logs
Training\Logs\PPO_2
