# Load Dependencies

In [1]:
!pip install stable-baselines3[extra]
import os
import gym
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy



# Building the Environment

In [2]:
!pip install pyglet
environment_name = "CartPole-v0"
#balance a pole vertically on a moving cart
env = gym.make(environment_name)

episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:22.0
Episode:2 Score:40.0
Episode:3 Score:18.0
Episode:4 Score:16.0
Episode:5 Score:41.0


In [3]:
env.action_space
#0 to move left, 1 to move right

env.observation_space.sample()
#[cart position, cart velocity, pole angle, pole angular velocity]

array([-3.1253953e+00, -2.5262995e+38,  3.2381135e-01,  3.6952284e+37],
      dtype=float32)

# Training

In [4]:
from stable_baselines3 import PPO
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose = 1)

Using cpu device


In [5]:
model.learn(total_timesteps=10000)


-----------------------------
| time/              |      |
|    fps             | 1326 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 911          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0072583435 |
|    clip_fraction        | 0.0725       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | -0.00383     |
|    learning_rate        | 0.0003       |
|    loss                 | 7.02         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0103      |
|    value_loss           | 50.5         |
------------------------------------------
----------------

<stable_baselines3.ppo.ppo.PPO at 0x2ca9db53790>

# Evaluation

In [6]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(183.4, 22.61503924383064)

In [7]:
env.close()

# Testing

In [8]:
episodes = 5
for episode in range(1, episodes+1):
    observation = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action, tr = model.predict(observation)
        observation, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:[65.]
Episode:2 Score:[110.]
Episode:3 Score:[88.]
Episode:4 Score:[200.]
Episode:5 Score:[120.]
