In [3]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

import os

In [4]:
# Gym id of the game used for the random-play sanity check below.
environment_name = 'Breakout-v0'
# Build a single, unwrapped environment instance from that id.
env = gym.make(id=environment_name)

In [5]:
# Reset the environment to begin a new episode. The value returned by reset()
# is shown as this cell's output (a uint8 array in the recorded output).
env.reset()

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

In [7]:
# Display the environment's action space (the set the agent samples actions from).
env.action_space

Discrete(4)

In [8]:
# Display the environment's observation space (bounds, shape and dtype of observations).
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [9]:
# Sanity check: play a few episodes with uniformly random actions and print the
# total reward collected in each one.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        obs = env.reset()
        done = False
        score = 0

        # Step until the environment reports the episode is over.
        while not done:
            env.render()
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the environment (and its render window), even when the
    # loop is interrupted from the notebook or a step raises.
    env.close()



Episode:1 Score:1.0
Episode:2 Score:3.0
Episode:3 Score:3.0
Episode:4 Score:0.0
Episode:5 Score:1.0


# 3. Vectorise Environment and Train Model

In [10]:
# Create four Breakout environments wrapped as one vectorised env, seeded for
# repeatability.
# NOTE(review): Stable-Baselines3's Atari examples pass 'NoFrameskip-v4' ids to
# make_atari_env because its Atari wrapper applies its own frame skipping --
# confirm that 'Breakout-v0' is the intended id here.
env = make_atari_env(env_id='Breakout-v0', n_envs=4, seed=0)
# Stack the 4 most recent frames of each env into a single observation.
env = VecFrameStack(venv=env, n_stack=4)

In [11]:
# Directory that receives the TensorBoard logs written during training.
log_path = os.path.join('Training', 'Logs')
# A2C agent with the CNN policy, attached to the vectorised environment.
model = A2C(
    policy='CnnPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [None]:
# Train the agent for 100k environment timesteps (long-running cell).
model.learn(total_timesteps=100_000)

Logging to Training\Logs\A2C_3
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 468      |
|    ep_rew_mean        | 5.85     |
| time/                 |          |
|    fps                | 62       |
|    iterations         | 100      |
|    time_elapsed       | 32       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -0.538   |
|    explained_variance | 0.928    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5166     |
|    policy_loss        | -0.00836 |
|    value_loss         | 0.0449   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 468      |
|    ep_rew_mean        | 5.7      |
| time/                 |          |
|    fps                | 62       |
|    iterations         | 200      |
|    time_elapsed       | 63       |
|    total_timesteps    | 4000     |
| train

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 550      |
|    ep_rew_mean        | 7.21     |
| time/                 |          |
|    fps                | 66       |
|    iterations         | 1400     |
|    time_elapsed       | 422      |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.241   |
|    explained_variance | 0.659    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6466     |
|    policy_loss        | -0.0595  |
|    value_loss         | 0.0861   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 524      |
|    ep_rew_mean        | 6.76     |
| time/                 |          |
|    fps                | 66       |
|    iterations         | 1500     |
|    time_elapsed       | 452      |
|    total_timesteps    | 30000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 579      |
|    ep_rew_mean        | 7.75     |
| time/                 |          |
|    fps                | 67       |
|    iterations         | 2800     |
|    time_elapsed       | 824      |
|    total_timesteps    | 56000    |
| train/                |          |
|    entropy_loss       | -0.425   |
|    explained_variance | 0.516    |
|    learning_rate      | 0.0007   |
|    n_updates          | 7866     |
|    policy_loss        | -0.0604  |
|    value_loss         | 0.242    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 588      |
|    ep_rew_mean        | 7.95     |
| time/                 |          |
|    fps                | 67       |
|    iterations         | 2900     |
|    time_elapsed       | 853      |
|    total_timesteps    | 58000    |
| train/                |          |
|

# 4. Save and Reload Model

In [None]:
# Location of the saved model, relative to the notebook's working directory.
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_Breakout_Model')
# Persist the trained agent to that path.
model.save(path=a2c_path)

In [15]:
# Drop the in-memory model; the following cell restores it from the saved file.
del model

In [16]:
# Restore the agent from disk and attach it to the current environment.
model = A2C.load(a2c_path, env=env)


Wrapping the env in a VecTransposeImage.


# 5. Evaluate and Test

In [17]:
# Build a single-environment, frame-stacked Breakout env for evaluation.
# Note: this rebinds `env`, which previously named the 4-env training
# environment; that environment is not closed by this cell.
env = make_atari_env(env_id='Breakout-v0', n_envs=1, seed=0)
env = VecFrameStack(venv=env, n_stack=4)

In [19]:
# Run 10 rendered evaluation episodes; the returned tuple is displayed as the
# cell output.
evaluate_policy(model=model, env=env, n_eval_episodes=10, render=True)

(6.5, 3.2326459750489227)

In [20]:
# Close the evaluation environment now that evaluation is done.
env.close()