### 1. Import Dependencies

In [2]:
import gym 
from stable_baselines3 import A2C # A2C algorithm
from stable_baselines3.common.vec_env import VecFrameStack 
# We are going to train 4 environments at the same time,
# so we use a vectorized environment (with frame stacking) to speed up training.
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
# helps us work with Atari environments
import os


### 2. Test Environment

In [3]:
# Shell escape: run atari_py's ROM importer on the local .\Roms\ROMS folder.
# The output below shows each ROM being copied into atari_py's atari_roms directory.
!python -m atari_py.import_roms .\Roms\ROMS

copying adventure.bin from .\Roms\ROMS\Adventure (1980) (Atari, Warren Robinett) (CX2613, CX2613P) (PAL).bin to c:\Users\hongh\AppData\Local\Programs\Python\Python310\lib\site-packages\atari_py\atari_roms\adventure.bin
copying air_raid.bin from .\Roms\ROMS\Air Raid (1982) (Men-A-Vision) (PAL) ~.bin to c:\Users\hongh\AppData\Local\Programs\Python\Python310\lib\site-packages\atari_py\atari_roms\air_raid.bin
copying alien.bin from .\Roms\ROMS\Alien (1982) (20th Century Fox Video Games, Douglas 'Dallas North' Neubauer) (11006) ~.bin to c:\Users\hongh\AppData\Local\Programs\Python\Python310\lib\site-packages\atari_py\atari_roms\alien.bin
copying amidar.bin from .\Roms\ROMS\Amidar (1982) (Parker Brothers, Ed Temple) (PB5310) ~.bin to c:\Users\hongh\AppData\Local\Programs\Python\Python310\lib\site-packages\atari_py\atari_roms\amidar.bin
copying assault.bin from .\Roms\ROMS\Assault (AKA Sky Alien) (1983) (Bomb - Onbase) (CA281).bin to c:\Users\hongh\AppData\Local\Programs\Python\Python310\lib\

In [4]:
# Gym environment id used for the manual sanity check in the following cells.
environment_name = 'Breakout-v0'
# Build a single (non-vectorized) environment from that id.
env = gym.make(environment_name)

In [5]:
env.reset()
# reset() hands back the initial observation; the output below shows it as a uint8 array.

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

In [6]:
env.action_space
# The action space is Discrete(4), i.e. there are 4 possible actions.

Discrete(4)

In [7]:
env.observation_space
# Observations are a Box of pixel values (low 0, high 255), so this is an image-based problem.

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

In [8]:
episodes = 5

# Sanity check: play a few episodes with randomly sampled actions and
# print the total reward collected in each one.
for episode in range(1, episodes + 1):
    # Start of a new episode: zero the running score and reset the environment.
    score, done = 0, False
    obs = env.reset()

    while True:
        env.render(mode='rgb_array')
        # Random policy: draw an action from the action space.
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
    


Episode:1 Score:0.0
Episode:2 Score:0.0
Episode:3 Score:2.0
Episode:4 Score:1.0
Episode:5 Score:0.0


In [10]:
# Close the test environment now that the random rollouts are finished.
env.close()

### 3. Vectorise Environment and Train Model

This time we do something different from before: we vectorize the environment and train on 4 environments at the same time.

In [11]:
# Create 4 copies of the Atari environment (seed 0) to be stepped together during training.
# NOTE(review): make_atari_env applies its own Atari preprocessing; SB3 examples use the
# 'NoFrameskip' env ids with it — confirm 'Breakout-v0' does not skip frames twice.
env = make_atari_env('Breakout-v0',n_envs=4,seed=0)
# Wrap the vectorized env with frame stacking (n_stack=4 frames per observation).
env = VecFrameStack(env,n_stack=4)

In [15]:
# Directory passed to the model for TensorBoard logging: Training/Logs.
log_path = os.path.join('Training','Logs')
# A2C with the CNN policy, since the observations are images; verbose=1 prints training info.
model = A2C('CnnPolicy',env,verbose=1,tensorboard_log = log_path)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [16]:
# Train for 100,000 timesteps; the rollout/time/train tables below are the periodic progress logs.
model.learn(total_timesteps=100000)

Logging to Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 286      |
|    ep_rew_mean        | 1.71     |
| time/                 |          |
|    fps                | 62       |
|    iterations         | 100      |
|    time_elapsed       | 32       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | -0.00703 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.328    |
|    value_loss         | 0.242    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 286      |
|    ep_rew_mean        | 1.7      |
| time/                 |          |
|    fps                | 67       |
|    iterations         | 200      |
|    time_elapsed       | 59       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x20548eb3280>

: 

In [13]:
# Render the current state of the vectorized training environment.
env.render()


In [14]:
# Close the training environments; a fresh environment is created later for evaluation.
env.close()

### 4. Save and Reload Model

In [None]:
# Path the trained model is saved to.
# Fix: the path helper is os.path.join; `os.join.path` does not exist and raises AttributeError.
a2c_path = os.path.join('Training','Save Models','A2C_Breakout_Model')
# Persist the trained A2C model to disk at that path.
model.save(a2c_path)

In [None]:
# Drop the in-memory model so that the next cell has to restore it from disk.
del model

In [None]:
# Restore the saved model from disk.
# Fix: `model` no longer exists after `del model`, and load() is a classmethod that
# returns a new model — so call it on the A2C class and bind the result to `model`.
model = A2C.load(a2c_path, env)

### 5. Evaluate and Test

In [None]:
# Evaluation uses a single environment (n_envs=1) with the same seed as training.
env = make_atari_env('Breakout-v0',n_envs=1,seed=0)
# Keep the same 4-frame stacking that the model was trained with.
env = VecFrameStack(env,n_stack = 4)

In [None]:
# Run the model for 10 evaluation episodes with rendering enabled.
# Presumably returns (mean reward, std of reward) per the SB3 docs — confirm.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# NOTE(review): same call as the preceding cell (10 rendered evaluation episodes);
# keep only if a second evaluation run is intended.
evaluate_policy(model,env,n_eval_episodes = 10, render = True)