Dependencies

In [3]:
import gymnasium as gym 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

Let's look at the environment

In [2]:
# Build one Breakout environment for a first look at the game.
# render_mode="human" asks Gymnasium to display the game while it runs.
environment_name = "Breakout-v4"
env = gym.make(
    environment_name,
    render_mode="human",
)

In [7]:
# Play a few episodes with uniformly random actions to get a feel for the
# environment and for the scores an untrained agent reaches.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # Gymnasium's reset() returns an (observation, info) pair.
        state, info = env.reset()
        done = False
        score = 0

        while not done:
            # No explicit env.render() call: the env was created with
            # render_mode='human', so each step is displayed automatically.
            action = env.action_space.sample()
            # Gymnasium's step() returns five values; an episode is over when
            # it is either terminated (game over) or truncated (e.g. a time
            # limit), so both flags must be checked.
            n_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the game window, even if the loop is interrupted.
    env.close()

Episode:1 Score:0.0
Episode:2 Score:1.0
Episode:3 Score:1.0
Episode:4 Score:2.0
Episode:5 Score:2.0


In [8]:
# Show which actions the agent can choose from (displayed as the cell output).
env.action_space

Discrete(4)

In [9]:
# Show the shape and value range of a single observation (displayed as the cell output).
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

Vectorising and training

In [44]:
# Vectorised Breakout: n_envs=4 creates four copies of the game that are
# stepped together, seeded with 0 for repeatability.
env = make_atari_env(
    "Breakout-v4",
    n_envs=4,
    seed=0,
)

In [45]:
# Stack the 4 most recent frames of every sub-environment into a single
# observation. The 4 parallel environments themselves come from n_envs=4 in
# the make_atari_env call, not from this wrapper.
env = VecFrameStack(venv=env, n_stack=4)

In [47]:
# Start every sub-environment from a fresh game ...
env.reset()
# ... and display them: all 4 games should be visible here.
env.render(mode="human")

In [6]:
# Directory that receives the TensorBoard training logs.
log_path = os.path.join(
    "Training",
    "Logs",
)

In [7]:
# A2C agent with a convolutional policy, trained on the vectorised env;
# verbose=1 prints training progress and logs go to log_path for TensorBoard.
model = A2C(
    policy="CnnPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [5]:
# Train for 100k environment steps (this is the slow cell of the notebook).
model.learn(total_timesteps=100_000)

Logging to Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 281      |
|    ep_rew_mean        | 1.56     |
| time/                 |          |
|    fps                | 91       |
|    iterations         | 100      |
|    time_elapsed       | 21       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | -0.303   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0206  |
|    value_loss         | 0.00736  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 281      |
|    ep_rew_mean        | 1.61     |
| time/                 |          |
|    fps                | 87       |
|    iterations         | 200      |
|    time_elapsed       | 45       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x16c99ab33d0>

Saving

In [8]:
# File path under which the trained A2C model is saved and later reloaded.
a2c_path = os.path.join(
    "Training",
    "Saved Models",
    "A2C_model",
)

In [7]:
# Persist the trained agent to disk.
model.save(path=a2c_path)

In [9]:
# Drop the in-memory agent so the next cell demonstrates loading it back from disk.
del model

In [10]:
# Restore the saved agent and attach it to the current vectorised env.
model = A2C.load(a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


Evaluation

In [36]:
# Run 10 evaluation episodes while rendering; the cell output is the
# (mean reward, standard deviation of reward) pair.
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)

(6.1, 1.7)

In [42]:
# Rebuild the environment with a single game (n_envs=1) so the trained agent
# can be watched playing; frames are stacked 4 deep, as during training.
env = make_atari_env(
    "Breakout-v4",
    n_envs=1,
    seed=0,
)
env = VecFrameStack(venv=env, n_stack=4)

In [43]:
# Watch the trained agent play: for each episode, reset the env and keep
# stepping with the model's chosen action until the env reports the end.
# NOTE(review): `env` is the vectorised (VecFrameStack) env built in the
# previous cell, whose step() is unpacked into 4 values here; `reward` and
# `terminated` are presumably length-1 arrays rather than scalars, so `score`
# likely becomes an array too -- confirm before formatting or comparing it.
# NOTE(review): nothing visible in this cell reports `score` or closes the
# env; check whether that happens further down.
episodes = 100
for episode in range(1, episodes+1):
    state = env.reset() # initial (frame-stacked) observation for this episode
    terminated = False
    score = 0

    while not terminated:
        env.render(mode='human') # draw the current frame so the agent can be watched
        action, _ = model.predict(state) # let the trained model choose the next action
        state, reward, terminated, info = env.step(action)
        score+=reward
