### 1. Import Dependencies

In [1]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os 
import ale_py
from ale_py import ALEInterface

### 2. Test Environment 

In [2]:
gym.register_envs(ale_py)

environment_name = 'Breakout-v4'
env = gym.make(environment_name, render_mode='human')

A.L.E: Arcade Learning Environment (version 0.11.2+ecc1138)
[Powered by Stella]


In [3]:
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], shape=(210, 160, 3), dtype=uint8),
 {'lives': 5, 'episode_frame_number': 0, 'frame_number': 0})

In [None]:
env.action_space  

Discrete(4)

In [6]:
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [7]:
episodes = 5 
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    terminated = False
    truncated = False

    while not (terminated or truncated):
        env.render()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
    print(f"Episode:{episode} Score:{score}")
env.close()

Episode:1 Score:2.0
Episode:2 Score:1.0
Episode:3 Score:0.0
Episode:4 Score:0.0
Episode:5 Score:3.0


### 3. Vectorise Environement and Train Model 

In [4]:
env = make_atari_env(environment_name, n_envs=4, seed=42)
env = VecFrameStack(env, n_stack=4)

In [5]:
log_path = os.path.join('..', 'Training', 'Logs', environment_name)
model = A2C('CnnPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [23]:
model.learn(total_timesteps=100000)

Logging to ../Training/Logs/Breakout-v4/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 299      |
|    ep_rew_mean        | 1.97     |
| time/                 |          |
|    fps                | 438      |
|    iterations         | 100      |
|    time_elapsed       | 4        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 0.326    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.016   |
|    value_loss         | 0.00265  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 282      |
|    ep_rew_mean        | 1.63     |
| time/                 |          |
|    fps                | 434      |
|    iterations         | 200      |
|    time_elapsed       | 9        |
|    total_timesteps    | 400

<stable_baselines3.a2c.a2c.A2C at 0x7fccc99c38e0>

### 4. Save and Reload Model 

In [6]:
a2c_path = os.path.join('..', 'Training', 'Saved Models', environment_name, 'A2C_Breakout_Model')
a2c_path

'../Training/Saved Models/Breakout-v4/A2C_Breakout_Model'

In [25]:
model.save(a2c_path)



In [7]:
model = A2C.load(a2c_path, env)

Wrapping the env in a VecTransposeImage.


### 5. Evaluate and Test

In [8]:
env.reset()

array([[[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        ...,

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]],


       [[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0

In [34]:
env = make_atari_env(environment_name, n_envs=1)
env = VecFrameStack(env, n_stack=4)



evaluate_policy(model, env, n_eval_episodes = 10, render = True)

(np.float64(7.6), np.float64(2.0591260281974))

In [33]:
vec_env = model.get_env() 
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True) 
    obs, rewards, dones, info = vec_env.step(action) 
    vec_env.render("human")

KeyboardInterrupt: 

### 6. Tes in gymnasium Environment

In [32]:
import time

vec_env = model.get_env() 
obs = vec_env.reset()
episodes = 5 
for episode in range(1, episodes+1):
    done = False
    score = 0 

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = vec_env.step(action)
        print(rewards)
        score += rewards
        vec_env.render("human")
        time.sleep(0.003)
        done = dones[0]
    print(f"Episode:{episode} Score:{score}")
env.close()

[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 1. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[1. 0. 1. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 1. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[1. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 1. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 1. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[1. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[1. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 1. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
Episode:1 Score:[4. 3. 3. 3.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 1. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
Episode:2 Score:[0. 1. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
Episode:3 Score:[0. 0. 0. 1.]
[0. 0. 1. 0.]
