# **1. Import dependencies**

In [None]:
!pip install stable-baselines3[extra]

In [4]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

# **2. Test environment**

In [15]:
# Create a single Breakout environment for a quick manual test.
# render_mode is set to 'rgb_array' at construction time.
environment_name = "Breakout-v4"
env = gym.make(
    environment_name,
    render_mode='rgb_array',
)

In [16]:
# Display the environment's action space (last expression of the cell is shown as output).
env.action_space

Discrete(4)

In [17]:
# Display the environment's observation space (last expression of the cell is shown as output).
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [None]:
# Sanity-check the environment by playing a few episodes with random actions
# and printing the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    # Gymnasium's reset() returns an (observation, info) pair.
    obs, info = env.reset()
    done = False
    score = 0

    while not done:
        # The returned value of render() is not used here.
        env.render()
        action = env.action_space.sample()
        # Gymnasium's step() returns five values. An episode is over when it
        # is either terminated (game over) or truncated (e.g. a step limit),
        # so both flags must be checked or the loop can run past the end.
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print("Episode:{} Score:{}".format(episode, score))

# Release the environment's resources once the test episodes are finished.
env.close()

# **3. Vectorise Environment and Train Model**

In [26]:
# Build a vectorised Breakout environment made of 4 seeded copies running in
# parallel, then wrap it so that 4 frames are stacked per observation.
# Note: this rebinds `env`, replacing the single test environment used earlier.
env = VecFrameStack(
    make_atari_env('Breakout-v4', n_envs=4, seed=0),
    n_stack=4,
)

  and should_run_async(code)


In [27]:
# Directory (relative path) handed to the model as its TensorBoard log location.
log_path = os.path.join("Desktop")

In [28]:
# Create the A2C agent with the 'CnnPolicy' policy on the vectorised
# environment; verbose=1 prints training info and tensorboard_log sets where
# the TensorBoard logs are written.
model = A2C(
    policy='CnnPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

  and should_run_async(code)


Using cpu device
Wrapping the env in a VecTransposeImage.


In [29]:
# Train the agent for 100,000 environment timesteps. learn() is left as the
# cell's last expression, so its return value is displayed as the cell output.
model.learn(total_timesteps=100_000)

Logging to Desktop/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 284      |
|    ep_rew_mean        | 1.57     |
| time/                 |          |
|    fps                | 101      |
|    iterations         | 100      |
|    time_elapsed       | 19       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 0.145    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0218  |
|    value_loss         | 0.549    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 284      |
|    ep_rew_mean        | 1.61     |
| time/                 |          |
|    fps                | 110      |
|    iterations         | 200      |
|    time_elapsed       | 36       |
|    total_timesteps    | 4000     |
| train/     

<stable_baselines3.a2c.a2c.A2C at 0x7c5c605ee3e0>

# **4. Testing**

In [38]:
# Evaluate the trained agent over 10 episodes while rendering; the result is
# left as the cell's last expression so it is displayed as the cell output.
evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=10,
    render=True,
)

(7.2, 1.077032961426901)