<a href="https://colab.research.google.com/github/Isaiah-Essien/deep_q_network_formative_group2/blob/main/DQN_agent_formative_group2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Q-learning With Atari

In [1]:
#installs
%%capture
!pip install stable-baselines3[extra]
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install ale-py
!pip install opencv-python

In [2]:
#--------------------All imports----------------
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
import ale_py
import torch
import time
from gymnasium.wrappers import RecordVideo

In [None]:
# Select the compute device: the GPU when CUDA is actually usable, otherwise the CPU.
# is_available has to be *called*; the bare function object is always truthy,
# which would select 'cuda' even on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [None]:
#---------train.py file--------------------
# Train a DQN agent with a CNN policy on Atari Breakout, keep the best model
# found during periodic evaluation, save the final model and print its score.
gym.register_envs(ale_py)

#** Set the Atari environment
# NOTE(review): the raw ALE environment is used as-is (no Atari preprocessing
# or frame-stacking wrappers are applied in this cell) -- confirm this is intended.
env_id = 'ALE/Breakout-v5'
env = gym.make(env_id, render_mode='rgb_array')

#** Separate environment used only for evaluation
env_eval = gym.make(env_id, render_mode='rgb_array')

#** Experiment with policies
policy_type = 'CnnPolicy'

#* Define the DQN agent model
model = DQN(
    policy=policy_type,
    env=env,
    learning_rate=1e-5,
    buffer_size=50000,
    learning_starts=50000,       # collect this many steps before the first gradient update
    batch_size=32,
    gamma=0.99,
    exploration_fraction=0.1,    # epsilon is annealed over the first 10% of training
    exploration_initial_eps=1.0,
    exploration_final_eps=0.02,
    verbose=1,
    train_freq=5,
    device=device,
    tensorboard_log='./dqn_agent_tensorboard_log/'
)

#** Evaluation callback: every `eval_freq` steps run 7 deterministic episodes on
#   env_eval and store the best-scoring model under ./logs/best_model/
eval_callback = EvalCallback(
    env_eval,
    best_model_save_path='./logs/best_model/',
    log_path='./logs/results/',
    eval_freq=50000,
    n_eval_episodes=7,
    deterministic=True,
    render=False
)

# Train the model
timesteps = 1000000
model.learn(total_timesteps=timesteps, progress_bar=False, callback=eval_callback)

# Release the replay buffer to free memory; it is not needed for evaluation.
# (Stable-Baselines3's model.save() does not store the replay buffer anyway.)
model.replay_buffer = None

# Save the final model. The "_cnn_" name matches the file that the download
# cell and the Drive path ('dqn_agent_breakout_cnn_final.zip') expect;
# model.save() appends the ".zip" extension itself.
model.save('dqn_agent_breakout_cnn_final')

#* Evaluate the trained model
mean_reward, std_reward = evaluate_policy(
    model,
    env_eval,
    n_eval_episodes=15,
    deterministic=True
)
print(f'final reward: {mean_reward}+/-{std_reward}')

# Release the emulator instances so that re-running this cell (or the next
# training cell, which rebinds `env` / `env_eval`) does not leak them.
env.close()
env_eval.close()



Eval num_timesteps=50000, episode_reward=0.00 +/- 0.00
Episode length: 27000.00 +/- 0.00


New best mean reward!


Eval num_timesteps=100000, episode_reward=0.00 +/- 0.00
Episode length: 27000.00 +/- 0.00


Eval num_timesteps=150000, episode_reward=2.14 +/- 0.35
Episode length: 27000.00 +/- 0.00


New best mean reward!


Eval num_timesteps=200000, episode_reward=2.00 +/- 0.00
Episode length: 27000.00 +/- 0.00


Eval num_timesteps=250000, episode_reward=0.00 +/- 0.00
Episode length: 27000.00 +/- 0.00


Eval num_timesteps=300000, episode_reward=0.00 +/- 0.00
Episode length: 27000.00 +/- 0.00


Eval num_timesteps=350000, episode_reward=0.00 +/- 0.00
Episode length: 27000.00 +/- 0.00


Eval num_timesteps=400000, episode_reward=0.00 +/- 0.00
Episode length: 27000.00 +/- 0.00


Eval num_timesteps=450000, episode_reward=1.57 +/- 1.05
Episode length: 19348.43 +/- 12098.21


Eval num_timesteps=500000, episode_reward=2.00 +/- 0.53
Episode length: 27000.00 +/- 0.00


Eval num_timesteps=550000, episode_reward=2.43 +/- 1.76
Episode length: 4030.29 +/- 9377.53


New best mean reward!


Eval num_timesteps=600000, episode_reward=3.14 +/- 0.64
Episode length: 4051.71 +/- 9368.64


New best mean reward!


Eval num_timesteps=650000, episode_reward=4.00 +/- 2.07
Episode length: 4062.14 +/- 9364.45


New best mean reward!


Eval num_timesteps=700000, episode_reward=2.57 +/- 0.73
Episode length: 4025.43 +/- 9379.36


Eval num_timesteps=750000, episode_reward=4.29 +/- 2.55
Episode length: 260.43 +/- 66.48


New best mean reward!


Eval num_timesteps=800000, episode_reward=2.86 +/- 1.25
Episode length: 11710.43 +/- 13241.17


Eval num_timesteps=850000, episode_reward=3.29 +/- 1.16
Episode length: 7881.00 +/- 12092.00


Eval num_timesteps=900000, episode_reward=3.86 +/- 0.99
Episode length: 4069.29 +/- 9361.49


Eval num_timesteps=950000, episode_reward=3.57 +/- 0.49
Episode length: 250.71 +/- 32.86


Eval num_timesteps=1000000, episode_reward=3.86 +/- 1.12
Episode length: 4074.00 +/- 9359.60


final reward: 5.333333333333333+/-1.349897115421106


In [None]:
#---------------------training with MLP policy----------------
# Same training pipeline as the CNN run, but with an MLP policy and a smaller
# replay buffer, for comparison.
# NOTE(review): MlpPolicy is applied to image observations here; presumably a
# deliberate comparison against CnnPolicy -- confirm.
gym.register_envs(ale_py)

#** Set the Atari environment
env_id = 'ALE/Breakout-v5'
env = gym.make(env_id, render_mode='rgb_array')

#** Separate environment used only for evaluation
env_eval = gym.make(env_id, render_mode='rgb_array')

#** Experiment with policies
policy_type = 'MlpPolicy'

#* Define the DQN agent model
model = DQN(
    policy=policy_type,
    env=env,
    learning_rate=1e-5,
    buffer_size=10000,
    learning_starts=50000,       # collect this many steps before the first gradient update
    batch_size=32,
    gamma=0.99,
    exploration_fraction=0.1,    # epsilon is annealed over the first 10% of training
    exploration_initial_eps=1.0,
    exploration_final_eps=0.02,
    verbose=1,
    train_freq=5,
    device=device,
    tensorboard_log='./dqn_agent_tensorboard_log/'
)

#** Evaluation callback.
# Dedicated output folders: writing to ./logs/best_model/ and ./logs/results/
# would overwrite the best model and evaluation log produced by the CNN run.
eval_callback = EvalCallback(
    env_eval,
    best_model_save_path='./logs/best_model_mlp/',
    log_path='./logs/results_mlp/',
    eval_freq=50000,
    n_eval_episodes=7,
    deterministic=True,
    render=False
)

# Train the model
timesteps = 500000
model.learn(total_timesteps=timesteps, progress_bar=False, callback=eval_callback)

# Release the replay buffer to free memory; it is not needed for evaluation.
# (Stable-Baselines3's model.save() does not store the replay buffer anyway.)
model.replay_buffer = None

# Save the final model (model.save() appends the ".zip" extension itself)
model.save('dqn_agent_breakout_mlp_final')

#* Evaluate the trained model
mean_reward, std_reward = evaluate_policy(
    model,
    env_eval,
    n_eval_episodes=15,
    deterministic=True
)
print(f'final reward: {mean_reward}+/-{std_reward}')

# Release the emulator instances so re-running the cell does not leak them.
env.close()
env_eval.close()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./dqn_agent_tensorboard_log/DQN_5




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 169      |
|    ep_rew_mean      | 1        |
|    exploration_rate | 0.987    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 828      |
|    time_elapsed     | 0        |
|    total_timesteps  | 677      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 165      |
|    ep_rew_mean      | 0.875    |
|    exploration_rate | 0.974    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 835      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1322     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 166      |
|    ep_rew_mean      | 0.917    |
|    exploration_rate | 0.961    |
| time/               |          |
|    episodes       



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 242      |
|    ep_rew_mean      | 1.96     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 460      |
|    fps              | 147      |
|    time_elapsed     | 699      |
|    total_timesteps  | 102941   |
| train/              |          |
|    learning_rate    | 1e-05    |
|    loss             | 0.0138   |
|    n_updates        | 10588    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 242      |
|    ep_rew_mean      | 1.98     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 464      |
|    fps              | 147      |
|    time_elapsed     | 702      |
|    total_timesteps  | 103937   |
| train/              |          |
|    learning_rate    | 1

In [None]:
import os
from google.colab import files

# Download the trained CNN model to the local machine.
# Stable-Baselines3's model.save() already writes a .zip archive, so the file
# is downloaded as-is. Appending another ".zip" and compressing it again would
# produce "<name>.zip.zip", which DQN.load() cannot read without unpacking first.
model_name = 'dqn_agent_breakout_cnn_final.zip'
zip_filename = model_name  # kept as a separate name for backward compatibility

# Fail early with a clear message instead of letting the download call error out.
if not os.path.exists(zip_filename):
    raise FileNotFoundError(
        f"'{zip_filename}' not found in {os.getcwd()}; "
        "run the CNN training cell (or upload the saved model) first."
    )

# Download the model archive
files.download(zip_filename)


Zip file already exists, skipping zip step...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
#---------------------Load model from Drive----------
# Mount Google Drive at /content/drive and record where the saved model archive
# lives. This cell only defines the path; the model itself is loaded later.
from google.colab import drive
drive.mount('/content/drive')

# Path of the saved model archive inside the mounted Drive.
model_path='/content/drive/MyDrive/ALU/dqn_agent_breakout_cnn_final.zip'

Mounted at /content/drive


In [5]:
#--------------------------Play.py script----------
# Load the trained DQN agent and let it play Breakout with deterministic
# actions, recording a video of every episode into ./videos/.

#** Load the environment
env_id = 'ALE/Breakout-v5'
# Use render_mode='human' instead to watch the game in a local window.
env = gym.make(env_id, render_mode='rgb_array')
# Wrap environment for recording videos (saved in "./videos/" folder);
# the trigger returns True for every episode, so all of them are recorded.
env = RecordVideo(env, video_folder='./videos/', episode_trigger=lambda episode_id: True)

#** Load the trained model
model = DQN.load(model_path)

# Number of episodes to play
episodes = 5

try:
    for episode in range(episodes):
        obs, info = env.reset()
        done = False
        total_reward = 0

        # Step until the game ends (terminated) or the step limit is hit (truncated)
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            total_reward += reward

        print(f'Episodes {episode+1}\n Reward: {total_reward}')
finally:
    # Always close the environment, even if an episode raises: closing is what
    # lets the RecordVideo wrapper finish the recording and frees the emulator.
    env.close()


Episodes 1
 Reward: 3.0


  """


Episodes 2
 Reward: 4.0
Episodes 3
 Reward: 8.0
Episodes 4
 Reward: 3.0
Episodes 5
 Reward: 4.0
