<a href="https://colab.research.google.com/github/Maxime-Bakunzi/deep_q_learning-formative/blob/main/deep_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install gymnasium[atari]
!pip install "autorom[accept-rom-license]"
!pip install ale-py
!pip install stable-baselines3



In [6]:
import gymnasium as gym
import numpy as np
import ale_py
from stable_baselines3 import DQN
from stable_baselines3.common.atari_wrappers import AtariWrapper  # For proper Atari pre-processing
from stable_baselines3.common.callbacks import BaseCallback

# Custom Callback to Log Training Details and Metrics

In [7]:
class TrainingLogger(BaseCallback):
    def __init__(self, verbose=0):
        super(TrainingLogger, self).__init__(verbose)
        self.episode_rewards = []
        self.episode_lengths = []
        self.current_rewards = 0
        self.current_length = 0

    def _on_step(self) -> bool:
        # Increase counters at every step
        self.current_length += 1
        # Check if 'infos' contains an 'episode' key, then log the metrics
        infos = self.locals.get("infos", [])
        for info in infos:
            if "episode" in info:
                episode_info = info["episode"]
                self.episode_rewards.append(episode_info["r"])
                self.episode_lengths.append(episode_info["l"])
                print(f"Episode Reward: {episode_info['r']:.2f} | Length: {episode_info['l']}")
                # You can also log additional metrics or compute running averages here
        return True

    def _on_rollout_end(self):
        # At the end of each rollout, you might calculate the average reward over recent episodes
        if self.episode_rewards:
            avg_reward = np.mean(self.episode_rewards[-10:])
            print(f"Average Reward (last 10 episodes): {avg_reward:.2f}")

# Environment Setup

In [8]:
# Use the Boxing environment with the specified configuration
env_id = "ALE/Boxing-v5"
# For training, we do not need to render (render_mode=None)
env = gym.make(env_id, render_mode=None)
# Apply the Atari wrappers to preprocess frames (e.g., resizing, frame stacking, etc.)
env = AtariWrapper(env)

# Define Hyperparameters

## I. Hyperparameters

In [9]:
# These are 1st hyperparameters.
learning_rate = 1e-4
gamma = 0.99
batch_size = 32
# Epsilon parameters for exploration in DQN:
epsilon_start = 1.0
epsilon_end = 0.02
epsilon_decay = 1000000  # The number of timesteps over which epsilon decays

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define the DQN Agent

In [11]:
# Experimenting using CNN-based policy.
policy = "CnnPolicy"

model = DQN(
    policy,
    env,
    learning_rate=learning_rate,
    gamma=gamma,
    batch_size=batch_size,
    verbose=1,
    exploration_initial_eps=epsilon_start,
    exploration_final_eps=epsilon_end,
    # Adjust the exploration fraction to set decay relative to total timesteps
    exploration_fraction=epsilon_decay / 1_000_000,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


# Training the Agent


In [12]:
# Train for a total of 500,000 timesteps (adjust based on performance)
total_timesteps = 500_000
model.learn(total_timesteps=total_timesteps, callback=TrainingLogger())

# Save the trained model
model.save("/content/drive/My Drive/deep_q_learning/dqn_model.zip")
print("Model saved as dqn_model.zip")
env.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes): 10.70
Average Reward (last 10 episodes)

# Trying Other Hyperparameters

## II. Hyperparameter Set

In [13]:
# Experimenting with these parameters.
learning_rate = 0.001
gamma = 0.99
batch_size = 32
# Epsilon parameters for exploration in DQN:
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995 * 1000000  # The number of timesteps over which epsilon decays

In [14]:
# Experimenting with an MLP-based policy.
policy = "MlpPolicy"

model = DQN(
    policy,
    env,
    learning_rate=learning_rate,
    gamma=gamma,
    batch_size=batch_size,
    verbose=1,
    exploration_initial_eps=epsilon_start,
    exploration_final_eps=epsilon_end,
    # Adjust the exploration fraction to set decay relative to total timesteps
    exploration_fraction=epsilon_decay / 1_000_000,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


### Training the Agent

In [15]:
# Train for a total of 500,000 timesteps (adjust based on performance)
total_timesteps = 300_000
model.learn(total_timesteps=total_timesteps, callback=TrainingLogger())

# Save the trained model
model.save("/content/drive/My Drive/deep_q_learning/dqn_model2.zip")
print("Model saved as dqn_model.zip")
env.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Reward (last 10 episodes): -18.80
Average Rew

## III. Hyperparameter Set

In [16]:
# Experimenting with these parameters.
learning_rate = 0.002
gamma = 0.99
batch_size = 64
# Epsilon parameters for exploration in DQN:
epsilon_start = 1.0
epsilon_end = 0.05
epsilon_decay = 0.99 * 1000000  # The number of timesteps over which epsilon decays

In [17]:
# Experimenting with an CNN-based policy.
policy = "CnnPolicy"

model = DQN(
    policy,
    env,
    learning_rate=learning_rate,
    gamma=gamma,
    batch_size=batch_size,
    verbose=1,
    exploration_initial_eps=epsilon_start,
    exploration_final_eps=epsilon_end,
    # Adjust the exploration fraction to set decay relative to total timesteps
    exploration_fraction=epsilon_decay / 1_000_000,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


### Training the Agent

In [18]:
# Train for a total of 500,000 timesteps (adjust based on performance)
total_timesteps = 250_000
model.learn(total_timesteps=total_timesteps, callback=TrainingLogger())

# Save the trained model
model.save("/content/drive/My Drive/deep_q_learning/dqn_model3.zip")
print("Model saved as dqn_model.zip")
env.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes): -1.00
Average Reward (last 10 episodes)