<a href="https://colab.research.google.com/github/Maxime-Bakunzi/deep_q_learning-formative/blob/main/deep_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# Install the Atari environments, the ROM licence helper, the emulator bindings and the RL library.
# `%pip` (instead of `!pip`) installs into the environment of the kernel running this notebook,
# and the extras specifiers are quoted so the shell never interprets the brackets as a glob.
%pip install "gymnasium[atari]"
%pip install "autorom[accept-rom-license]"
%pip install ale-py
%pip install stable-baselines3



In [33]:
import gymnasium as gym
import numpy as np
import ale_py
from stable_baselines3 import DQN
from stable_baselines3.common.atari_wrappers import AtariWrapper  # For proper Atari pre-processing
from stable_baselines3.common.callbacks import BaseCallback

# Custom Callback to Log Training Details and Metrics

In [34]:
class TrainingLogger(BaseCallback):
    """Callback that records and prints the reward and length of every finished episode.

    A finished episode is recognised by an ``"episode"`` entry in one of the ``info``
    dicts exposed through ``self.locals["infos"]``; its ``"r"`` and ``"l"`` values are
    stored and printed.

    Attributes:
        episode_rewards: ``"r"`` value of each finished episode, in completion order.
        episode_lengths: ``"l"`` value of each finished episode, in completion order.
        current_rewards: Reward accumulated since the last finished episode.
        current_length: Number of callback steps since the last finished episode.

    Note:
        The two ``current_*`` counters are shared across all sub-environments, so they
        are only meaningful per episode when training on a single environment.
    """

    def __init__(self, verbose=0):
        """Create the logger.

        Args:
            verbose: Verbosity level forwarded to ``BaseCallback``.
        """
        super().__init__(verbose)
        self.episode_rewards = []
        self.episode_lengths = []
        self.current_rewards = 0
        self.current_length = 0

    def _on_step(self) -> bool:
        """Update the running counters and log any episode that just finished.

        Returns:
            Always ``True`` so that training is never interrupted by this callback.
        """
        # Track the episode currently in progress.
        self.current_length += 1
        # `rewards` is read defensively: it is only accumulated when the training loop
        # exposes it in its locals.
        step_rewards = self.locals.get("rewards")
        if step_rewards is not None:
            self.current_rewards += float(np.sum(step_rewards))

        # An 'episode' entry in an info dict marks the end of an episode.
        infos = self.locals.get("infos", [])
        for info in infos:
            if "episode" in info:
                episode_info = info["episode"]
                self.episode_rewards.append(episode_info["r"])
                self.episode_lengths.append(episode_info["l"])
                print(f"Episode Reward: {episode_info['r']:.2f} | Length: {episode_info['l']}")
                # Start counting afresh for the next episode; previously these counters
                # were never reset and therefore did not describe the current episode.
                self.current_rewards = 0
                self.current_length = 0

        return True

# Environment Setup

In [35]:
# Atari Boxing, created without a render mode because nothing needs to be displayed during training.
env_id = "ALE/Boxing-v5"
# AtariWrapper applies SB3's standard Atari preprocessing to the raw environment.
# NOTE(review): per the SB3 documentation this covers no-op resets, frame skipping,
# 84x84 grayscale resizing and reward clipping; frame stacking is NOT part of it.
env = AtariWrapper(gym.make(env_id, render_mode=None))

# Define Hyperparameters

## I. Hyperparameter Set

In [36]:
# Hyperparameter set I (baseline run).
learning_rate = 0.0001
gamma = 0.99
batch_size = 32

# Epsilon-greedy exploration settings for DQN.
epsilon_start = 1.0
epsilon_end = 0.02
# Divided by 1_000_000 when the model is built to obtain DQN's `exploration_fraction`,
# i.e. the share of the training run over which epsilon is annealed (1.0 => the whole run).
epsilon_decay = 1_000_000

In [37]:
# Mount Google Drive: the trained models are saved under /content/drive later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define the DQN Agent

In [38]:
# Run I uses the convolutional policy network.
policy = "CnnPolicy"

model = DQN(
    policy=policy,
    env=env,
    learning_rate=learning_rate,
    gamma=gamma,
    batch_size=batch_size,
    exploration_initial_eps=epsilon_start,
    exploration_final_eps=epsilon_end,
    # `exploration_fraction` is the fraction of the whole training run over which epsilon
    # is annealed; the divisor is a fixed 1_000_000, not the timesteps passed to learn().
    exploration_fraction=epsilon_decay / 1_000_000,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


# Training the Agent


In [39]:
# Train for a total of 500,000 timesteps (adjust based on performance)
total_timesteps = 500_000
# Keep a reference to the callback so the collected per-episode rewards and lengths
# stay available for inspection after training instead of being discarded.
training_logger = TrainingLogger()
model.learn(total_timesteps=total_timesteps, callback=training_logger)

# Save the trained model
model.save("/content/drive/My Drive/deep_q_learning/dqn_model.zip")
print("Model saved as dqn_model.zip")
# NOTE: `env` is deliberately not closed here. The later hyperparameter experiments build
# their models on this same environment object, which is closed after the final run.

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode Reward: -21.00 | Length: 444
Episode Reward: -9.00 | Length: 441
Episode Reward: -5.00 | Length: 442
Episode Reward: 0.00 | Length: 441
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 441      |
|    ep_rew_mean      | -6.06    |
|    exploration_rate | 0.927    |
| time/               |          |
|    episodes         | 84       |
|    fps              | 182      |
|    time_elapsed     | 202      |
|    total_timesteps  | 37022    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0666   |
|    n_updates        | 9230     |
----------------------------------
Episode Reward: -14.00 | Length: 444
Episode Reward: 0.00 | Length: 440
Episode Reward: -15.00 | Length: 441
Episode Reward: -5.00 | Length: 438
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 441      |
|    ep_rew_mean

# Trying Other Hyperparameters

## II. Hyperparameter Set

In [40]:
# Hyperparameter set II: higher learning rate, lower final epsilon.
learning_rate = 1e-3
gamma = 0.99
batch_size = 32

# Epsilon-greedy exploration settings for DQN.
epsilon_start = 1.0
epsilon_end = 0.01
# Divided by 1_000_000 when the model is built to obtain DQN's `exploration_fraction`,
# i.e. epsilon is annealed over the first 98% of the training run.
epsilon_decay = 0.98 * 1_000_000

In [41]:
# Run II swaps the convolutional policy for the fully connected (MLP) one.
policy = "MlpPolicy"

model = DQN(
    policy=policy,
    env=env,
    learning_rate=learning_rate,
    gamma=gamma,
    batch_size=batch_size,
    exploration_initial_eps=epsilon_start,
    exploration_final_eps=epsilon_end,
    # `exploration_fraction` is the fraction of the whole training run over which epsilon
    # is annealed; the divisor is a fixed 1_000_000, not the timesteps passed to learn().
    exploration_fraction=epsilon_decay / 1_000_000,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


### Training the Agent

In [42]:
# Train for a total of 100,000 timesteps (adjust based on performance)
total_timesteps = 100_000
# Keep a reference to the callback so the collected per-episode rewards and lengths
# stay available for inspection after training instead of being discarded.
training_logger = TrainingLogger()
model.learn(total_timesteps=total_timesteps, callback=training_logger)

# Save the trained model
model.save("/content/drive/My Drive/deep_q_learning/dqn_model2.zip")
print("Model saved as dqn_model2.zip")
# NOTE: `env` is deliberately not closed here. The next hyperparameter experiment builds
# its model on this same environment object, which is closed after the final run.

Episode Reward: -13.00 | Length: 330
Episode Reward: -11.00 | Length: 444
Episode Reward: 0.00 | Length: 443
Episode Reward: -7.00 | Length: 444
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 415      |
|    ep_rew_mean      | -7.75    |
|    exploration_rate | 0.983    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 217      |
|    time_elapsed     | 7        |
|    total_timesteps  | 1661     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.0388   |
|    n_updates        | 390      |
----------------------------------
Episode Reward: -3.00 | Length: 443
Episode Reward: -20.00 | Length: 441
Episode Reward: -5.00 | Length: 443
Episode Reward: -3.00 | Length: 444
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 429      |
|    ep_rew_mean      | -7.75    |
|    exploration_rate | 0.965    |
| time/   

## III. Hyperparameter Set

In [43]:
# Hyperparameter set III: highest learning rate, larger batches, higher final epsilon.
learning_rate = 2e-3
gamma = 0.99
batch_size = 64

# Epsilon-greedy exploration settings for DQN.
epsilon_start = 1.0
epsilon_end = 0.05
# Divided by 1_000_000 when the model is built to obtain DQN's `exploration_fraction`,
# i.e. epsilon is annealed over the first 99% of the training run.
epsilon_decay = 0.99 * 1_000_000

In [44]:
# Run III goes back to the convolutional (CNN) policy.
policy = "CnnPolicy"

model = DQN(
    policy=policy,
    env=env,
    learning_rate=learning_rate,
    gamma=gamma,
    batch_size=batch_size,
    exploration_initial_eps=epsilon_start,
    exploration_final_eps=epsilon_end,
    # `exploration_fraction` is the fraction of the whole training run over which epsilon
    # is annealed; the divisor is a fixed 1_000_000, not the timesteps passed to learn().
    exploration_fraction=epsilon_decay / 1_000_000,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


### Training the Agent

In [45]:
# Train for a total of 110,000 timesteps (adjust based on performance)
total_timesteps = 110_000
# Keep a reference to the callback so the collected per-episode rewards and lengths
# stay available for inspection after training instead of being discarded.
training_logger = TrainingLogger()
model.learn(total_timesteps=total_timesteps, callback=training_logger)

# Save the trained model
model.save("/content/drive/My Drive/deep_q_learning/dqn_model3.zip")
print("Model saved as dqn_model3.zip")
# Last training run in this notebook: the shared training environment can be released now.
env.close()

Episode Reward: 0.00 | Length: 400
Episode Reward: 6.00 | Length: 440
Episode Reward: -10.00 | Length: 439
Episode Reward: -13.00 | Length: 441
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 430      |
|    ep_rew_mean      | -4.25    |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 198      |
|    time_elapsed     | 8        |
|    total_timesteps  | 1720     |
| train/              |          |
|    learning_rate    | 0.002    |
|    loss             | 0.0422   |
|    n_updates        | 404      |
----------------------------------
Episode Reward: -1.00 | Length: 438
Episode Reward: -10.00 | Length: 438
Episode Reward: -17.00 | Length: 441
Episode Reward: -6.00 | Length: 439
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 434      |
|    ep_rew_mean      | -6.38    |
|    exploration_rate | 0.97     |
| time/   

In [47]:
# System SDL2 libraries (presumably needed for the on-screen "human" render mode used during
# evaluation - TODO confirm) plus the Python wrapper used to start a virtual display.
!apt-get install -y libsdl2-dev libsdl2-image-dev libsdl2-mixer-dev libsdl2-ttf-dev
# `%pip` installs into the environment of the kernel running this notebook.
%pip install pyvirtualdisplay

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  gir1.2-ibus-1.0 libdbus-1-dev libdecor-0-dev libdrm-dev libegl-dev libegl1-mesa-dev
  libfluidsynth3 libgbm-dev libgl-dev libgles-dev libgles1 libglu1-mesa libglu1-mesa-dev
  libglvnd-core-dev libglvnd-dev libglx-dev libibus-1.0-5 libibus-1.0-dev libinstpatch-1.0-2
  libmodplug1 libopengl-dev libopusfile0 libpciaccess-dev libpulse-dev libpulse-mainloop-glib0
  libsdl2-image-2.0-0 libsdl2-mixer-2.0-0 libsdl2-ttf-2.0-0 libsndio-dev libudev-dev libwayland-bin
  libwayland-dev libxcursor-dev libxfixes-dev libxi-dev libxinerama-dev libxkbcommon-dev
  libxrandr-dev libxt-dev libxv-dev libxxf86vm-dev timgm6mb-soundfont
Suggested packages:
  libwayland-doc libxt-doc fluid-soundfont-gm
The following NEW packages will be installed:
  gir1.2-ibus-1.0 libdbus-1-dev libdecor-0-dev libdrm-dev libegl-dev libegl1-mesa-dev
  libfluidsynth3 libgbm-dev li

In [49]:
# Xvfb is the headless X server; presumably the backend that pyvirtualdisplay's Display
# starts in the evaluation cell (verify against the pyvirtualdisplay documentation).
!apt-get install -y xvfb # install Xvfb

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common
The following NEW packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common xvfb
0 upgraded, 9 newly installed, 0 to remove and 29 not upgraded.
Need to get 7,814 kB of archives.
After this operation, 12.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libfontenc1 amd64 1:1.1.4-1build3 [14.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxfont2 amd64 1:2.0.5-1build1 [94.5 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxkbfile1 amd64 1:1.1.0-1build3 [71.8 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 x11-xkb-utils amd64 7.7+5build4 [172 kB]
Get:5 http://archiv

In [1]:
import gymnasium as gym
import time
import ale_py
from stable_baselines3 import DQN
from stable_baselines3.common.atari_wrappers import AtariWrapper
from gymnasium.wrappers import RecordVideo  # Import the video recorder wrapper
from pyvirtualdisplay import Display # Import Display from pyvirtualdisplay

# Start a virtual display so the "human" render mode can open a window on a headless machine.
display = Display(visible=0, size=(1400, 900))
display.start()

# Defined up front so the cleanup in `finally` is safe even if environment creation fails.
env = None
render_env = None

try:
    # ------------------------------
    # Load the Trained Model
    # ------------------------------
    model = DQN.load("/content/drive/My Drive/deep_q_learning/dqn_model.zip")

    # ------------------------------
    # Environment Setup for Evaluation with Video Recording
    # ------------------------------
    env_id = "ALE/Boxing-v5"
    # Create the environment with render_mode "rgb_array" to capture frames.
    env = gym.make(env_id, render_mode="rgb_array")
    env = AtariWrapper(env)

    # Wrap the environment with RecordVideo to record every episode
    # (the trigger returns True for every episode id).
    env = RecordVideo(env, video_folder="/content/drive/My Drive/deep_q_learning/videos/", episode_trigger=lambda episode_id: True)

    # Separate environment instance used only for real-time rendering.
    # NOTE: this is an independent emulator that merely receives the same actions; its
    # state is not synchronised with the recorded environment and may diverge from it.
    render_env = gym.make(env_id, render_mode="human")
    render_env = AtariWrapper(render_env)

    # ------------------------------
    # Playing with the Agent using a Greedy Policy
    # ------------------------------
    episodes = 5  # Number of evaluation episodes

    for ep in range(episodes):
        obs, info = env.reset()
        render_env.reset()  # Reset the render environment as well
        done = False
        episode_reward = 0
        while not done:
            # Choose the best action deterministically (greedy)
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward
            # The episode ends on either a terminal state or a truncation.
            done = terminated or truncated

            # Mirror the action in the render environment. If that environment finishes
            # first, reset it rather than stepping an environment that has already ended.
            _, _, render_terminated, render_truncated, _ = render_env.step(action)
            if render_terminated or render_truncated:
                render_env.reset()
            time.sleep(0.02)  # Slow down for visualization
        print(f"Episode {ep+1} Reward: {episode_reward}")
finally:
    # Always release the environments and the virtual display, even when evaluation fails
    # part-way through.
    if env is not None:
        env.close()
    if render_env is not None:
        render_env.close()
    display.stop()

  logger.warn(


Episode 1 Reward: 9.0


  """


Episode 2 Reward: 16.0
Episode 3 Reward: 8.0
Episode 4 Reward: 31.0
Episode 5 Reward: 6.0


## Merge Videos

In [2]:
# Pin MoviePy below 2.0: the merge cell imports `moviepy.editor`, which MoviePy 2.x removed.
%pip install "moviepy<2"



In [3]:
import os
from moviepy.editor import VideoFileClip, concatenate_videoclips

def merge_videos(video_folder, output_path, fps=24, codec="libx264"):
    """
    Merges all video files in a folder into a single output video.

    Files are concatenated in natural order, so "episode-2" comes before "episode-10".
    If the output file already sits inside ``video_folder`` (e.g. from an earlier run),
    it is skipped so it is never merged into itself.

    Args:
        video_folder: Path to the folder containing the video files (.mp4 / .avi).
        output_path: Path to save the merged video file.
        fps: Frame rate of the merged video. Defaults to 24.
        codec: Video codec passed to the writer. Defaults to "libx264".

    Raises:
        ValueError: If the folder contains no .mp4 or .avi file to merge.
    """
    import re  # local import: only needed for the natural-sort key below

    def _natural_key(name):
        # re.split with a capturing group alternates text / digit-run / text / ...,
        # so every odd position is a digit run that can be compared numerically.
        parts = re.split(r"(\d+)", name)
        return [int(part) if idx % 2 else part.lower() for idx, part in enumerate(parts)]

    target = os.path.abspath(output_path)
    video_files = sorted(
        (
            f for f in os.listdir(video_folder)
            if f.endswith(('.mp4', '.avi'))
            and os.path.abspath(os.path.join(video_folder, f)) != target
        ),
        key=_natural_key,
    )
    if not video_files:
        raise ValueError(f"No .mp4 or .avi files to merge in {video_folder!r}")

    clips = []
    final_clip = None
    try:
        for f in video_files:
            clips.append(VideoFileClip(os.path.join(video_folder, f)))
        final_clip = concatenate_videoclips(clips)
        final_clip.write_videofile(output_path, codec=codec, fps=fps)
    finally:
        # Close every clip that was opened so its reader resources are released,
        # whether or not the merge succeeded.
        if final_clip is not None:
            final_clip.close()
        for clip in clips:
            clip.close()

# Merge the recorded evaluation episodes into one file stored next to the videos folder.
video_folder = "/content/drive/My Drive/deep_q_learning/videos/"  # folder holding the recorded videos
output_path = "/content/drive/My Drive/deep_q_learning/merged_video.mp4"  # destination of the merged video
merge_videos(video_folder=video_folder, output_path=output_path)

  lines_video = [l for l in lines if ' Video: ' in l and re.search('\d+x\d+', l)]
  from scipy.ndimage.filters import sobel

  if event.key is 'enter':

  from pkg_resources import resource_stream, resource_exists

Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)

Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)

Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)



Moviepy - Building video /content/drive/My Drive/deep_q_learning/merged_video.mp4.
Moviepy - Writing video /content/drive/My Drive/deep_q_learning/merged_video.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/My Drive/deep_q_learning/merged_video.mp4


