<a href="https://colab.research.google.com/github/MaxiStahl1992/rl-pong/blob/main/RL_Pong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries

In [2]:
# Install swig and cmake through apt.
!apt install swig cmake
# Install the pinned requirements file from the Hugging Face deep-rl-class repository.
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit1/requirements-unit1.txt
# Install the RL stack used below: Gymnasium with the Atari extra, Stable-Baselines3 and ale-py.
!pip install gymnasium[atari]
!pip install stable_baselines3
!pip install ale-py
# Install OpenGL bindings, ffmpeg, xvfb and pyvirtualdisplay
# (pyvirtualdisplay is used further down to start a virtual display).
!sudo apt-get update
!sudo apt-get install -y python3-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay
# Install required packages with ROM license acceptance
!pip install "gymnasium[accept-rom-license]"
!pip install "autorom[accept-rom-license]"

# Use AutoROM to install the Atari ROMs
!AutoROM --accept-license

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 49 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 0s (2,975 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 123605 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubu

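The runtime must be restarted after installing the packages above so that the virtual display works. The next cell kills the current kernel process to force that restart; once the kernel is back, continue with the cell after it.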

In [None]:
import os

# Send signal 9 to this kernel's own process so the runtime restarts.
kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

In [1]:
# Start a non-visible 1400x900 virtual display.
from pyvirtualdisplay import Display

virtual_display = Display(size=(1400, 900), visible=0)
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7a08d0525600>

# Imports

In [20]:
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import random

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.atari_wrappers import AtariWrapper
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback

from huggingface_sb3 import push_to_hub, package_to_hub
from huggingface_hub import notebook_login

import torch
import torch.nn as nn
import torch.optim as optim

# Set Seed

In [21]:
# Single seed shared by every random number generator used in this notebook.
seed = 73

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7a089684aff0>

# Set device to GPU if available

In [22]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cpu


# Set General Environment Params

In [23]:
# Gymnasium id of the environment every agent in this notebook is trained on.
env_id = "ALE/Pong-v5"
# Number of environment copies placed in the vectorized training env.
num_envs = 8

In [24]:
# Look up the registered spec for the environment id and print it.
registered_spec = gym.envs.registry.get(env_id)
print(registered_spec)

EnvSpec(id='ALE/Pong-v5', entry_point='shimmy.atari_env:AtariEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=None, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={'game': 'pong', 'obs_type': 'rgb', 'repeat_action_probability': 0.25, 'full_action_space': False, 'frameskip': 4, 'max_num_frames_per_episode': 108000}, namespace='ALE', name='Pong', version=5, additional_wrappers=(), vector_entry_point=None)


# Stable Baselines Implementation

## PPO and A2C Implementation

### Create Vectorized Environments for PPO and A2C

In [25]:
def make_env(env_id, seed):
  """Build a factory that creates one seeded, monitored Atari training env.

  Args:
    env_id: Environment id passed to `gym.make`.
    seed: Seed applied to the environment and to its action space.

  Returns:
    A zero-argument callable that builds and returns the wrapped environment;
    a list of these callables is handed to `DummyVecEnv` below.
  """
  def _init():
    env = gym.make(env_id)
    env = AtariWrapper(env, clip_reward=True)
    env = Monitor(env)
    # Gymnasium environments are seeded through reset(seed=...); the legacy
    # env.seed() method is not part of the Gymnasium Env API. This also matches
    # how the DQN and evaluation environments in this notebook are seeded.
    env.reset(seed=seed)
    env.action_space.seed(seed)
    return env
  return _init

# Each environment copy gets its own seed, offset from the base seed.
envs = DummyVecEnv([make_env(env_id, seed + i) for i in range(num_envs)])

### Create Evaluation Environment for PPO and A2C

In [26]:
# Evaluation env: same wrappers as training, but with reward clipping turned off.
eval_env = Monitor(AtariWrapper(gym.make(env_id), clip_reward=False))

# Seed the environment and its action space.
eval_env.reset(seed=seed)
eval_env.action_space.seed(seed)

[73]

### Training Function with Reward Callback

In [27]:
class RewardCallback(BaseCallback):
  """Callback that collects the reward of every episode reported during training.

  Attributes:
    rewards: List of the `info['episode']['r']` values seen so far, in order.
  """

  def __init__(self):
    super().__init__()
    self.rewards = []

  def _on_step(self):
    """Record any episode reward found in this step's infos; always returns True."""
    # Only info dicts that carry an 'episode' entry contribute a reward.
    for step_info in self.locals.get('infos', ()):
      if 'episode' in step_info:
        self.rewards.append(step_info['episode']['r'])
    return True

In [28]:
def train_and_evaluate(model_class, envs, eval_env, total_timesteps=2000000, seed=73):
  """Train a "CnnPolicy" agent of the given class, then evaluate it.

  Args:
    model_class: Algorithm class to instantiate (called with PPO and A2C in this notebook).
    envs: Environment the agent is trained on.
    eval_env: Environment passed to `evaluate_policy`.
    total_timesteps: Number of timesteps passed to `learn`.
    seed: Seed passed to the model constructor.

  Returns:
    Tuple of (model, mean_reward, std_reward, training_time, episode_rewards), where
    training_time is wall-clock seconds covering model construction and `learn`,
    and episode_rewards is the list gathered by `RewardCallback`.
  """
  reward_tracker = RewardCallback()
  started_at = time.time()
  # Uses the notebook-level `device` chosen earlier.
  agent = model_class("CnnPolicy", envs, verbose=1, seed=seed, device=device)
  agent.learn(total_timesteps=total_timesteps, callback=reward_tracker)
  elapsed_seconds = time.time() - started_at
  eval_mean, eval_std = evaluate_policy(agent, eval_env, n_eval_episodes=10)
  return agent, eval_mean, eval_std, elapsed_seconds, reward_tracker.rewards

### Train PPO

In [None]:
# Train PPO on the vectorized envs and keep the model, evaluation stats, timing and rewards.
(
    ppo_model,
    ppo_mean_reward,
    ppo_std_reward,
    ppo_training_time,
    ppo_rewards,
) = train_and_evaluate(PPO, envs, eval_env)

Using cpu device
Wrapping the env in a VecTransposeImage.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 215         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 69          |
|    iterations           | 2           |
|    time_elapsed         | 471         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.010019276 |
|    clip_fraction        | 0.0778      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.78       |
|    explained_variance   | -0.000107   |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0737      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00417    |
|    value_loss           | 0.207       |
-----------------------------------------
------------------------------------------
| rollout/       

### Train A2C

In [None]:
# Train A2C on the same vectorized envs and keep the model, evaluation stats, timing and rewards.
(
    a2c_model,
    a2c_mean_reward,
    a2c_std_reward,
    a2c_training_time,
    a2c_rewards,
) = train_and_evaluate(A2C, envs, eval_env)

## DQN Implementation

### Create DQN Environment

In [None]:
def make_dqn_env(env_id, seed):
  """Create a single seeded, monitored Atari environment for DQN training.

  Args:
    env_id: Environment id passed to `gym.make`.
    seed: Seed applied to the environment and to its action space.

  Returns:
    The environment wrapped in `AtariWrapper` (reward clipping on) and `Monitor`.
  """
  wrapped = Monitor(AtariWrapper(gym.make(env_id), clip_reward=True))
  wrapped.reset(seed=seed)
  wrapped.action_space.seed(seed)
  return wrapped

# Single (non-vectorized) training environment used for DQN.
dqn_env = make_dqn_env(env_id, seed)

### Create DQN Evaluation Environment

In [None]:
# DQN evaluation env: same wrappers as training, but with reward clipping turned off.
eval_env_dqn = Monitor(AtariWrapper(gym.make(env_id), clip_reward=False))

# Seed the environment and its action space.
eval_env_dqn.reset(seed=seed)
eval_env_dqn.action_space.seed(seed)

### Training Function

In [None]:
def train_and_evaluate_dqn(env, eval_env, total_timesteps=2000000, seed=73):
  """Train a "CnnPolicy" DQN agent with fixed hyperparameters, then evaluate it.

  Args:
    env: Environment the agent is trained on.
    eval_env: Environment passed to `evaluate_policy`.
    total_timesteps: Number of timesteps passed to `learn`.
    seed: Seed passed to the DQN constructor.

  Returns:
    Tuple of (model, mean_reward, std_reward, training_time, episode_rewards), where
    training_time is wall-clock seconds covering model construction and `learn`,
    and episode_rewards is the list gathered by `RewardCallback`.
  """
  # DQN-specific settings, kept together so they are easy to find and tune.
  dqn_hyperparams = {
      "buffer_size": 100000,
      "learning_starts": 50000,
      "target_update_interval": 10000,
      "train_freq": 4,
      "exploration_fraction": 0.1,
      "exploration_final_eps": 0.01,
  }
  reward_tracker = RewardCallback()
  started_at = time.time()
  # Uses the notebook-level `device` chosen earlier.
  agent = DQN("CnnPolicy", env, verbose=1, seed=seed, device=device, **dqn_hyperparams)
  agent.learn(total_timesteps=total_timesteps, callback=reward_tracker)
  elapsed_seconds = time.time() - started_at
  eval_mean, eval_std = evaluate_policy(agent, eval_env, n_eval_episodes=10)
  return agent, eval_mean, eval_std, elapsed_seconds, reward_tracker.rewards

### Train DQN

In [None]:
# Train DQN on its own environment and keep the model, evaluation stats, timing and rewards.
(
    dqn_model,
    dqn_mean_reward,
    dqn_std_reward,
    dqn_training_time,
    dqn_rewards,
) = train_and_evaluate_dqn(dqn_env, eval_env_dqn)

## Compare Models

In [None]:
# One row per algorithm: name, evaluation mean/std, training time and timesteps trained.
columns = ['Model', 'Mean Reward', 'Std Reward', 'Training Time (s)', 'Total Steps']
results = [
    ('PPO', ppo_mean_reward, ppo_std_reward, ppo_training_time, ppo_model.num_timesteps),
    ('A2C', a2c_mean_reward, a2c_std_reward, a2c_training_time, a2c_model.num_timesteps),
    ('DQN', dqn_mean_reward, dqn_std_reward, dqn_training_time, dqn_model.num_timesteps),
]

# Pivot the rows into the column-oriented dict the DataFrame is built from.
data = {column: [row[position] for row in results] for position, column in enumerate(columns)}

df = pd.DataFrame(data)
df

### Visualize Training Rewards

In [None]:
def plot_rewards(rewards, algorithm_name):
  """Plot per-episode training rewards for one algorithm as a line chart.

  Args:
    rewards: Sequence of episode rewards, plotted in order.
    algorithm_name: Name used in the figure title.
  """
  _, ax = plt.subplots(figsize=(12, 6))
  ax.plot(rewards)
  ax.set_xlabel('Episodes')
  ax.set_ylabel('Reward')
  ax.set_title(f'{algorithm_name} Training Rewards')
  plt.show()

In [None]:
# One training-reward figure per algorithm.
plot_rewards(rewards=ppo_rewards, algorithm_name='PPO')
plot_rewards(rewards=a2c_rewards, algorithm_name='A2C')
plot_rewards(rewards=dqn_rewards, algorithm_name='DQN')

## Push Models to Hugging Face

In [None]:
# Log in to the Hugging Face Hub from the notebook before the models are pushed below.
notebook_login()

In [None]:
def push_model_to_hf(model, model_name, model_architecture, env_id, repo_id, used_eval_env, commit_message):
  """Save a trained model locally and package it to the Hugging Face Hub.

  Args:
    model: Trained model to save and upload.
    model_name: Forwarded to `package_to_hub` as `model_name`.
    model_architecture: Forwarded to `package_to_hub` as `model_architecture`.
    env_id: Id of the environment the model was trained on.
    repo_id: Hub repository id; also used as the local path given to `model.save`.
    used_eval_env: Evaluation environment forwarded to `package_to_hub` as `eval_env`.
    commit_message: Commit message for the upload.
  """
  # The repo id doubles as the local save path for the model file.
  model.save(repo_id)
  package_to_hub(model=model,
               model_name=model_name,
               model_architecture=model_architecture,
               env_id=env_id,
               eval_env=used_eval_env,
               repo_id=repo_id,
               commit_message=commit_message)

In [None]:
# Upload each trained agent together with the evaluation env it was scored on.
push_model_to_hf(
    model=ppo_model,
    model_name="ppo",
    model_architecture="PPO",
    env_id=env_id,
    repo_id="maxstahl/ppo-Pongv5",
    used_eval_env=eval_env,
    commit_message="PPO Training",
)
push_model_to_hf(
    model=a2c_model,
    model_name="a2c",
    model_architecture="A2C",
    env_id=env_id,
    repo_id="maxstahl/a2c-pongv5",
    used_eval_env=eval_env,
    commit_message="A2C Training",
)
push_model_to_hf(
    model=dqn_model,
    model_name="dqn",
    model_architecture="DQN",
    env_id=env_id,
    repo_id="maxstahl/dqn-pongv5",
    used_eval_env=eval_env_dqn,
    commit_message="DQN Training",
)