# 1. Imports

In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import random
import json

from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.atari_wrappers import AtariWrapper
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback, CallbackList
from stable_baselines3.common.logger import configure

from huggingface_sb3 import package_to_hub
from huggingface_hub import notebook_login

import torch

In [2]:
# Show the Hugging Face login prompt. Presumably the uploads in section 8
# rely on the credentials entered here -- TODO confirm.
notebook_login()

  self.comm = Comm(**args)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

  self.comm = Comm(**args)


# 2. Global parameters

In [3]:
# Single seed shared by the stdlib, NumPy and PyTorch random generators.
seed = 73
for apply_seed in (random.seed, np.random.seed):
    apply_seed(seed)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(seed)


<torch._C.Generator at 0x321c84a90>

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cpu


In [5]:
# Environment trained on, and how many copies are created for the vectorized env.
env_id = "PongNoFrameskip-v4"
num_envs = 8
# Wall-clock training budget in seconds: 3 h = 10800 (6 h = 21600, 9 h = 32400).
training_time_max = 3 * 60 * 60

In [6]:
# Look up the registered spec for env_id and print it for inspection.
registered_spec = gym.envs.registry.get(env_id)
print(registered_spec)

EnvSpec(id='PongNoFrameskip-v4', entry_point='shimmy.atari_env:AtariEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=None, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={'game': 'pong', 'obs_type': 'rgb', 'repeat_action_probability': 0.0, 'full_action_space': False, 'max_num_frames_per_episode': 108000, 'frameskip': 1}, namespace=None, name='PongNoFrameskip', version=4, additional_wrappers=(), vector_entry_point=None)


# 3. Environment Setup

In [7]:
def make_env(env_id, seed):
    """Return a zero-argument factory that builds one seeded training environment.

    Args:
        env_id: Environment id passed to ``gym.make``.
        seed: Seed applied to the environment and to its action space.

    Returns:
        A callable that creates the environment wrapped in ``AtariWrapper``
        (``clip_reward=True``, ``terminal_on_life_loss=False``) and ``Monitor``.
    """
    def _init():
        env = gym.make(env_id, render_mode="rgb_array")
        env = AtariWrapper(env, clip_reward=True, terminal_on_life_loss=False)
        env = Monitor(env)
        # Gymnasium's Env API has no seed() method; seeding goes through
        # reset(seed=...). Calling env.seed() on the wrapped env only works
        # while wrappers forward unknown attributes to the inner env.
        env.reset(seed=seed)
        env.action_space.seed(seed)
        return env
    return _init

def make_eval_env(env_id, seed):
    """Build a single seeded evaluation environment with unclipped rewards.

    Args:
        env_id: Environment id passed to ``gym.make``.
        seed: Seed applied to the environment and to its action space.

    Returns:
        The environment wrapped in ``AtariWrapper`` (``clip_reward=False``,
        ``terminal_on_life_loss=False``) and ``Monitor``.
    """
    eval_env = gym.make(env_id, render_mode="rgb_array")
    eval_env = AtariWrapper(eval_env, clip_reward=False, terminal_on_life_loss=False)
    eval_env = Monitor(eval_env)
    # Gymnasium's Env API has no seed() method; seeding goes through
    # reset(seed=...). Calling env.seed() on the wrapped env only works
    # while wrappers forward unknown attributes to the inner env.
    eval_env.reset(seed=seed)
    eval_env.action_space.seed(seed)
    return eval_env

def make_dqn_env(env_id, seed):
    """Build the single seeded environment that DQN is trained on.

    Args:
        env_id: Environment id passed to ``gym.make``.
        seed: Seed applied to the environment and to its action space.

    Returns:
        The environment wrapped in ``AtariWrapper`` (``clip_reward=False``,
        ``terminal_on_life_loss=False``) and ``Monitor``.
    """
    env = gym.make(env_id, render_mode="rgb_array")
    env = AtariWrapper(env, clip_reward=False, terminal_on_life_loss=False)
    env = Monitor(env)
    # Gymnasium's Env API has no seed() method; seeding goes through
    # reset(seed=...). Calling env.seed() on the wrapped env only works
    # while wrappers forward unknown attributes to the inner env.
    env.reset(seed=seed)
    env.action_space.seed(seed)
    return env

In [8]:
# One factory per worker; worker k is seeded with seed + k.
worker_factories = [
    make_env(env_id, seed + worker_idx) for worker_idx in range(num_envs)
]
envs = SubprocVecEnv(worker_factories)

# Evaluation env used with `envs`, then DQN's own training and evaluation envs.
eval_env = make_eval_env(env_id, seed)
dqn_env = make_dqn_env(env_id, seed)
eval_env_dqn = make_eval_env(env_id, seed)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]



A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


# 4. Callback Functions

In [9]:
class RewardCallback(BaseCallback):
    """Collect per-episode rewards reported through the step ``infos``.

    On every step, each info dict that carries an ``'episode'`` entry has its
    ``'r'`` value appended to ``self.rewards``.
    """

    def __init__(self):
        super().__init__()
        # One entry per reported episode, in the order they were seen.
        self.rewards = []

    def _on_step(self):
        # No 'infos' in the locals for this step means nothing to record.
        step_infos = self.locals.get('infos', ())
        self.rewards.extend(
            info['episode']['r'] for info in step_infos if 'episode' in info
        )
        # Always continue training; this callback only observes.
        return True

In [10]:
class StopTrainingOnTimeLimit(BaseCallback):
    """Stop training once a time budget (in seconds) has been exceeded.

    Elapsed time is measured with ``time.monotonic()`` so that system clock
    adjustments during a multi-hour run cannot shorten or extend the budget.
    As a consequence ``start_time`` is a monotonic clock reading, not an epoch
    timestamp.

    Args:
        max_duration: Budget in seconds; training stops on the first step
            after it has been exceeded.
        verbose: When greater than 0, print a message on stopping.
    """

    def __init__(self, max_duration, verbose=0):
        super().__init__(verbose)
        self.max_duration = max_duration
        self.start_time = None

    def _on_training_start(self):
        self.start_time = time.monotonic()

    def _on_step(self):
        if self.start_time is None:
            # Stepped without _on_training_start having run: start the timer
            # now instead of failing on `float - None`.
            self.start_time = time.monotonic()
        elapsed_time = time.monotonic() - self.start_time
        if elapsed_time > self.max_duration:
            if self.verbose > 0:
                print(f"Time limit reached ({self.max_duration} seconds). Stopping training.")
            return False  # Returning False stops training
        return True


# 5. Training setup

In [11]:
def train_and_evaluate_ppo(envs, eval_env, seed=73):
    """Train a PPO agent until the time budget is used up, then evaluate it.

    Reads the notebook-level ``env_id``, ``training_time_max`` and ``device``.

    Args:
        envs: Training environment(s) handed to ``PPO``.
        eval_env: Environment given to ``evaluate_policy`` (10 episodes).
        seed: Seed forwarded to the ``PPO`` constructor.

    Returns:
        ``(model, mean_reward, std_reward, training_time, rewards,
        total_steps, total_episodes, timesteps_per_episode)`` where
        ``training_time`` is in seconds (model construction included) and
        ``rewards`` is the list collected by ``RewardCallback``.
    """
    stop_on_budget = StopTrainingOnTimeLimit(max_duration=training_time_max, verbose=1)
    episode_tracker = RewardCallback()
    callbacks = CallbackList([episode_tracker, stop_on_budget])
    tensorboard_dir = f"sb3_logs/PPO_{env_id}_{training_time_max}"

    ppo_hyperparams = {
        "n_steps": 128,
        "batch_size": 256,
        "n_epochs": 4,
        "learning_rate": 2.5e-4,
        "clip_range": 0.1,
        "gamma": 0.99,
        "gae_lambda": 0.95,
        "ent_coef": 0.01,
        "vf_coef": 0.5,
        "max_grad_norm": 0.5,
    }

    started_at = time.time()
    model = PPO("CnnPolicy", envs, verbose=1, seed=seed, device=device, **ppo_hyperparams)
    model.set_logger(configure(tensorboard_dir, ["tensorboard", "stdout"]))
    # The step budget is only an upper bound; the time-limit callback ends the run.
    model.learn(total_timesteps=10**8, callback=callbacks)
    training_time = time.time() - started_at

    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)

    rewards = episode_tracker.rewards
    total_steps = model.num_timesteps
    total_episodes = len(rewards)
    if total_episodes > 0:
        timesteps_per_episode = total_steps / total_episodes
    else:
        timesteps_per_episode = 0

    return (
        model,
        mean_reward,
        std_reward,
        training_time,
        rewards,
        total_steps,
        total_episodes,
        timesteps_per_episode,
    )

In [12]:
def train_and_evaluate_a2c(envs, eval_env, seed=73):
    """Train an A2C agent until the time budget is used up, then evaluate it.

    Reads the notebook-level ``env_id``, ``training_time_max`` and ``device``.

    Args:
        envs: Training environment(s) handed to ``A2C``.
        eval_env: Environment given to ``evaluate_policy`` (10 episodes).
        seed: Seed forwarded to the ``A2C`` constructor.

    Returns:
        ``(model, mean_reward, std_reward, training_time, rewards,
        total_steps, total_episodes, timesteps_per_episode)`` where
        ``training_time`` is in seconds (model construction included) and
        ``rewards`` is the list collected by ``RewardCallback``.
    """
    stop_on_budget = StopTrainingOnTimeLimit(max_duration=training_time_max, verbose=1)
    episode_tracker = RewardCallback()
    callbacks = CallbackList([episode_tracker, stop_on_budget])
    tensorboard_dir = f"sb3_logs/A2C_{env_id}_{training_time_max}"

    a2c_hyperparams = {
        "n_steps": 128,
        "learning_rate": 7e-4,
        "gamma": 0.99,
        "gae_lambda": 1.0,
        "ent_coef": 0.01,
        "vf_coef": 0.25,
        "max_grad_norm": 0.5,
        "normalize_advantage": True,
        "use_rms_prop": True,
    }

    started_at = time.time()
    model = A2C("CnnPolicy", envs, verbose=1, seed=seed, device=device, **a2c_hyperparams)
    model.set_logger(configure(tensorboard_dir, ["tensorboard", "stdout"]))
    # The step budget is only an upper bound; the time-limit callback ends the run.
    model.learn(total_timesteps=10**17, callback=callbacks)
    training_time = time.time() - started_at

    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)

    rewards = episode_tracker.rewards
    total_steps = model.num_timesteps
    total_episodes = len(rewards)
    if total_episodes > 0:
        timesteps_per_episode = total_steps / total_episodes
    else:
        timesteps_per_episode = 0

    return (
        model,
        mean_reward,
        std_reward,
        training_time,
        rewards,
        total_steps,
        total_episodes,
        timesteps_per_episode,
    )

In [13]:
def train_and_evaluate_dqn(env, eval_env, seed=73):
    """Train a DQN agent until the time budget is used up, then evaluate it.

    Reads the notebook-level ``env_id``, ``training_time_max`` and ``device``.

    Args:
        env: Training environment handed to ``DQN``.
        eval_env: Environment given to ``evaluate_policy`` (10 episodes).
        seed: Seed forwarded to the ``DQN`` constructor.

    Returns:
        ``(model, mean_reward, std_reward, training_time, rewards,
        total_steps, total_episodes, timesteps_per_episode)`` where
        ``training_time`` is in seconds (model construction included) and
        ``rewards`` is the list collected by ``RewardCallback``.
    """
    stop_on_budget = StopTrainingOnTimeLimit(max_duration=training_time_max, verbose=1)
    episode_tracker = RewardCallback()
    callbacks = CallbackList([episode_tracker, stop_on_budget])
    tensorboard_dir = f"sb3_logs/{DQN.__name__}_{env_id}_{training_time_max}"

    dqn_hyperparams = {
        "buffer_size": 750000,
        "learning_starts": 50000,
        "target_update_interval": 1000,
        "train_freq": 4,
        "gradient_steps": 1,
        "exploration_fraction": 0.1,
        "exploration_final_eps": 0.01,
        "gamma": 0.99,
        "learning_rate": 1e-4,
        "batch_size": 32,
        "max_grad_norm": 0.5,
    }

    started_at = time.time()
    model = DQN("CnnPolicy", env, verbose=1, seed=seed, device=device, **dqn_hyperparams)
    model.set_logger(configure(tensorboard_dir, ["tensorboard", "stdout"]))
    # The step budget is only an upper bound; the time-limit callback ends the run.
    model.learn(total_timesteps=10**7, callback=callbacks)
    training_time = time.time() - started_at

    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)

    rewards = episode_tracker.rewards
    total_steps = model.num_timesteps
    total_episodes = len(rewards)
    if total_episodes > 0:
        timesteps_per_episode = total_steps / total_episodes
    else:
        timesteps_per_episode = 0

    return (
        model,
        mean_reward,
        std_reward,
        training_time,
        rewards,
        total_steps,
        total_episodes,
        timesteps_per_episode,
    )


# 6. Model Training

In [14]:
# Train DQN on its own environment and unpack the results.
(
    dqn_model,
    dqn_mean_reward,
    dqn_std_reward,
    dqn_training_time,
    dqn_rewards,
    dqn_total_steps,
    dqn_total_episodes,
    dqn_timesteps_per_episode,
) = train_and_evaluate_dqn(dqn_env, eval_env_dqn)

Using cpu device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to sb3_logs/DQN_PongNoFrameskip-v4_10800
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 916      |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2110     |
|    time_elapsed     | 1        |
|    total_timesteps  | 3663     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 899      |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.993    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2114     |
|    time_elapsed     | 3        |
|    total_timesteps  | 7195     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean  

In [15]:
# Train PPO on the vectorized environments and unpack the results.
(
    ppo_model,
    ppo_mean_reward,
    ppo_std_reward,
    ppo_training_time,
    ppo_rewards,
    ppo_total_steps,
    ppo_total_episodes,
    ppo_timesteps_per_episode,
) = train_and_evaluate_ppo(envs, eval_env)

Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to sb3_logs/PPO_PongNoFrameskip-v4_10800
-----------------------------
| time/              |      |
|    fps             | 2769 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1024 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 653          |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0030956718 |
|    clip_fraction        | 0.105        |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.79        |
|    explained_variance   | 0.0014       |
|    learning_rate        | 0.00025      |
|    loss                 | 0.0191       |
|    n_updates            | 4            |
|    policy_gradient_loss | -0.00523  

In [21]:
# Train A2C on the vectorized environments and unpack the results.
(
    a2c_model,
    a2c_mean_reward,
    a2c_std_reward,
    a2c_training_time,
    a2c_rewards,
    a2c_total_steps,
    a2c_total_episodes,
    a2c_timesteps_per_episode,
) = train_and_evaluate_a2c(envs, eval_env)

Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to sb3_logs/A2C_PongNoFrameskip-v4_10800
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 907      |
|    ep_rew_mean        | -20.3    |
| time/                 |          |
|    fps                | 1139     |
|    iterations         | 100      |
|    time_elapsed       | 89       |
|    total_timesteps    | 102400   |
| train/                |          |
|    entropy_loss       | -1.7     |
|    explained_variance | 0.00416  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.0346   |
|    value_loss         | 0.354    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 921      |
|    ep_rew_mean        | -20      |
| time/                 |          |
|    fps                | 1140     |
|    iterations         | 200      |
|    

# 7. Training Summary

In [17]:
# Column order of the summary table; every row below follows it.
summary_columns = [
    'Model',
    'Mean Reward',
    'Std Reward',
    'Training Time (s)',
    'Total Steps',
    'Total Episodes',
    'Timesteps per Episode',
]
summary_rows = [
    ('PPO', ppo_mean_reward, ppo_std_reward, ppo_training_time,
     ppo_total_steps, ppo_total_episodes, ppo_timesteps_per_episode),
    ('A2C', a2c_mean_reward, a2c_std_reward, a2c_training_time,
     a2c_total_steps, a2c_total_episodes, a2c_timesteps_per_episode),
    ('DQN', dqn_mean_reward, dqn_std_reward, dqn_training_time,
     dqn_total_steps, dqn_total_episodes, dqn_timesteps_per_episode),
]
# Transpose the per-model rows into one list per column.
data = {
    column: list(values)
    for column, values in zip(summary_columns, zip(*summary_rows))
}
df = pd.DataFrame(data)
df

NameError: name 'a2c_mean_reward' is not defined

In [None]:
def plot_rewards(rewards, algorithm_name):
    """Plot one algorithm's per-episode training rewards as a line chart.

    Args:
        rewards: Sequence of episode rewards, plotted against their index.
        algorithm_name: Label used in the figure title.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(rewards)
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Reward')
    ax.set_title(f'{algorithm_name} Training Rewards')
    plt.show()

# Plot the training curve of every algorithm trained above. The A2C plot was
# previously commented out, which left it out of the figures even though A2C
# is trained, summarised and pushed like the other two.
for algorithm_rewards, algorithm_label in (
    (ppo_rewards, 'PPO'),
    (a2c_rewards, 'A2C'),
    (dqn_rewards, 'DQN'),
):
    plot_rewards(algorithm_rewards, algorithm_label)

# 8. Push trained models to the Hugging Face Hub

In [18]:
def push_model_to_hf(model, model_name, model_architecture, env_id, repo_id, used_eval_env, commit_message):
    """Save ``model`` locally, then hand it to ``package_to_hub``.

    Args:
        model: Trained model; must provide ``save``.
        model_name: Name forwarded to ``package_to_hub``.
        model_architecture: Algorithm label forwarded to ``package_to_hub``.
        env_id: Environment id forwarded to ``package_to_hub``.
        repo_id: Target repository; also used as the local save path.
        used_eval_env: Passed to ``package_to_hub`` as ``eval_env``.
        commit_message: Commit message forwarded to ``package_to_hub``.
    """
    # The local copy is written to a path equal to the repo id.
    model.save(repo_id)

    hub_arguments = {
        "model": model,
        "model_name": model_name,
        "model_architecture": model_architecture,
        "env_id": env_id,
        "eval_env": used_eval_env,
        "repo_id": repo_id,
        "commit_message": commit_message,
    }
    package_to_hub(**hub_arguments)

In [19]:
# (trained model, algorithm label, Hub repository, evaluation env) per upload.
hub_uploads = [
    (ppo_model, "PPO", "maxstahl/ppo_pongnoframskip_v4_sb3", eval_env),
    (a2c_model, "A2C", "maxstahl/a2c_pongnoframskip_v4_sb3", eval_env),
    (dqn_model, "DQN", "maxstahl/dqn_pongnoframskip_v4_sb3", eval_env_dqn),
]
for trained_model, algo_label, hub_repo, hub_eval_env in hub_uploads:
    push_model_to_hf(
        trained_model,
        f"{algo_label.lower()}_{env_id}_{training_time_max}",
        algo_label,
        env_id,
        hub_repo,
        hub_eval_env,
        f"{algo_label} implementation duration: {training_time_max} seconds",
    )

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m


  logger.warn(
Exception ignored in: <function VecVideoRecorder.__del__ at 0x32564d240>
Traceback (most recent call last):
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/stable_baselines3/common/vec_env/vec_video_recorder.py", line 113, in __del__
    self.close_video_recorder()
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/stable_baselines3/common/vec_env/vec_video_recorder.py", line 104, in close_video_recorder
    self.video_recorder.close()
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/gymnasium/wrappers/monitoring/video_recorder.py", line 153, in close
    from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/moviepy/video/io/ImageSequenceClip.py", line 6, in <module>
    from ..VideoClip import VideoClip
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/moviepy/video/VideoClip

Saving video to /var/folders/nx/scf6msw549j5lvz4kv2gfpz00000gn/T/tmpa8b_u4jk/-step-0-to-step-1000.mp4
[38;5;1m✘ No ffmpeg exe could be found. Install ffmpeg on your system, or set
the IMAGEIO_FFMPEG_EXE environment variable.[0m
[38;5;1m✘ We are unable to generate a replay of your agent, the package_to_hub
process continues[0m
[38;5;1m✘ Please open an issue at
https://github.com/huggingface/huggingface_sb3/issues[0m
[38;5;4mℹ Pushing repo maxstahl/ppo_pongnoframskip_v4_sb3 to the Hugging Face
Hub[0m


  logger.warn("Unable to save last video! Did you call close()?")
  self.comm = Comm(**args)


policy.optimizer.pth:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

policy.pth:   0%|          | 0.00/6.73M [00:00<?, ?B/s]

ppo_PongNoFrameskip-v4_10800.zip:   0%|          | 0.00/20.3M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/maxstahl/ppo_pongnoframskip_v4_sb3/tree/main/[0m
[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m


  logger.warn(
Exception ignored in: <function VecVideoRecorder.__del__ at 0x32564d240>
Traceback (most recent call last):
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/stable_baselines3/common/vec_env/vec_video_recorder.py", line 113, in __del__
    self.close_video_recorder()
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/stable_baselines3/common/vec_env/vec_video_recorder.py", line 104, in close_video_recorder
    self.video_recorder.close()
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/gymnasium/wrappers/monitoring/video_recorder.py", line 153, in close
    from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/moviepy/video/io/ImageSequenceClip.py", line 6, in <module>
    from ..VideoClip import VideoClip
  File "/Users/stahlma/anaconda3/envs/gymnasium/lib/python3.10/site-packages/moviepy/video/VideoClip

Saving video to /var/folders/nx/scf6msw549j5lvz4kv2gfpz00000gn/T/tmpkfdfj4nj/-step-0-to-step-1000.mp4
[38;5;1m✘ No ffmpeg exe could be found. Install ffmpeg on your system, or set
the IMAGEIO_FFMPEG_EXE environment variable.[0m
[38;5;1m✘ We are unable to generate a replay of your agent, the package_to_hub
process continues[0m
[38;5;1m✘ Please open an issue at
https://github.com/huggingface/huggingface_sb3/issues[0m
[38;5;4mℹ Pushing repo maxstahl/dqn_pongnoframskip_v4_sb3 to the Hugging Face
Hub[0m


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

policy.optimizer.pth:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

dqn_PongNoFrameskip-v4_10800.zip:   0%|          | 0.00/27.0M [00:00<?, ?B/s]

policy.pth:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/maxstahl/dqn_pongnoframskip_v4_sb3/tree/main/[0m
