# Imports

In [1]:
import json
import os
import random
import time
from collections import deque

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from huggingface_hub import notebook_login, upload_file
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
# Log in to the Hugging Face Hub (interactive widget) before the upload_file calls later in the notebook
notebook_login()

  self.comm = Comm(**args)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

  self.comm = Comm(**args)


# Global Configurations

In [3]:
# Set Seed
seed = 73


def set_global_seeds(seed):
    """Seed NumPy's global RNG, Python's `random` module and PyTorch with `seed`."""
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)


set_global_seeds(seed)

In [4]:
# Set device type: use CUDA when PyTorch can see a GPU, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [5]:
# Set Environment
env_id = "PongNoFrameskip-v4"
num_envs = 8
# Wall-clock training budget in seconds: 3 h = 10800, 6 h = 21600, 9 h = 32400
training_time_max = 9 * 60 * 60

In [6]:
# Look up env_id in the gymnasium registry and print its spec (prints None if the id is not registered)
print(gym.envs.registry.get(env_id))

EnvSpec(id='PongNoFrameskip-v4', entry_point='shimmy.atari_env:AtariEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=None, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={'game': 'pong', 'obs_type': 'rgb', 'repeat_action_probability': 0.0, 'full_action_space': False, 'max_num_frames_per_episode': 108000, 'frameskip': 1}, namespace=None, name='PongNoFrameskip', version=4, additional_wrappers=(), vector_entry_point=None)


# Common Wrappers and Utilities

In [7]:
class ClipRewardEnv(gym.RewardWrapper):
    """Reward wrapper that replaces every reward by its sign (-1, 0 or +1)."""

    # No custom constructor is needed: the inherited one takes the same single `env` argument.

    def reward(self, reward: float) -> float:
        """Return the sign of `reward` as computed by `np.sign`."""
        clipped = np.sign(reward)
        return clipped

def AtariWrappers(env, frame_skip):
    """Wrap `env` with Atari preprocessing followed by a 4-frame stack.

    Args:
        env: environment to wrap; callers in this notebook create it with `frameskip=1`
            so that frame skipping is handled by the preprocessing wrapper.
        frame_skip: value forwarded to `AtariPreprocessing(frame_skip=...)`.

    Returns:
        The wrapped environment (84x84 grayscale frames, unscaled, stacked 4 deep).
    """
    preprocessing_kwargs = dict(
        noop_max=30,
        frame_skip=frame_skip,
        screen_size=84,
        terminal_on_life_loss=False,
        grayscale_obs=True,
        scale_obs=False,
    )
    preprocessed = gym.wrappers.AtariPreprocessing(env, **preprocessing_kwargs)
    return gym.wrappers.FrameStack(preprocessed, 4)

def make_env(gym_id, seed, frame_skip, clip_rewards):
    """Return a zero-argument factory that builds one seeded, wrapped environment.

    Args:
        gym_id: environment id passed to `gym.make`.
        seed: seed used for the first reset and for the action space.
        frame_skip: forwarded to `AtariWrappers`.
        clip_rewards: when truthy, the environment is additionally wrapped in `ClipRewardEnv`.
    """
    def thunk():
        wrapped = AtariWrappers(
            gym.make(gym_id, render_mode='rgb_array', frameskip=1),
            frame_skip,
        )
        wrapped = ClipRewardEnv(wrapped) if clip_rewards else wrapped
        # Seed both the emulator (through the first reset) and the action-space sampler
        wrapped.reset(seed=seed)
        wrapped.action_space.seed(seed)
        return wrapped

    return thunk

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise `layer` in place and return it.

    The weight gets an orthogonal initialisation with gain `std`; the bias, when the
    layer has one, is filled with `bias_const`.

    Args:
        layer: module exposing a `weight` tensor and optionally a `bias` tensor.
        std: gain passed to `nn.init.orthogonal_`.
        bias_const: constant written into the bias.

    Returns:
        The same `layer` object, so the call can be used inline when building a network.
    """
    nn.init.orthogonal_(layer.weight, std)
    # Layers created with bias=False expose bias=None; skip them instead of crashing
    if getattr(layer, "bias", None) is not None:
        nn.init.constant_(layer.bias, bias_const)
    return layer

class Config:
    """Expose the entries of a settings dictionary as instance attributes."""

    def __init__(self, dictionary):
        """Copy every `key: value` pair of `dictionary` onto `self` as an attribute."""
        for name in dictionary:
            setattr(self, name, dictionary[name])

# PPO Implementation

## Configurations

In [8]:
# Hyperparameters for the custom PPO run
config_dict_ppo = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'gym_id': env_id,
    'total_timesteps': 1e7,
    'n_envs': num_envs,  # follow the global environment setting instead of repeating the literal
    'n_steps': 128,
    'n_minibatches': 4,
    'update_epochs': 4,
    'frame_skip': 4,
    'hidden_size': 512,
    'learning_rate': 2.5e-4,
    'anneal_lr': False,
    'gamma': 0.99,
    'gae': True,  # not read by train_ppo, which always uses GAE
    'gae_lambda': 0.95,
    'clip_coef': 0.1,
    'norm_advantages': True,
    'clip_value_loss': True,
    'weight_value_loss': 0.5,
    'weight_ent_loss': 0.01,
    'max_grad_norm': 0.5,
    'time_limit': training_time_max,  # wall-clock budget in seconds
    'seed': seed
}

# One rollout yields n_envs * n_steps transitions, which are split evenly into minibatches
config_dict_ppo['batch_size'] = int(config_dict_ppo['n_envs'] * config_dict_ppo['n_steps'])
config_dict_ppo['minibatch_size'] = int(config_dict_ppo['batch_size'] // config_dict_ppo['n_minibatches'])

config_ppo = Config(config_dict_ppo)


## Environment

In [9]:
# Create Vectorized Environments for PPO: one factory per worker, each with its own seed offset
envs_ppo = gym.vector.AsyncVectorEnv(
    [
        make_env(config_ppo.gym_id, seed + rank, config_ppo.frame_skip, clip_rewards=True)
        for rank in range(config_ppo.n_envs)
    ]
)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


## Neural Network

In [10]:
class PPONetwork(nn.Module):
    """Actor-critic network with a shared convolutional trunk.

    `base` maps a stack of frames to a `hidden_size` feature vector; `actor` turns that
    into action logits and `critic` into a scalar state value.
    """

    def __init__(self, input_shape, n_actions, hidden_size=512):
        super().__init__()
        in_channels = input_shape[0]
        trunk_layers = [
            layer_init(nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            # 64 * 7 * 7 is the flattened conv output for 84x84 inputs (84 -> 20 -> 9 -> 7)
            layer_init(nn.Linear(64 * 7 * 7, hidden_size)),
            nn.ReLU(),
        ]
        self.base = nn.Sequential(*trunk_layers)
        self.actor = layer_init(nn.Linear(hidden_size, n_actions), std=0.01)
        self.critic = layer_init(nn.Linear(hidden_size, 1), std=1.0)

    def _features(self, x):
        """Scale pixel values by 1/255 and run them through the shared trunk."""
        return self.base(x / 255.0)

    def forward(self, x, action=None):
        """Return `(action, log_prob, entropy, value)` for observations `x`.

        When `action` is None an action is sampled from the policy; otherwise the
        log-probability of the supplied action is returned.
        """
        hidden = self._features(x)
        logits = self.actor(hidden)
        value = self.critic(hidden)
        dist = torch.distributions.Categorical(logits=logits)
        chosen = dist.sample() if action is None else action
        return chosen, dist.log_prob(chosen), dist.entropy(), value

    def get_value(self, x):
        """Return only the critic's value estimate for observations `x`."""
        return self.critic(self._features(x))

## Training

In [11]:
def train_ppo(config):
    """Train a PPO agent on the module-level `envs_ppo` vector environment.

    Every iteration collects `config.n_steps` transitions from each environment,
    computes GAE advantages and then runs `config.update_epochs` passes of
    clipped-surrogate minibatch updates. Training stops once `total_steps` reaches
    `config.total_timesteps` or the wall-clock `config.time_limit` (seconds) has been
    exceeded; the time limit is only checked between rollouts.

    Note: `envs_ppo` is closed before returning, so it has to be re-created before
    this function can be called again. `config.gae` is not consulted; GAE is always used.

    Args:
        config: object exposing the hyperparameters as attributes (device, n_envs,
            n_steps, learning_rate, gamma, gae_lambda, clip_coef, ...).

    Returns:
        tuple: `(model, episode_rewards, total_training_time, losses, total_steps,
        total_episodes, timesteps_per_episode, episode_times, loss_times, threshold_times)`.
        `episode_times` / `loss_times` hold seconds since training start, and
        `threshold_times` maps each reward threshold to the first time an episode
        reached it (None if never reached).
    """
    device = torch.device(config.device)
    n_actions = envs_ppo.single_action_space.n
    input_shape = envs_ppo.single_observation_space.shape

    model = PPONetwork(input_shape, n_actions, config.hidden_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, eps=1e-5)

    # Initialize rollout storage, indexed as [step, env]
    obs_shape = envs_ppo.single_observation_space.shape
    states = torch.zeros((config.n_steps, config.n_envs) + obs_shape).to(device)
    actions = torch.zeros((config.n_steps, config.n_envs)).to(device)
    rewards = torch.zeros((config.n_steps, config.n_envs)).to(device)
    dones = torch.zeros((config.n_steps, config.n_envs)).to(device)
    log_probs = torch.zeros((config.n_steps, config.n_envs)).to(device)
    values = torch.zeros((config.n_steps, config.n_envs)).to(device)

    # Metrics
    episode_rewards = []
    losses = []
    total_steps = 0
    total_episodes = 0

    # Counters for episodes and rewards since last print
    episodes_since_last_print = 0
    episode_rewards_since_last_print = []

    # Initialize time tracking
    start_time = time.time()
    episode_times = []  # Record the time when each episode ends
    loss_times = []     # Record the time when each loss is calculated
    threshold_times = {0: None, 10: None, 21: None}  # For convergence speed metric

    # Start training
    state, _ = envs_ppo.reset()
    state = torch.tensor(state).to(device)
    done = torch.zeros(config.n_envs).to(device)

    # Running return of the episode currently in progress in each environment
    episode_rewards_env = np.zeros(config.n_envs)

    while total_steps < config.total_timesteps:
        current_time = time.time()
        if current_time - start_time > config.time_limit:
            print("Time limit reached. Stopping training.")
            break

        # Anneal learning rate linearly towards zero over the step budget
        if config.anneal_lr:
            frac = 1.0 - (total_steps / config.total_timesteps)
            lr_now = frac * config.learning_rate
            optimizer.param_groups[0]['lr'] = lr_now

        # Collect rollout data
        for step in range(config.n_steps):
            with torch.no_grad():
                action, log_prob, _, value = model(state)
                values[step] = value.squeeze()
            actions[step] = action
            log_probs[step] = log_prob
            states[step] = state
            # dones[step] flags whether `state` is the first observation after an episode end
            dones[step] = done

            next_state, reward, terminated, truncated, infos = envs_ppo.step(action.cpu().numpy())
            done = np.logical_or(terminated, truncated)
            rewards[step] = torch.tensor(reward).to(device)
            state = torch.tensor(next_state).to(device)
            done = torch.tensor(done).to(device)

            # Update per-environment episode returns
            episode_rewards_env += reward
            for idx, d in enumerate(done):
                if d:
                    # Append rewards to lists
                    episode_rewards.append(episode_rewards_env[idx])
                    episode_rewards_since_last_print.append(episode_rewards_env[idx])

                    # Record episode time
                    elapsed_time = time.time() - start_time
                    episode_times.append(elapsed_time)

                    # Check convergence thresholds (only the first crossing is recorded)
                    cumulative_reward = episode_rewards_env[idx]
                    for threshold in threshold_times:
                        if cumulative_reward >= threshold and threshold_times[threshold] is None:
                            threshold_times[threshold] = elapsed_time

                    # Reset per-environment rewards
                    episode_rewards_env[idx] = 0
                    total_episodes += 1
                    episodes_since_last_print += 1

            total_steps += config.n_envs

        # Compute advantages and returns (GAE), walking the rollout backwards
        with torch.no_grad():
            next_value = model.get_value(state).squeeze()
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(config.n_steps)):
                if t == config.n_steps - 1:
                    # Bootstrap from the value of the observation that follows the rollout
                    nextnonterminal = 1.0 - done.float()
                    next_values = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1].float()
                    next_values = values[t + 1]
                delta = rewards[t] + config.gamma * next_values * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + config.gamma * config.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values

        # Flatten the batch
        b_states = states.reshape((-1,) + obs_shape)
        # Convert the float action storage to integer indices once per rollout
        b_actions = actions.reshape(-1).long()
        b_log_probs = log_probs.reshape(-1)
        b_returns = returns.reshape(-1)
        b_advantages = advantages.reshape(-1)
        b_values = values.reshape(-1)

        # Normalize advantages over the whole batch
        if config.norm_advantages:
            b_advantages = (b_advantages - b_advantages.mean()) / (b_advantages.std() + 1e-8)

        # PPO Update
        inds = np.arange(config.batch_size)
        for epoch in range(config.update_epochs):
            np.random.shuffle(inds)
            for start in range(0, config.batch_size, config.minibatch_size):
                end = start + config.minibatch_size
                mb_inds = inds[start:end]

                mb_states = b_states[mb_inds]
                mb_actions = b_actions[mb_inds]
                mb_old_log_probs = b_log_probs[mb_inds]
                mb_advantages = b_advantages[mb_inds]
                mb_returns = b_returns[mb_inds]
                mb_values = b_values[mb_inds]

                _, new_log_probs, entropy, new_values = model(mb_states, mb_actions)
                new_log_probs = new_log_probs.squeeze()
                entropy = entropy.squeeze()
                new_values = new_values.squeeze()

                # Policy loss: clipped surrogate objective
                log_ratio = new_log_probs - mb_old_log_probs
                ratio = log_ratio.exp()
                surr1 = ratio * mb_advantages
                surr2 = torch.clamp(ratio, 1.0 - config.clip_coef, 1.0 + config.clip_coef) * mb_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                # Value loss, optionally clipped around the rollout-time value estimate
                if config.clip_value_loss:
                    value_pred_clipped = mb_values + (new_values - mb_values).clamp(-config.clip_coef, config.clip_coef)
                    value_losses = (new_values - mb_returns).pow(2)
                    value_losses_clipped = (value_pred_clipped - mb_returns).pow(2)
                    value_loss = 0.5 * torch.max(value_losses, value_losses_clipped).mean()
                else:
                    value_loss = 0.5 * (new_values - mb_returns).pow(2).mean()

                # Entropy bonus (subtracted from the loss below)
                entropy_loss = entropy.mean()

                # Total loss
                loss = policy_loss + config.weight_value_loss * value_loss - config.weight_ent_loss * entropy_loss
                losses.append(loss.item())

                # Record time at loss calculation
                loss_times.append(time.time() - start_time)

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()

        # Print progress. total_steps grows by n_envs * n_steps per rollout, so this
        # modulo only fires on rollouts whose running total is a multiple of n_envs * 10
        # (every 5th rollout with n_steps=128).
        if total_steps % (config.n_envs * 10) == 0:
            if len(episode_rewards) > 0:
                # Slicing already copes with fewer than 10 finished episodes
                mean_reward = np.mean(episode_rewards[-10:])
                mean_reward_last_episodes = np.mean(episode_rewards_since_last_print) if episodes_since_last_print > 0 else 0
                print(f"Total Steps: {total_steps}, Episodes: {total_episodes}, "
                    f"Mean Reward total: {mean_reward:.2f}, Episodes since last print: {episodes_since_last_print}, "
                    f"Mean Reward Last Episodes: {mean_reward_last_episodes:.2f}")

                # Reset the counters
                episodes_since_last_print = 0
                episode_rewards_since_last_print = []
            else:
                print(f"Total Steps: {total_steps}, Episodes: {total_episodes} (No episodes completed yet)")

    total_training_time = time.time() - start_time
    total_episodes = len(episode_rewards)
    timesteps_per_episode = total_steps / total_episodes if total_episodes > 0 else 0
    print("Training complete!")
    print(f"Total training time: {total_training_time:.2f} seconds")
    envs_ppo.close()
    return model, episode_rewards, total_training_time, losses, total_steps, total_episodes, timesteps_per_episode, episode_times, loss_times, threshold_times

In [12]:
# Run PPO training and unpack every value returned by train_ppo
(
    ppo_custom_model,
    ppo_custom_episode_rewards,
    ppo_custom_total_training_time,
    ppo_custom_losses,
    ppo_custom_total_steps,
    ppo_custom_total_episodes,
    ppo_custom_timesteps_per_episode,
    ppo_episode_times,
    ppo_loss_times,
    ppo_threshold_times,
) = train_ppo(config_ppo)

Total Steps: 5120, Episodes: 0 (No episodes completed yet)
Total Steps: 10240, Episodes: 8, Mean Reward total: -20.38, Episodes since last print: 8, Mean Reward Last Episodes: -20.38
Total Steps: 15360, Episodes: 14, Mean Reward total: -20.40, Episodes since last print: 6, Mean Reward Last Episodes: -20.50
Total Steps: 20480, Episodes: 17, Mean Reward total: -20.30, Episodes since last print: 3, Mean Reward Last Episodes: -20.33
Total Steps: 25600, Episodes: 24, Mean Reward total: -20.00, Episodes since last print: 7, Mean Reward Last Episodes: -19.86
Total Steps: 30720, Episodes: 30, Mean Reward total: -20.20, Episodes since last print: 6, Mean Reward Last Episodes: -20.50
Total Steps: 35840, Episodes: 34, Mean Reward total: -20.30, Episodes since last print: 4, Mean Reward Last Episodes: -20.00
Total Steps: 40960, Episodes: 41, Mean Reward total: -20.50, Episodes since last print: 7, Mean Reward Last Episodes: -20.57
Total Steps: 46080, Episodes: 45, Mean Reward total: -20.60, Episod

## Evaluation

In [13]:
def evaluate_model_ppo(model, config, num_episodes=10):
    """Play `num_episodes` full episodes with `model` and summarise the episode returns.

    A fresh single environment is built with the same `AtariWrappers` preprocessing as
    training but without `ClipRewardEnv`, so returns use the raw game rewards. Actions
    are sampled from the policy distribution. The model is switched to eval mode for
    the rollouts and back to train mode afterwards.

    Args:
        model: network whose forward pass returns `(action, log_prob, entropy, value)`.
        config: object providing `gym_id`, `frame_skip`, `seed` and `device`.
        num_episodes: number of episodes to play.

    Returns:
        tuple: `(mean_reward, std_reward)` over the evaluated episodes.
    """
    env = gym.make(config.gym_id, render_mode='rgb_array', frameskip=1)
    env = AtariWrappers(env, config.frame_skip)
    env.reset(seed=config.seed)
    env.action_space.seed(config.seed)
    model.eval()
    device = torch.device(config.device)

    def to_tensor(observation):
        # Materialise the stacked frames as a single ndarray first: handing the frame
        # stack straight to torch.tensor takes the slow list-of-arrays path and emits
        # a UserWarning on every step.
        return torch.tensor(np.asarray(observation), device=device)

    total_rewards = []
    try:
        for _ in range(num_episodes):
            state, _ = env.reset()
            state = to_tensor(state)
            done = False
            episode_reward = 0
            while not done:
                with torch.no_grad():
                    action, _, _, _ = model(state.unsqueeze(0))
                    action = action.item()
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                state = to_tensor(next_state)
            total_rewards.append(episode_reward)
    finally:
        # Release the environment and restore training mode even if a rollout fails
        env.close()
        model.train()
    mean_reward = np.mean(total_rewards)
    std_reward = np.std(total_rewards)
    return mean_reward, std_reward

In [14]:
# Evaluate the trained PPO policy with the evaluator's default of 10 episodes
ppo_custom_mean_reward, ppo_custom_std_reward = evaluate_model_ppo(
    model=ppo_custom_model,
    config=config_ppo,
)
print(f"PPO Evaluation over 10 episodes: Mean Reward = {ppo_custom_mean_reward}, Std Reward = {ppo_custom_std_reward}")

  state = torch.tensor(state, device=device)


PPO Evaluation over 10 episodes: Mean Reward = 19.5, Std Reward = 1.6881943016134133


## Metrics

In [15]:
# Collect everything worth persisting from the PPO run
results_ppo = {
    'Episode Rewards': ppo_custom_episode_rewards,
    'Episode Times': ppo_episode_times,
    'Losses': ppo_custom_losses,
    'Loss Times': ppo_loss_times,
    'Threshold Times': ppo_threshold_times,
    'Total Training Time': ppo_custom_total_training_time,
    'Total Steps': ppo_custom_total_steps,
    'Total Episodes': ppo_custom_total_episodes,
    'Timesteps per Episode': ppo_custom_timesteps_per_episode,
    'Mean Evaluation Reward': ppo_custom_mean_reward,
    'Std Evaluation Reward': ppo_custom_std_reward
}

# Save results to JSON. The output folder is created on demand so that a missing
# ./metrics directory cannot lose the results of a multi-hour training run.
# Note: JSON stores the integer keys of 'Threshold Times' as strings.
ppo_metrics_path = f"./metrics/ppo_custom_metrics_{env_id}_{training_time_max}.json"
os.makedirs(os.path.dirname(ppo_metrics_path), exist_ok=True)
with open(ppo_metrics_path, 'w') as f:
    json.dump(results_ppo, f)

# Print Summary
print("=== Training Summary ===")
print(f"Total Training Time: {ppo_custom_total_training_time:.2f} seconds")
print(f"Total Steps: {ppo_custom_total_steps}")
print(f"Total Episodes: {ppo_custom_total_episodes}")
print(f"Average Timesteps per Episode: {ppo_custom_timesteps_per_episode:.2f}")
print(f"Final Mean Reward (last 10 episodes): {np.mean(ppo_custom_episode_rewards[-10:]):.2f}")
print(f"Evaluation Mean Reward: {ppo_custom_mean_reward:.2f}")
print(f"Evaluation Std Reward: {ppo_custom_std_reward:.2f}")
print(f"Threshold Times: {ppo_threshold_times}")

=== Training Summary ===
Total Training Time: 32401.68 seconds
Total Steps: 8514560
Total Episodes: 4074
Average Timesteps per Episode: 2089.98
Final Mean Reward (last 10 episodes): 18.20
Evaluation Mean Reward: 19.50
Evaluation Std Reward: 1.69
Threshold Times: {0: 4006.7676961421967, 10: 4166.020635128021, 21: 6754.772814035416}


## Save Model

In [16]:
# Save the Model
ppo_model_path = f"./models/ppo_custom_model_{env_id}_{training_time_max}.pth"
ppo_repo_path = f"ppo_custom_model_{env_id}_{training_time_max}.pth"
ppo_repo_path_metrics = f"ppo_custom_metrics_{env_id}_{training_time_max}.json"
# Create the checkpoint folder on demand so torch.save cannot fail on a missing ./models directory
os.makedirs(os.path.dirname(ppo_model_path), exist_ok=True)
torch.save(ppo_custom_model.state_dict(), ppo_model_path)

# Upload the weights to the Hugging Face Hub repository
upload_file(
    path_or_fileobj=ppo_model_path,
    path_in_repo=ppo_repo_path,
    repo_id='maxstahl/ppo_pongnoframskip_v4_custom',
)

# Upload the metrics JSON (ppo_metrics_path is set in the Metrics cell) next to the weights
upload_file(
    path_or_fileobj=ppo_metrics_path,
    path_in_repo=ppo_repo_path_metrics,
    repo_id='maxstahl/ppo_pongnoframskip_v4_custom',
)

  self.comm = Comm(**args)


ppo_custom_model_PongNoFrameskip-v4_32400.pth:   0%|          | 0.00/6.76M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/maxstahl/ppo_pongnoframskip_v4_custom/commit/82858ef4d262f25f091a8f45abd8f62e009a6e06', commit_message='Upload ppo_custom_metrics_PongNoFrameskip-v4_32400.json with huggingface_hub', commit_description='', oid='82858ef4d262f25f091a8f45abd8f62e009a6e06', pr_url=None, pr_revision=None, pr_num=None)

# A2C Implementation

## Configuration

In [17]:
# Hyperparameters for the custom A2C run
config_dict_a2c = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'gym_id': env_id,
    # NOTE(review): far larger than the PPO budget (1e7), so in practice only
    # time_limit stops training; confirm this is intended
    'total_timesteps': 1e17,
    'n_envs': num_envs,  # follow the global environment setting instead of repeating the literal
    'n_steps': 128,  # Number of steps per rollout in A2C
    'frame_skip': 4,
    'hidden_size': 512,
    'learning_rate': 7e-4,
    'gamma': 0.99,
    'gae': True,  # not read by train_a2c, which always uses GAE
    'gae_lambda': 0.95,
    'entropy_coef': 0.01,
    'value_loss_coef': 0.25,
    'max_grad_norm': 0.5,
    'normalize_advantage': True,
    'time_limit': training_time_max,  # wall-clock budget in seconds
    'seed': seed
}

# Convert to Config Class
config_a2c = Config(config_dict_a2c)

## Environment

In [18]:
# Create Vectorized Environments for A2C: one factory per worker, each with its own seed offset
envs_a2c = gym.vector.AsyncVectorEnv(
    [
        make_env(config_a2c.gym_id, seed + rank, config_a2c.frame_skip, clip_rewards=True)
        for rank in range(config_a2c.n_envs)
    ]
)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


## Neural Network

In [19]:
class A2CNetwork(nn.Module):
    """Actor-critic network with a shared convolutional trunk.

    The forward pass returns the raw action logits and the state value; building the
    action distribution is left to the caller.
    """

    def __init__(self, input_shape, n_actions, hidden_size=512):
        super().__init__()
        in_channels = input_shape[0]
        trunk_layers = [
            layer_init(nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            # 64 * 7 * 7 is the flattened conv output for 84x84 inputs (84 -> 20 -> 9 -> 7)
            layer_init(nn.Linear(64 * 7 * 7, hidden_size)),
            nn.ReLU(),
        ]
        self.base = nn.Sequential(*trunk_layers)
        self.actor = layer_init(nn.Linear(hidden_size, n_actions), std=0.01)
        self.critic = layer_init(nn.Linear(hidden_size, 1), std=1.0)

    def forward(self, x):
        """Return `(logits, value)`; `value` has its trailing singleton dimension removed."""
        features = self.base(x / 255.0)  # Normalize pixel values before the trunk
        policy_logits = self.actor(features)
        state_value = self.critic(features).squeeze(-1)
        return policy_logits, state_value

## Training

In [20]:
def train_a2c(config):
    """Train an A2C agent on the module-level `envs_a2c` vector environment.

    Every iteration collects `config.n_steps` transitions from each environment,
    computes GAE advantages and performs a single gradient step on the whole rollout.
    Training stops once `total_steps` reaches `config.total_timesteps` or the
    wall-clock `config.time_limit` (seconds) has been exceeded; the time limit is only
    checked between rollouts.

    Note: `envs_a2c` is closed before returning, so it has to be re-created before
    this function can be called again. `config.gae` is not consulted; GAE is always used.

    Args:
        config: object exposing the hyperparameters as attributes (device, n_envs,
            n_steps, learning_rate, gamma, gae_lambda, entropy_coef, ...).

    Returns:
        tuple: `(model, episode_rewards, total_training_time, losses, total_steps,
        total_episodes, timesteps_per_episode, episode_times, loss_times, threshold_times)`.
        `episode_times` / `loss_times` hold seconds since training start, and
        `threshold_times` maps each reward threshold to the first time an episode
        reached it (None if never reached).
    """
    device = torch.device(config.device)
    n_actions = envs_a2c.single_action_space.n
    input_shape = envs_a2c.single_observation_space.shape

    model = A2CNetwork(input_shape, n_actions, config.hidden_size).to(device)
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate, alpha=0.99, eps=1e-5)

    # Initialize rollout storage, indexed as [step, env]
    obs_shape = envs_a2c.single_observation_space.shape
    states = torch.zeros((config.n_steps, config.n_envs) + obs_shape).to(device)
    actions = torch.zeros((config.n_steps, config.n_envs)).to(device)
    rewards = torch.zeros((config.n_steps, config.n_envs)).to(device)
    dones = torch.zeros((config.n_steps, config.n_envs)).to(device)
    values = torch.zeros((config.n_steps, config.n_envs)).to(device)

    # Metrics
    episode_rewards = []
    losses = []
    total_steps = 0
    total_episodes = 0

    # Initialize time tracking
    start_time = time.time()
    episode_times = []  # Record the time when each episode ends
    loss_times = []     # Record the time when each loss is calculated
    threshold_times = {0: None, 10: None, 21: None}  # For convergence speed metric

    # Start training
    state, _ = envs_a2c.reset()
    state = torch.tensor(state).to(device)
    done = torch.zeros(config.n_envs).to(device)

    # Running return of the episode currently in progress in each environment
    episode_rewards_env = np.zeros(config.n_envs)

    while total_steps < config.total_timesteps:
        current_time = time.time()
        if current_time - start_time > config.time_limit:
            print("Time limit reached. Stopping training.")
            break

        # Collect rollout data. Log-probabilities and entropy are not needed here:
        # they are recomputed with gradients for the whole batch before the update.
        for step in range(config.n_steps):
            with torch.no_grad():
                logits, value = model(state)
                probs = torch.distributions.Categorical(logits=logits)
                action = probs.sample()
            values[step] = value
            actions[step] = action
            states[step] = state
            # dones[step] flags whether `state` is the first observation after an episode end
            dones[step] = done

            next_state, reward, terminated, truncated, infos = envs_a2c.step(action.cpu().numpy())
            done = np.logical_or(terminated, truncated)
            rewards[step] = torch.tensor(reward).to(device)
            state = torch.tensor(next_state).to(device)
            done = torch.tensor(done).to(device)

            # Update per-environment episode returns
            episode_rewards_env += reward
            for idx, d in enumerate(done):
                if d:
                    episode_rewards.append(episode_rewards_env[idx])

                    # Record episode time
                    elapsed_time = time.time() - start_time
                    episode_times.append(elapsed_time)

                    # Check convergence thresholds (only the first crossing is recorded)
                    cumulative_reward = episode_rewards_env[idx]
                    for threshold in threshold_times:
                        if cumulative_reward >= threshold and threshold_times[threshold] is None:
                            threshold_times[threshold] = elapsed_time

                    episode_rewards_env[idx] = 0
                    total_episodes += 1

            total_steps += config.n_envs

        # Compute advantages and returns using GAE, walking the rollout backwards
        with torch.no_grad():
            next_value = model(state)[1]
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(config.n_steps)):
                if t == config.n_steps - 1:
                    # Bootstrap from the value of the observation that follows the rollout
                    nextnonterminal = 1.0 - done.float()
                    next_values = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1].float()
                    next_values = values[t + 1]
                delta = rewards[t] + config.gamma * next_values * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + config.gamma * config.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values

        # Flatten the batch
        b_states = states.reshape((-1,) + obs_shape)
        b_actions = actions.reshape(-1)
        b_returns = returns.reshape(-1)
        b_advantages = advantages.reshape(-1)

        # Normalize advantages over the whole batch
        if config.normalize_advantage:
            b_advantages = (b_advantages - b_advantages.mean()) / (b_advantages.std() + 1e-8)

        # Recompute logits and values with gradients enabled
        logits, values_pred = model(b_states)
        probs = torch.distributions.Categorical(logits=logits)
        log_probs = probs.log_prob(b_actions.long())
        entropy = probs.entropy().mean()

        # Compute losses
        policy_loss = -(b_advantages * log_probs).mean()
        value_loss = nn.functional.mse_loss(values_pred, b_returns)
        entropy_loss = entropy

        loss = policy_loss + config.value_loss_coef * value_loss - config.entropy_coef * entropy_loss
        losses.append(loss.item())

        # Record time at loss calculation
        loss_times.append(time.time() - start_time)

        # Optimize
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
        optimizer.step()

        # Print progress. total_steps grows by n_envs * n_steps per rollout, so this
        # modulo only fires on rollouts whose running total is a multiple of
        # n_envs * 1000 (every 125th rollout with n_steps=128).
        if total_steps % (config.n_envs * 1000) == 0:
            if episode_rewards:
                # Slicing already copes with fewer than 10 finished episodes
                mean_reward = np.mean(episode_rewards[-10:])
                print(f"Total Steps: {total_steps}, Episodes: {total_episodes}, Mean Reward: {mean_reward:.2f}")
            else:
                # Avoid np.mean([]) (nan plus a RuntimeWarning) before the first episode ends
                print(f"Total Steps: {total_steps}, Episodes: {total_episodes} (No episodes completed yet)")

    total_training_time = time.time() - start_time
    timesteps_per_episode = total_steps / total_episodes if total_episodes > 0 else 0
    print("Training complete!")
    print(f"Total training time: {total_training_time:.2f} seconds")
    envs_a2c.close()
    return model, episode_rewards, total_training_time, losses, total_steps, total_episodes, timesteps_per_episode, episode_times, loss_times, threshold_times

In [21]:
# Run A2C training and unpack every value returned by train_a2c
(
    a2c_custom_model,
    a2c_custom_episode_rewards,
    a2c_custom_total_training_time,
    a2c_custom_losses,
    a2c_custom_total_steps,
    a2c_custom_total_episodes,
    a2c_custom_timesteps_per_episode,
    a2c_episode_times,
    a2c_loss_times,
    a2c_threshold_times,
) = train_a2c(config_a2c)

Total Steps: 128000, Episodes: 136, Mean Reward: -19.90
Total Steps: 256000, Episodes: 270, Mean Reward: -20.00
Total Steps: 384000, Episodes: 408, Mean Reward: -20.90
Total Steps: 512000, Episodes: 542, Mean Reward: -19.10
Total Steps: 640000, Episodes: 681, Mean Reward: -20.30
Total Steps: 768000, Episodes: 820, Mean Reward: -20.30
Total Steps: 896000, Episodes: 959, Mean Reward: -20.20
Total Steps: 1024000, Episodes: 1098, Mean Reward: -20.50
Total Steps: 1152000, Episodes: 1237, Mean Reward: -20.40
Total Steps: 1280000, Episodes: 1376, Mean Reward: -20.30
Total Steps: 1408000, Episodes: 1514, Mean Reward: -20.70
Total Steps: 1536000, Episodes: 1649, Mean Reward: -20.10
Total Steps: 1664000, Episodes: 1784, Mean Reward: -19.90
Total Steps: 1792000, Episodes: 1916, Mean Reward: -19.90
Total Steps: 1920000, Episodes: 2044, Mean Reward: -19.90
Total Steps: 2048000, Episodes: 2164, Mean Reward: -19.20
Total Steps: 2176000, Episodes: 2268, Mean Reward: -18.70
Total Steps: 2304000, Episod

## Evaluation

In [22]:
def evaluate_model_a2c(model, config, num_episodes=10):
    """Roll out the A2C policy on a fresh environment and summarise the returns.

    Actions are *sampled* from the categorical distribution defined by the
    model's logits, so the evaluation is stochastic rather than greedy.

    Args:
        model: ``nn.Module`` whose forward pass returns a pair; the first
            element is used as the action logits, the second is ignored.
        config: Object exposing ``gym_id``, ``frame_skip``, ``seed`` and
            ``device``.
        num_episodes: Number of complete episodes to play.

    Returns:
        ``(mean_reward, std_reward)`` computed with NumPy over the
        per-episode returns.

    Notes:
        The environment is always closed and the model is put back into the
        train/eval mode it had on entry, even if a rollout raises.
    """
    env = gym.make(config.gym_id, render_mode='rgb_array', frameskip=1)
    device = torch.device(config.device)
    # Remember the caller's mode so evaluation has no lasting side effect on it.
    was_training = model.training
    model.eval()

    total_rewards = []
    try:
        env = AtariWrappers(env, config.frame_skip)
        # This first reset only seeds the environment; every episode below
        # starts with its own unseeded reset.
        env.reset(seed=config.seed)
        env.action_space.seed(config.seed)

        for _ in range(num_episodes):
            state, _ = env.reset()
            state = torch.tensor(state, device=device)
            done = False
            episode_reward = 0
            while not done:
                with torch.no_grad():
                    # unsqueeze(0) adds the batch dimension the model is fed with.
                    logits, _ = model(state.unsqueeze(0))
                    probs = torch.distributions.Categorical(logits=logits)
                    action = probs.sample().item()
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                state = torch.tensor(next_state, device=device)
            total_rewards.append(episode_reward)
    finally:
        # Release the emulator and restore the model mode on every exit path.
        env.close()
        model.train(was_training)

    mean_reward = np.mean(total_rewards)
    std_reward = np.std(total_rewards)
    return mean_reward, std_reward

In [23]:
# Evaluate the trained A2C agent using the helper's default of 10 episodes.
a2c_custom_mean_reward, a2c_custom_std_reward = evaluate_model_a2c(
    a2c_custom_model, config_a2c
)
print(
    "A2C Evaluation over 10 episodes: "
    f"Mean Reward = {a2c_custom_mean_reward}, "
    f"Std Reward = {a2c_custom_std_reward}"
)

A2C Evaluation over 10 episodes: Mean Reward = 20.7, Std Reward = 0.9


## Metrics

In [24]:
import os  # local import: only needed to create the output directory below

# Gather every A2C training / evaluation metric into the dict written to JSON.
results_a2c = {
    'Episode Rewards': a2c_custom_episode_rewards,
    'Episode Times': a2c_episode_times,
    'Losses': a2c_custom_losses,
    'Loss Times': a2c_loss_times,
    'Threshold Times': a2c_threshold_times,
    'Total Training Time': a2c_custom_total_training_time,
    'Total Steps': a2c_custom_total_steps,
    'Total Episodes': a2c_custom_total_episodes,
    'Timesteps per Episode': a2c_custom_timesteps_per_episode,
    'Mean Evaluation Reward': a2c_custom_mean_reward,
    'Std Evaluation Reward': a2c_custom_std_reward
}

a2c_metrics_path = f"./metrics/a2c_custom_metrics_{env_id}_{training_time_max}.json"
# Create the target directory first: without it open() raises
# FileNotFoundError and the results of the whole training run are not saved.
os.makedirs(os.path.dirname(a2c_metrics_path), exist_ok=True)
with open(a2c_metrics_path, 'w') as f:
    json.dump(results_a2c, f)

# Print Summary
print("=== Training Summary ===")
print(f"Total Training Time: {a2c_custom_total_training_time:.2f} seconds")
print(f"Total Steps: {a2c_custom_total_steps}")
print(f"Total Episodes: {a2c_custom_total_episodes}")
print(f"Average Timesteps per Episode: {a2c_custom_timesteps_per_episode:.2f}")
print(f"Final Mean Reward (last 10 episodes): {np.mean(a2c_custom_episode_rewards[-10:]):.2f}")
print(f"Evaluation Mean Reward: {a2c_custom_mean_reward:.2f}")
print(f"Evaluation Std Reward: {a2c_custom_std_reward:.2f}")
print(f"Threshold Times: {a2c_threshold_times}")

=== Training Summary ===
Total Training Time: 32400.47 seconds
Total Steps: 27022336
Total Episodes: 12929
Average Timesteps per Episode: 2090.06
Final Mean Reward (last 10 episodes): 21.00
Evaluation Mean Reward: 20.70
Evaluation Std Reward: 0.90
Threshold Times: {0: 5608.170121192932, 10: 6153.035921096802, 21: 6583.0956292152405}


## Save Model

In [25]:
import os  # local import: only needed to create the output directory below

# Save the Model
a2c_model_path = f"./models/a2c_custom_model_{env_id}_{training_time_max}.pth"
a2c_repo_path = f"a2c_custom_model_{env_id}_{training_time_max}.pth"
a2c_repo_path_metrics = f"a2c_custom_metrics_{env_id}_{training_time_max}.json"
# Make sure ./models exists so torch.save cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(a2c_model_path), exist_ok=True)
torch.save(a2c_custom_model.state_dict(), a2c_model_path)

# Upload the weights first, then the metrics JSON, to the same Hub repository.
upload_file(
    path_or_fileobj=a2c_model_path,
    path_in_repo=a2c_repo_path,
    repo_id='maxstahl/a2c_pongnoframskip_v4_custom',
)

upload_file(
    path_or_fileobj=a2c_metrics_path,
    path_in_repo=a2c_repo_path_metrics,
    repo_id='maxstahl/a2c_pongnoframskip_v4_custom',
)

a2c_custom_model_PongNoFrameskip-v4_32400.pth:   0%|          | 0.00/6.76M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/maxstahl/a2c_pongnoframskip_v4_custom/commit/64c1da7d0fe24c06fa736bb04a81eaff701f33b9', commit_message='Upload a2c_custom_metrics_PongNoFrameskip-v4_32400.json with huggingface_hub', commit_description='', oid='64c1da7d0fe24c06fa736bb04a81eaff701f33b9', pr_url=None, pr_revision=None, pr_num=None)

# DQN Implementation

## Configurations

In [26]:
# Hyperparameters for the custom DQN run, collected in a plain dict first.
config_dict_dqn = dict(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    gym_id=env_id,
    total_timesteps=1e7,        # Upper bound on environment steps
    frame_skip=4,
    hidden_size=512,            # Width of the fully connected hidden layer
    learning_rate=1e-4,
    gamma=0.99,                 # Discount factor
    buffer_size=750000,         # Replay buffer size
    batch_size=32,              # Minibatch size for sampling
    epsilon_start=1.0,          # Initial epsilon for epsilon-greedy
    epsilon_final=0.01,         # Final epsilon
    epsilon_decay=1e6,          # Number of steps to decay epsilon
    target_update_freq=1000,    # Frequency (in steps) to update target network
    train_freq=4,               # Frequency (in steps) to train the network
    learning_starts=50000,      # Number of steps before starting training
    max_grad_norm=0.5,          # Maximum gradient norm
    time_limit=training_time_max,  # Wall-clock training budget in seconds
    seed=seed,
)

# Wrap the dict so the settings are read as attributes (config_dqn.gamma, ...).
config_dqn = Config(config_dict_dqn)

## Environment

In [27]:
# make_env hands back a callable; invoke it right away to obtain the single
# environment used for DQN training (rewards are left unclipped here).
env_dqn = make_env(
    config_dqn.gym_id,
    config_dqn.seed,
    config_dqn.frame_skip,
    clip_rewards=False,
)()

## Neural Network

In [28]:
class DQNNetwork(nn.Module):
    """Convolutional Q-network: three conv layers feeding a two-layer linear head.

    Args:
        input_shape: Shape of one observation; ``input_shape[0]`` is used as
            the number of input channels of the first convolution.
        n_actions: Size of the output layer (one value per action).
        hidden_size: Width of the hidden linear layer.
    """

    def __init__(self, input_shape, n_actions, hidden_size=512):
        super().__init__()
        in_channels = input_shape[0]
        # Layers are created in the same order as they sit in the Sequential,
        # so parameter names (conv.0, conv.2, conv.4) stay stable.
        feature_layers = [
            layer_init(nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # The head's input width depends on the observation size, so measure it.
        flat_features = self._get_conv_out(input_shape)
        head_layers = [
            layer_init(nn.Linear(flat_features, hidden_size)),
            nn.ReLU(),
            layer_init(nn.Linear(hidden_size, n_actions), std=0.01),
        ]
        self.fc = nn.Sequential(*head_layers)

    def _get_conv_out(self, shape):
        """Return how many values the conv stack emits for one input of ``shape``."""
        probe = torch.zeros(1, *shape)
        return int(np.prod(self.conv(probe).size()))

    def forward(self, x):
        """Scale ``x`` by 1/255, run the conv stack and return the head's output."""
        scaled = x / 255.0  # Normalize pixel values
        features = self.conv(scaled)
        batch = features.size(0)
        return self.fc(features.view(batch, -1))

## Replay Buffer

In [29]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them as tensors.

        Returns:
            ``(state, action, reward, next_state, done)`` where ``action`` is a
            ``torch.long`` tensor and the other four are ``torch.float32``.
        """
        transitions = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one stacked array per field.
        states, actions, rewards, next_states, dones = (
            np.array(field) for field in zip(*transitions)
        )
        states_t = torch.tensor(states, dtype=torch.float32)
        actions_t = torch.tensor(actions, dtype=torch.long)
        rewards_t = torch.tensor(rewards, dtype=torch.float32)
        next_states_t = torch.tensor(next_states, dtype=torch.float32)
        dones_t = torch.tensor(dones, dtype=torch.float32)
        return states_t, actions_t, rewards_t, next_states_t, dones_t

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

## Training

In [30]:
def train_dqn(config):
    """Train a DQN agent on the module-level ``env_dqn`` environment.

    The loop acts epsilon-greedily (epsilon decays linearly from
    ``epsilon_start`` to ``epsilon_final`` over ``epsilon_decay`` steps),
    stores transitions in a ``ReplayBuffer`` and, once ``learning_starts``
    steps have passed, performs one MSE update on one-step TD targets every
    ``train_freq`` steps. The target network is synced to the policy network
    every ``target_update_freq`` steps (checked on update steps only).
    Training ends when ``total_timesteps`` steps are reached or when
    ``time_limit`` seconds of wall-clock time have elapsed.

    Only ``terminated`` is stored as the done flag in the replay buffer, so a
    time-limit truncation does not zero out the bootstrap term of the target;
    ``terminated or truncated`` is still what ends an episode in the loop.

    Args:
        config: Object exposing ``device``, ``hidden_size``, ``learning_rate``,
            ``buffer_size``, ``batch_size``, ``gamma``, ``epsilon_start``,
            ``epsilon_final``, ``epsilon_decay``, ``target_update_freq``,
            ``train_freq``, ``learning_starts``, ``max_grad_norm``,
            ``total_timesteps`` and ``time_limit``.

    Returns:
        Tuple ``(policy_net, episode_rewards, total_training_time, losses,
        total_steps, total_episodes, timesteps_per_episode, episode_times,
        loss_times, threshold_times)``. ``threshold_times`` maps each reward
        threshold (0, 10, 21) to the elapsed seconds at which an episode first
        reached it, or ``None`` if it never did.

    Side effects:
        Prints progress every 10000 steps and closes ``env_dqn`` on return, so
        the environment must be re-created before calling this again.
    """
    device = torch.device(config.device)
    n_actions = env_dqn.action_space.n
    input_shape = env_dqn.observation_space.shape

    # Initialize networks; the target network starts as an exact copy.
    policy_net = DQNNetwork(input_shape, n_actions, config.hidden_size).to(device)
    target_net = DQNNetwork(input_shape, n_actions, config.hidden_size).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=config.learning_rate)

    # Initialize replay buffer with specified capacity
    replay_buffer = ReplayBuffer(int(config.buffer_size))

    # Metrics
    episode_rewards = []
    losses = []
    total_steps = 0
    total_episodes = 0

    # Counters for episodes and rewards since last print
    episodes_since_last_print = 0
    episode_rewards_since_last_print = []

    # Time tracking
    episode_times = []  # Elapsed time at the end of each episode
    loss_times = []     # Elapsed time at each loss computation
    threshold_times = {0: None, 10: None, 21: None}  # For convergence speed metric

    # Epsilon schedule function
    def epsilon_by_step(step):
        """Linearly interpolate epsilon; clamps at epsilon_final after the decay window."""
        remaining = max(0, (config.epsilon_decay - step) / config.epsilon_decay)
        return config.epsilon_final + (config.epsilon_start - config.epsilon_final) * remaining

    # Start training; the clock starts only after all setup above is done.
    start_time = time.time()
    state, _ = env_dqn.reset()
    state = np.array(state)
    episode_reward = 0

    while total_steps < config.total_timesteps:
        # The wall-clock budget is checked before every environment step.
        if time.time() - start_time > config.time_limit:
            print("Time limit reached. Stopping training.")
            break

        epsilon = epsilon_by_step(total_steps)
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = env_dqn.action_space.sample()
        else:
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                q_values = policy_net(state_tensor)
                action = q_values.argmax().item()

        next_state, reward, terminated, truncated, _ = env_dqn.step(action)
        done = terminated or truncated
        episode_reward += reward
        total_steps += 1

        # Store `terminated`, not `done`: a truncated episode has not reached a
        # terminal state, so its target must still bootstrap from next_state.
        replay_buffer.push(state, action, reward, next_state, terminated)
        state = np.array(next_state)

        # If done, record episode statistics and reset the environment
        if done:
            episode_rewards.append(episode_reward)
            episode_rewards_since_last_print.append(episode_reward)

            # Record episode time
            elapsed_time = time.time() - start_time
            episode_times.append(elapsed_time)

            # Record the first time each reward threshold is reached
            for threshold in threshold_times:
                if episode_reward >= threshold and threshold_times[threshold] is None:
                    threshold_times[threshold] = elapsed_time

            episode_reward = 0
            total_episodes += 1
            episodes_since_last_print += 1
            state, _ = env_dqn.reset()
            state = np.array(state)

        # Start training only after collecting sufficient data
        if total_steps > config.learning_starts and total_steps % config.train_freq == 0:
            if len(replay_buffer) > config.batch_size:
                # Sample a batch
                batch_state, batch_action, batch_reward, batch_next_state, batch_done = replay_buffer.sample(config.batch_size)
                batch_state = batch_state.to(device)
                batch_action = batch_action.to(device)
                batch_reward = batch_reward.to(device)
                batch_next_state = batch_next_state.to(device)
                batch_done = batch_done.to(device)

                # Q(s, a) of the actions that were actually taken
                current_q_values = policy_net(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)

                # Compute target Q values using standard DQN logic
                with torch.no_grad():
                    next_q_values = target_net(batch_next_state).max(1)[0]
                    target_q_values = batch_reward + config.gamma * next_q_values * (1 - batch_done)

                # Compute loss
                loss = nn.functional.mse_loss(current_q_values, target_q_values)
                losses.append(loss.item())

                # Record time at loss calculation
                loss_times.append(time.time() - start_time)

                # Optimize the model
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(policy_net.parameters(), config.max_grad_norm)
                optimizer.step()

                # Update target network periodically
                if total_steps % config.target_update_freq == 0:
                    target_net.load_state_dict(policy_net.state_dict())

        # Print progress
        if total_steps % 10000 == 0:
            # Slicing already copes with fewer than 10 episodes; only the empty
            # case needs a guard so np.mean does not warn on an empty list.
            mean_reward = np.mean(episode_rewards[-10:]) if episode_rewards else float("nan")
            if episodes_since_last_print > 0:
                mean_reward_last_episodes = np.mean(episode_rewards_since_last_print)
            else:
                mean_reward_last_episodes = 0
            print(f"Total Steps: {total_steps}, Episodes: {total_episodes}, Mean Reward total: {mean_reward:.2f}, Episodes since last print: {episodes_since_last_print}, Mean Reward Last Episodes: {mean_reward_last_episodes:.2f}")
            # Reset the counters
            episodes_since_last_print = 0
            episode_rewards_since_last_print = []

    total_training_time = time.time() - start_time
    timesteps_per_episode = total_steps / total_episodes if total_episodes > 0 else 0
    print("Training complete!")
    print(f"Total training time: {total_training_time:.2f} seconds")
    env_dqn.close()
    return policy_net, episode_rewards, total_training_time, losses, total_steps, total_episodes, timesteps_per_episode, episode_times, loss_times, threshold_times


In [32]:
# Run the DQN training loop and unpack everything it reports, in the order
# train_dqn returns it.
(
    dqn_custom_model,
    dqn_custom_episode_rewards,
    dqn_custom_total_training_time,
    dqn_custom_losses,
    dqn_custom_total_steps,
    dqn_custom_total_episodes,
    dqn_custom_timesteps_per_episode,
    dqn_episode_times,
    dqn_loss_times,
    dqn_threshold_times,
) = train_dqn(config_dqn)

Total Steps: 10000, Episodes: 10, Mean Reward total: -20.40, Episodes since last print: 10, Mean Reward Last Episodes: -20.40
Total Steps: 20000, Episodes: 21, Mean Reward total: -20.80, Episodes since last print: 11, Mean Reward Last Episodes: -20.73
Total Steps: 30000, Episodes: 32, Mean Reward total: -20.20, Episodes since last print: 11, Mean Reward Last Episodes: -20.18
Total Steps: 40000, Episodes: 43, Mean Reward total: -20.60, Episodes since last print: 11, Mean Reward Last Episodes: -20.64
Total Steps: 50000, Episodes: 53, Mean Reward total: -19.70, Episodes since last print: 10, Mean Reward Last Episodes: -19.70
Total Steps: 60000, Episodes: 65, Mean Reward total: -20.20, Episodes since last print: 12, Mean Reward Last Episodes: -20.17
Total Steps: 70000, Episodes: 76, Mean Reward total: -20.40, Episodes since last print: 11, Mean Reward Last Episodes: -20.45
Total Steps: 80000, Episodes: 86, Mean Reward total: -20.40, Episodes since last print: 10, Mean Reward Last Episodes:

## Evaluation

In [33]:
def evaluate_model_dqn(model, config, num_episodes=10):
    """Play the DQN policy greedily on a fresh environment and summarise the returns.

    The action at every step is the argmax of the model's output for the
    current observation (no exploration).

    Args:
        model: ``nn.Module`` mapping a batched float32 observation tensor to
            one value per action.
        config: Object exposing ``gym_id``, ``frame_skip``, ``seed`` and
            ``device``.
        num_episodes: Number of complete episodes to play.

    Returns:
        ``(mean_reward, std_reward)`` computed with NumPy over the
        per-episode returns.

    Notes:
        The environment is always closed and the model is put back into the
        train/eval mode it had on entry, even if a rollout raises.
    """
    env = gym.make(config.gym_id, render_mode='rgb_array', frameskip=1)
    device = torch.device(config.device)
    # Remember the caller's mode so evaluation has no lasting side effect on it.
    was_training = model.training
    model.eval()

    total_rewards = []
    try:
        env = AtariWrappers(env, config.frame_skip)
        # This first reset only seeds the environment; every episode below
        # starts with its own unseeded reset.
        env.reset(seed=config.seed)
        env.action_space.seed(config.seed)

        for _ in range(num_episodes):
            state, _ = env.reset()
            state = np.array(state)
            done = False
            episode_reward = 0
            while not done:
                with torch.no_grad():
                    # unsqueeze(0) adds the batch dimension the model is fed with.
                    state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                    q_values = model(state_tensor)
                    action = q_values.argmax().item()
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                state = np.array(next_state)
            total_rewards.append(episode_reward)
    finally:
        # Release the emulator and restore the model mode on every exit path.
        env.close()
        model.train(was_training)

    mean_reward = np.mean(total_rewards)
    std_reward = np.std(total_rewards)
    return mean_reward, std_reward

In [34]:
# Evaluate the trained DQN agent using the helper's default of 10 episodes.
dqn_custom_mean_reward, dqn_custom_std_reward = evaluate_model_dqn(
    dqn_custom_model, config_dqn
)
print(
    "DQN Evaluation over 10 episodes: "
    f"Mean Reward = {dqn_custom_mean_reward}, "
    f"Std Reward = {dqn_custom_std_reward}"
)

DQN Evaluation over 10 episodes: Mean Reward = 18.7, Std Reward = 0.45825756949558394


## Metrics

In [35]:
import os  # local import: only needed to create the output directory below

# Gather every DQN training / evaluation metric into the dict written to JSON.
results_dqn = {
    'Episode Rewards': dqn_custom_episode_rewards,
    'Episode Times': dqn_episode_times,
    'Losses': dqn_custom_losses,
    'Loss Times': dqn_loss_times,
    'Threshold Times': dqn_threshold_times,
    'Total Training Time': dqn_custom_total_training_time,
    'Total Steps': dqn_custom_total_steps,
    'Total Episodes': dqn_custom_total_episodes,
    'Timesteps per Episode': dqn_custom_timesteps_per_episode,
    'Mean Evaluation Reward': dqn_custom_mean_reward,
    'Std Evaluation Reward': dqn_custom_std_reward
}

dqn_metrics_path = f"./metrics/dqn_custom_metrics_{env_id}_{training_time_max}.json"
# Create the target directory first: without it open() raises
# FileNotFoundError and the results of the whole training run are not saved.
os.makedirs(os.path.dirname(dqn_metrics_path), exist_ok=True)
with open(dqn_metrics_path, 'w') as f:
    json.dump(results_dqn, f)

# Print Summary
print("=== Training Summary ===")
print(f"Total Training Time: {dqn_custom_total_training_time:.2f} seconds")
print(f"Total Steps: {dqn_custom_total_steps}")
print(f"Total Episodes: {dqn_custom_total_episodes}")
print(f"Average Timesteps per Episode: {dqn_custom_timesteps_per_episode:.2f}")
print(f"Final Mean Reward (last 10 episodes): {np.mean(dqn_custom_episode_rewards[-10:]):.2f}")
print(f"Evaluation Mean Reward: {dqn_custom_mean_reward:.2f}")
print(f"Evaluation Std Reward: {dqn_custom_std_reward:.2f}")
print(f"Threshold Times: {dqn_threshold_times}")

=== Training Summary ===
Total Training Time: 32400.04 seconds
Total Steps: 2155108
Total Episodes: 1197
Average Timesteps per Episode: 1800.42
Final Mean Reward (last 10 episodes): 19.40
Evaluation Mean Reward: 18.70
Evaluation Std Reward: 0.46
Threshold Times: {0: 11609.60074019432, 10: 13503.442669153214, 21: 21429.59261894226}


## Save Model

In [36]:
import os  # local import: only needed to create the output directory below

# Save the Model
dqn_model_path = f"./models/dqn_custom_model_{env_id}_{training_time_max}.pth"
dqn_repo_path = f"dqn_custom_model_{env_id}_{training_time_max}.pth"
dqn_repo_path_metrics = f"dqn_custom_metrics_{env_id}_{training_time_max}.json"
# Make sure ./models exists so torch.save cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(dqn_model_path), exist_ok=True)
torch.save(dqn_custom_model.state_dict(), dqn_model_path)

# Upload the weights first, then the metrics JSON, to the same Hub repository.
upload_file(
    path_or_fileobj=dqn_model_path,
    path_in_repo=dqn_repo_path,
    repo_id='maxstahl/dqn_pongnoframskip_v4_custom',
)

upload_file(
    path_or_fileobj=dqn_metrics_path,
    path_in_repo=dqn_repo_path_metrics,
    repo_id='maxstahl/dqn_pongnoframskip_v4_custom',
)

dqn_custom_model_PongNoFrameskip-v4_32400.pth:   0%|          | 0.00/6.75M [00:00<?, ?B/s]

dqn_custom_metrics_PongNoFrameskip-v4_32400.json:   0%|          | 0.00/22.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/maxstahl/dqn_pongnoframskip_v4_custom/commit/ef9ca69ce65b767da4cccabfda335e754790218d', commit_message='Upload dqn_custom_metrics_PongNoFrameskip-v4_32400.json with huggingface_hub', commit_description='', oid='ef9ca69ce65b767da4cccabfda335e754790218d', pr_url=None, pr_revision=None, pr_num=None)

# Summary

In [37]:
# One tuple per algorithm, with fields in the same order as summary_columns.
summary_columns = [
    'Model',
    'Mean Reward',
    'Std Reward',
    'Training Time (s)',
    'Total Steps',
    'Total Episodes',
    'Timesteps per Episode',
]
summary_rows = [
    ('PPO', ppo_custom_mean_reward, ppo_custom_std_reward, ppo_custom_total_training_time,
     ppo_custom_total_steps, ppo_custom_total_episodes, ppo_custom_timesteps_per_episode),
    ('A2C', a2c_custom_mean_reward, a2c_custom_std_reward, a2c_custom_total_training_time,
     a2c_custom_total_steps, a2c_custom_total_episodes, a2c_custom_timesteps_per_episode),
    ('DQN', dqn_custom_mean_reward, dqn_custom_std_reward, dqn_custom_total_training_time,
     dqn_custom_total_steps, dqn_custom_total_episodes, dqn_custom_timesteps_per_episode),
]

# Pivot the rows into the column -> list-of-values mapping the DataFrame is built from.
summary_data = {
    column: [row[position] for row in summary_rows]
    for position, column in enumerate(summary_columns)
}

summary_df = pd.DataFrame(summary_data)

display(summary_df)

Unnamed: 0,Model,Mean Reward,Std Reward,Training Time (s),Total Steps,Total Episodes,Timesteps per Episode
0,PPO,19.5,1.688194,32401.683105,8514560,4074,2089.975454
1,A2C,20.7,0.9,32400.472499,27022336,12929,2090.056153
2,DQN,18.7,0.458258,32400.043806,2155108,1197,1800.424394
