# 1. Imports

In [1]:
from huggingface_hub import hf_hub_download
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.atari_wrappers import AtariWrapper
from stable_baselines3.common.monitor import Monitor

import gymnasium as gym
import torch
import numpy as np
import warnings
import seaborn as sns
import os
import json
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# 2. Load Trained Models

In [3]:
sb3_models_info = [
    # DQN Models
    {
        "algorithm": "DQN",
        "model_name": "dqn_model_10800",
        "repo_id": "maxstahl/dqn_pongnoframskip_v4_sb3",
        "filename": "dqn_PongNoFrameskip-v4_10800.zip"
    },
    {
        "algorithm": "DQN",
        "model_name": "dqn_model_21600",
        "repo_id": "maxstahl/dqn_pongnoframskip_v4_sb3",
        "filename": "dqn_PongNoFrameskip-v4_21600.zip"
    },
    {
        "algorithm": "DQN",
        "model_name": "dqn_model_32400",
        "repo_id": "maxstahl/dqn_pongnoframskip_v4_sb3",
        "filename": "dqn_PongNoFrameskip-v4_32400.zip"
    },
    # A2C Models
    {
        "algorithm": "A2C",
        "model_name": "a2c_model_10800",
        "repo_id": "maxstahl/a2c_pongnoframskip_v4_sb3",
        "filename": "a2c_PongNoFrameskip-v4_10800.zip"
    },
    {
        "algorithm": "A2C",
        "model_name": "a2c_model_21600",
        "repo_id": "maxstahl/a2c_pongnoframskip_v4_sb3",
        "filename": "a2c_PongNoFrameskip-v4_21600.zip"
    },
    {
        "algorithm": "A2C",
        "model_name": "a2c_model_32400",
        "repo_id": "maxstahl/a2c_pongnoframskip_v4_sb3",
        "filename": "a2c_PongNoFrameskip-v4_32400.zip"
    },
    # PPO Models
    {
        "algorithm": "PPO",
        "model_name": "ppo_model_10800",
        "repo_id": "maxstahl/ppo_pongnoframskip_v4_sb3",
        "filename": "ppo_PongNoFrameskip-v4_10800.zip"
    },
    {
        "algorithm": "PPO",
        "model_name": "ppo_model_21600",
        "repo_id": "maxstahl/ppo_pongnoframskip_v4_sb3",
        "filename": "ppo_PongNoFrameskip-v4_21600.zip"
    },
    {
        "algorithm": "PPO",
        "model_name": "ppo_model_32400",
        "repo_id": "maxstahl/ppo_pongnoframskip_v4_sb3",
        "filename": "ppo_PongNoFrameskip-v4_32400.zip"
    }
]

In [4]:
# Initialize an empty list to store loaded models
loaded_sb3_models = []

# Iterate over each model info and load the models
for model_info in sb3_models_info:
    print(f"Loading model: {model_info['model_name']} ({model_info['algorithm']})")
    model_path = hf_hub_download(
        repo_id=model_info["repo_id"],
        filename=model_info["filename"]
    )
    if model_info["algorithm"] == "DQN":
        model = DQN.load(model_path, device='cpu')  # Change 'cpu' to 'cuda' if using GPU
    elif model_info["algorithm"] == "A2C":
        model = A2C.load(model_path, device='cpu')
    elif model_info["algorithm"] == "PPO":
        model = PPO.load(model_path, device='cpu')
    else:
        raise ValueError(f"Unsupported algorithm: {model_info['algorithm']}")
    
    # Append the loaded model along with its info to the list
    loaded_sb3_models.append({
        "algorithm": model_info["algorithm"],
        "model_name": model_info["model_name"],
        "model": model
    })

Loading model: dqn_model_10800 (DQN)


  self.comm = Comm(**args)


dqn_PongNoFrameskip-v4_10800.zip:   0%|          | 0.00/27.0M [00:00<?, ?B/s]

Loading model: dqn_model_21600 (DQN)
Loading model: dqn_model_32400 (DQN)
Loading model: a2c_model_10800 (A2C)
Loading model: a2c_model_21600 (A2C)
Loading model: a2c_model_32400 (A2C)
Loading model: ppo_model_10800 (PPO)


  self.comm = Comm(**args)


ppo_PongNoFrameskip-v4_10800.zip:   0%|          | 0.00/20.3M [00:00<?, ?B/s]

Loading model: ppo_model_21600 (PPO)
Loading model: ppo_model_32400 (PPO)


# 3. Evaluation Fuctions

In [10]:
def make_eval_env(env_id, seed):
    eval_env = gym.make(env_id, render_mode="rgb_array")
    eval_env = AtariWrapper(eval_env, clip_reward=False, terminal_on_life_loss=False)
    eval_env = Monitor(eval_env)
    eval_env.seed(seed)
    eval_env.action_space.seed(seed)
    return eval_env

In [11]:
def evaluate_and_save(model, env, num_episodes, save_path):
    """
    Evaluate an SB3 model and save the metrics to a JSON file.

    Args:
        model (BaseAlgorithm): The SB3 model to evaluate.
        env (VecEnv): The environment to evaluate on.
        num_episodes (int): Number of episodes to evaluate.
        save_path (str): Path to save the JSON metrics.

    Returns:
        dict: Dictionary containing per-episode rewards, mean reward, and std reward.
    """
    # Initialize list to store rewards
    total_rewards = []

    for episode in range(1, num_episodes + 1):
        obs = env.reset()[0]  # Reset and get the initial observation
        done = False
        episode_reward = 0

        while not done:
            # Predict action using the model
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward

        total_rewards.append(episode_reward)
        print(f"Episode {episode}: Reward = {episode_reward}")

    # Calculate mean and standard deviation
    mean_reward = np.mean(total_rewards)
    std_reward = np.std(total_rewards)

    # Prepare metrics dictionary
    metrics = {
        'per_episode_rewards': total_rewards,
        'mean_reward': mean_reward,
        'std_reward': std_reward
    }

    # Ensure the directory exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Save metrics to JSON
    with open(save_path, 'w') as f:
        json.dump(metrics, f, indent=4)

    return metrics

# 4. Evaluation

In [15]:
#env_id = "PongNoFrameskip-v4"
env_id = "ALE/Pong-v5"
seed = 73

# Create the environment
env = make_eval_env(env_id=env_id, seed=seed)

In [16]:
# Define the number of evaluation episodes
NUM_EPISODES = 10

# Define the base directory for saving metrics
base_metrics_dir = './metrics/sb3_eval/test'

# Initialize a list to store evaluation results for the summary table
evaluation_results = []

# Iterate through each loaded SB3 model and evaluate
for sb3_model_info in loaded_sb3_models:
    algorithm = sb3_model_info["algorithm"]
    model_name = sb3_model_info["model_name"]
    model = sb3_model_info["model"]
    
    print(f"\nEvaluating {algorithm} Model: {model_name}")
    
    # Define the path to save metrics
    model_metrics_dir = os.path.join(base_metrics_dir, model_name)
    metrics_save_path = os.path.join(model_metrics_dir, 'metrics.json')
    
    # Evaluate the model and save metrics
    metrics = evaluate_and_save(
        model=model,
        env=env,
        num_episodes=NUM_EPISODES,
        save_path=metrics_save_path
    )
    
    print(f"Saved metrics to {metrics_save_path}")
    
    # Append results to the list for the summary table
    evaluation_results.append({
        "Algorithm": algorithm,
        "Model Name": model_name,
        "Mean Reward": metrics["mean_reward"],
        "Std Reward": metrics["std_reward"]
    })


Evaluating DQN Model: dqn_model_10800
Episode 1: Reward = -21.0
Episode 2: Reward = -20.0
Episode 3: Reward = -20.0
Episode 4: Reward = -20.0
Episode 5: Reward = -18.0
Episode 6: Reward = -21.0
Episode 7: Reward = -21.0
Episode 8: Reward = -19.0
Episode 9: Reward = -19.0
Episode 10: Reward = -21.0
Saved metrics to ./metrics/sb3_eval/test/dqn_model_10800/metrics.json

Evaluating DQN Model: dqn_model_21600
Episode 1: Reward = -19.0
Episode 2: Reward = -21.0
Episode 3: Reward = -19.0
Episode 4: Reward = -21.0
Episode 5: Reward = -21.0
Episode 6: Reward = -18.0
Episode 7: Reward = -21.0
Episode 8: Reward = -21.0
Episode 9: Reward = -20.0
Episode 10: Reward = -21.0
Saved metrics to ./metrics/sb3_eval/test/dqn_model_21600/metrics.json

Evaluating DQN Model: dqn_model_32400
Episode 1: Reward = -21.0
Episode 2: Reward = -21.0
Episode 3: Reward = -21.0
Episode 4: Reward = -21.0
Episode 5: Reward = -20.0
Episode 6: Reward = -20.0
Episode 7: Reward = -21.0
Episode 8: Reward = -21.0
Episode 9: Re

# 5. Evaluation Summary

In [17]:
# Create a Pandas DataFrame from the evaluation results
results_df = pd.DataFrame(evaluation_results)

# Display the summary table
print("\nSummary of SB3 Models Evaluation:")
print(results_df)


Summary of SB3 Models Evaluation:
  Algorithm       Model Name  Mean Reward  Std Reward
0       DQN  dqn_model_10800        -20.0    1.000000
1       DQN  dqn_model_21600        -20.2    1.077033
2       DQN  dqn_model_32400        -20.7    0.458258
3       A2C  a2c_model_10800        -19.7    0.900000
4       A2C  a2c_model_21600        -19.4    1.113553
5       A2C  a2c_model_32400        -21.0    0.000000
6       PPO  ppo_model_10800        -20.5    0.500000
7       PPO  ppo_model_21600        -20.9    0.300000
8       PPO  ppo_model_32400        -21.0    0.000000
