# Agent Performance Analysis and Comparison

Before running this notebook, run `python -m src.rl.evaluate` to generate the evaluation results.

In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from glob import glob
from src import plotting_utils

# Apply the seaborn theme used by the figures in this notebook:
# 'paper' context, 'ticks' style, and no extra font scaling.
sns.set_theme(context='paper', style='ticks', font_scale=1)

## Configuration

Update these paths to match your setup:

In [2]:
# grid_size selects the results and figures sub-folders built below.
grid_size = "10x10"  # Set the correct grid size
results_path = f"../outputs/results/{grid_size}/testing"  # evaluation JSON files are read from here
figures_path = f"../figures/analysis/{grid_size}"  # generated figures are saved here

# Create figures directory if it doesn't exist
os.makedirs(figures_path, exist_ok=True)

# Width handed to plotting_utils.get_fig_dim by the plotting cells.
# NOTE(review): presumably the LaTeX text width of the target document, in points -- confirm.
width_pt = 469

Load all available evaluation results from the testing directory.

In [5]:
def load_evaluation_results(results_path):
    """Load agent evaluation results from every JSON file in a directory.

    Two file layouts are recognised:

    * Comprehensive file: a dict containing at least the keys 'DQN' and
      'Random'. Every top-level key is treated as an agent name.
    * Single-agent file: a dict with an 'agent_type' key. The whole dict is
      stored under the upper-cased agent type.

    Files are processed in sorted path order, so when several files provide
    results for the same agent the file that sorts last wins. Files that
    cannot be read or parsed, or that match neither layout, are reported
    and skipped.

    Args:
        results_path: Directory searched (non-recursively) for ``*.json`` files.

    Returns:
        dict mapping agent name to that agent's results as loaded from JSON.
        Empty if no usable file was found.
    """
    all_results = {}

    # glob() gives no ordering guarantee; sorting makes "later file overrides
    # earlier file" reproducible between runs and machines.
    json_files = sorted(glob(os.path.join(results_path, "*.json")))

    print(f"Found {len(json_files)} JSON files in {results_path}")

    for json_file in json_files:
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)

            # Check if this is a comprehensive evaluation file (multiple agents)
            if isinstance(data, dict) and 'DQN' in data and 'Random' in data:
                all_results.update(data)
                print(f"Loaded comprehensive results from: {os.path.basename(json_file)}")

            # Check if this is a single agent evaluation file
            elif isinstance(data, dict) and 'agent_type' in data:
                agent_type = data['agent_type']
                all_results[agent_type.upper()] = data
                print(f"Loaded {agent_type} results from: {os.path.basename(json_file)}")

            else:
                # Previously such files were dropped without any trace.
                print(f"Skipping {os.path.basename(json_file)}: unrecognized results format")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error loading {json_file}: {e}")

    print(f"\nLoaded results for {len(all_results)} agents: {list(all_results.keys())}")
    return all_results

# Load every evaluation JSON found under results_path; the returned dict is keyed by agent name.
all_results = load_evaluation_results(results_path)

Found 1 JSON files in ../outputs/results/10x10/testing
Loaded comprehensive results from: all_agents_evaluation_20250814_231830.json

Loaded results for 9 agents: ['DQN', 'Random', 'Gen Greedy (r=1)', 'Gen Greedy (r=2)', 'Gen Greedy (r=3)', 'Gen Greedy (r=4)', 'Gen Greedy (r=5)', 'Gen Greedy (r=6)', 'Gen Greedy (r=7)']


Create a bar plot comparing all agents on the percentage of cells saved.

In [8]:
# Bar chart comparing all agents on the percentage of cells saved.
plotting_utils.latexify()

fig_width, fig_height = plotting_utils.get_fig_dim(width_pt, fraction=0.8)
fig, ax = plt.subplots(figsize=(fig_width, fig_height))

timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# One record per (agent, evaluated instance) pair
plot_data = pd.DataFrame([
    {'Agent': name, 'Cells Saved (%)': instance_result['cells_saved_pct']}
    for name, results in all_results.items()
    for instance_result in results['instance_results']
])

# One distinct hue per agent
n_agents = plot_data['Agent'].nunique()
palette = sns.color_palette("husl", n_colors=n_agents)
sns.barplot(
    data=plot_data,
    x='Agent',
    y='Cells Saved (%)',
    hue='Agent',
    errorbar='ci',
    palette=palette,
    ax=ax,
)
sns.despine(ax=ax)

# Major y ticks every 10 units, with the grid switched on
ax.yaxis.set_major_locator(plt.MultipleLocator(10))
ax.grid(True)

# The percent sign is escaped in the y label but not in the column name above
ax.set(xlabel='Agent', ylabel=r'Cells Saved (\%)')
ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
comparison_plot_path = f"{figures_path}/agent_comparison_{timestamp}.png"
fig.savefig(comparison_plot_path, dpi=300)

Analyze agent performance across different difficulty levels.

In [7]:
# Grouped bar chart: percentage of cells saved per difficulty level, one bar per agent.
plotting_utils.latexify()

fig_width, fig_height = plotting_utils.get_fig_dim(width_pt, fraction=1.0)
fig, ax = plt.subplots(figsize=(fig_width, fig_height))

timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# Order of the difficulty levels along the x axis
difficulty_order = ['very_easy', 'easy', 'medium', 'hard', 'very_hard']

# Keep only instances whose difficulty is one of the known levels;
# instances with a missing or unrecognised difficulty are left out.
difficulty_data = [
    {
        'Agent': agent_name,
        'Difficulty': instance_result['difficulty'],
        'Cells Saved (%)': instance_result['cells_saved_pct'],
    }
    for agent_name, results in all_results.items()
    for instance_result in results['instance_results']
    if instance_result.get('difficulty', 'unknown') in difficulty_order
]
difficulty_df = pd.DataFrame(difficulty_data)

# One distinct hue per agent
n_agents = difficulty_df['Agent'].nunique()
palette = sns.color_palette("husl", n_colors=n_agents)
sns.barplot(
    data=difficulty_df,
    x='Difficulty',
    y='Cells Saved (%)',
    hue='Agent',
    order=difficulty_order,
    errorbar='ci',
    palette=palette,
    ax=ax,
)
sns.despine(ax=ax)

ax.set(xlabel='Difficulty Level', ylabel=r'Cells Saved (\%)')
ax.tick_params(axis='x', rotation=45)
# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
difficulty_plot_path = f"{figures_path}/difficulty_comparison_{timestamp}.png"
fig.savefig(difficulty_plot_path, dpi=300, bbox_inches='tight')

Plot how the DQN agent's test set performance evolved during training.

In [3]:
# Look for DQN test scores file
logs_path = f"../outputs/logs/{grid_size}/dqn"
score_files = glob(os.path.join(logs_path, "dqn_test_scores_*.json"))

# Fail early with an actionable message rather than an IndexError further down
if not score_files:
    raise FileNotFoundError(
        f"No dqn_test_scores_*.json files found in {logs_path}; "
        "check grid_size and that the DQN logs exist."
    )

# Use the most recent file (newest modification time first)
score_files.sort(key=os.path.getmtime, reverse=True)
score_file = score_files[0]

print(f"Loading DQN test scores from: {os.path.basename(score_file)}")

# Load the data
with open(score_file, 'r') as f:
    score_data = json.load(f)

# An empty log would otherwise crash on episodes[0] / min() below
if not score_data:
    raise ValueError(f"{score_file} contains no evaluation points")

# Extract episodes and scores; each entry is read through its
# 'episode' and 'dqn_test_score' keys
episodes = [entry['episode'] for entry in score_data]
test_scores = [entry['dqn_test_score'] for entry in score_data]

print(f"Found {len(episodes)} evaluation points")
# First and last entries are reported as the range, i.e. file order is taken as-is
print(f"Episode range: {episodes[0]} to {episodes[-1]}")
print(f"Score range: {min(test_scores):.1f}% to {max(test_scores):.1f}%")

Loading DQN test scores from: dqn_test_scores_20250814_184057.json
Found 30 evaluation points
Episode range: 5000 to 150000
Score range: 34.9% to 55.5%


In [4]:
# Line plot of the DQN test-set score at each evaluation point during training.
plotting_utils.latexify()

fig_width, fig_height = plotting_utils.get_fig_dim(width_pt, fraction=0.8)
fig, ax = plt.subplots(figsize=(fig_width, fig_height))

timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# Frame for seaborn: one row per evaluation point
training_df = pd.DataFrame(dict(Episode=episodes, Score=test_scores))

sns.lineplot(data=training_df, x='Episode', y='Score', ax=ax)
sns.despine(ax=ax)

ax.set(xlabel='Training Episodes', ylabel=r'Test Set Score (\% Tiles Saved)')

fig.tight_layout()
training_plot_path = f"{figures_path}/dqn_training_progress_{timestamp}.png"
fig.savefig(training_plot_path, dpi=300, bbox_inches='tight')