# Snake RL - Results Analysis

This notebook analyzes training results and generates publication-ready figures for the IB Extended Essay.

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats

# Plot appearance: seaborn grid theme first, then matplotlib defaults on top
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.dpi': 100,  # resolution used when figures are rendered
    'font.size': 10,    # base font size for text elements
})

%matplotlib inline

## 1. Load Training Data

Load CSV logs from multiple training runs.

In [None]:
def load_run(run_path):
    """Read the ``progress.csv`` log of a single training run.

    Args:
        run_path: Directory of the run (str or path-like); the log is
            expected at ``<run_path>/progress.csv``.

    Returns:
        A DataFrame parsed from the CSV, or ``None`` (after printing a
        warning) when the file does not exist.
    """
    progress_file = Path(run_path) / "progress.csv"

    # Guard clause: a missing log is reported, not raised, so callers can
    # load several runs and simply drop the ones that are absent.
    if not progress_file.exists():
        print(f"Warning: {progress_file} not found")
        return None

    return pd.read_csv(progress_file)

# Example: Load your runs here (display label -> run directory)
run_dirs = {
    'Baseline': 'runs/baseline',
    'Distance Shaping': 'runs/distance_shaping',
    'Frame Stacking': 'runs/frame_stacking',
}

# Attempt every run in order; load_run returns None for a missing log
loaded_runs = {label: load_run(path) for label, path in run_dirs.items()}

# Keep only the runs that were actually found on disk
runs = {label: frame for label, frame in loaded_runs.items() if frame is not None}

print(f"Loaded {len(runs)} runs:")
for name in runs:
    print(f"  - {name}")

## 2. Learning Curves

Plot episode reward over time for all treatments.

In [None]:
# Learning curves: a smoothed line per run with the raw curve drawn faintly
# behind it in the same colour.
reward_col = 'rollout/ep_rew_mean'
step_col = 'time/total_timesteps'
smooth_window = 10  # number of logged rows in the moving average

fig, ax = plt.subplots(figsize=(12, 6))

for name, df in runs.items():
    if not {reward_col, step_col}.issubset(df.columns):
        continue  # this run lacks a column the plot needs

    steps = df[step_col]
    reward_raw = df[reward_col]
    # min_periods=1 keeps the first rows instead of leaving them NaN
    reward_smooth = reward_raw.rolling(window=smooth_window, min_periods=1).mean()

    # Plot the smoothed curve first and keep its handle so the raw trace
    # can reuse the colour matplotlib assigned to it.
    (smooth_line,) = ax.plot(steps, reward_smooth, label=name, linewidth=2)
    ax.plot(steps, reward_raw, alpha=0.2, linewidth=1, color=smooth_line.get_color())

ax.set_xlabel('Timesteps', fontsize=12)
ax.set_ylabel('Episode Reward (mean)', fontsize=12)
ax.set_title('Learning Curves Comparison', fontsize=14)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('learning_curves.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Sample Efficiency

Compare how quickly each treatment learns.

In [None]:
# Sample efficiency: time-averaged reward of each run, computed as the area
# under the reward-vs-timesteps curve divided by the timestep span it covers.
efficiency = {}

# NumPy 2.0 renamed np.trapz to np.trapezoid and deprecated the old name;
# prefer the new one and fall back only on older NumPy versions.
_trapezoid = getattr(np, 'trapezoid', None) or np.trapz

for name, df in runs.items():
    if 'rollout/ep_rew_mean' in df.columns and 'time/total_timesteps' in df.columns:
        # Drop rows where either value is missing: a single NaN would
        # otherwise turn the whole integral into NaN.
        curve = df[['time/total_timesteps', 'rollout/ep_rew_mean']].dropna()
        x = curve['time/total_timesteps'].to_numpy(dtype=float)
        y = curve['rollout/ep_rew_mean'].to_numpy(dtype=float)

        # The integral spans [x[0], x[-1]], so that span (not x[-1] alone)
        # is the correct normaliser for a mean reward.
        span = x[-1] - x[0] if len(x) > 1 else 0.0
        if span <= 0:
            print(f"Warning: {name} has too few valid samples for AUC; skipping")
            continue

        # Use trapezoidal rule for AUC
        auc = _trapezoid(y, x)
        efficiency[name] = auc / span

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
names = list(efficiency.keys())
values = list(efficiency.values())

ax.bar(names, values, alpha=0.8)
ax.set_ylabel('Mean Reward (AUC normalized)', fontsize=12)
ax.set_title('Sample Efficiency Comparison', fontsize=14)
ax.grid(True, alpha=0.3, axis='y')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('sample_efficiency.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nSample Efficiency (higher is better):")
for name, val in efficiency.items():
    print(f"  {name}: {val:.3f}")

## 4. Final Performance

Compare the final performance of each treatment, measured over the last 100 logged rows of its training log (not the last 100 timesteps).

In [None]:
# Final performance: summary statistics of the mean episode reward over the
# last `n_last` logged rows of each run.
n_last = 100
final_stats = {}

for name, df in runs.items():
    if 'rollout/ep_rew_mean' not in df.columns:
        continue  # nothing to summarise for this run

    tail_rewards = df['rollout/ep_rew_mean'].tail(n_last)
    final_stats[name] = dict(
        mean=tail_rewards.mean(),
        std=tail_rewards.std(),
        min=tail_rewards.min(),
        max=tail_rewards.max(),
    )

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
names = list(final_stats)
# The loop variable is deliberately not called `stats`, which is the name
# of the imported scipy module.
means = [entry['mean'] for entry in final_stats.values()]
stds = [entry['std'] for entry in final_stats.values()]

ax.bar(names, means, yerr=stds, capsize=5, alpha=0.8)
ax.set_ylabel('Episode Reward (mean ± std)', fontsize=12)
ax.set_title(f'Final Performance (last {n_last} samples)', fontsize=14)
ax.grid(True, alpha=0.3, axis='y')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('final_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nFinal Performance Statistics:")
for name, entry in final_stats.items():
    print(f"\n{name}:")
    print(f"  Mean: {entry['mean']:.2f} ± {entry['std']:.2f}")
    print(f"  Range: [{entry['min']:.2f}, {entry['max']:.2f}]")

## 5. Statistical Tests

Perform t-tests to determine if differences are statistically significant.

In [None]:
# Compare baseline with each treatment using an independent-samples t-test
# and Cohen's d on the last `n_last` logged rows of each run.
#
# NOTE(review): consecutive rows of 'rollout/ep_rew_mean' are presumably
# running averages over recent episodes and therefore likely correlated;
# the t-test assumes independent samples, so treat the p-values as
# indicative only — confirm against how the log is produced.
reward_col = 'rollout/ep_rew_mean'
baseline_df = runs.get('Baseline')

if baseline_df is None:
    print("No 'Baseline' run found for comparison")
elif reward_col not in baseline_df.columns:
    # Previously this case raised KeyError when indexing the column
    print(f"'Baseline' run has no '{reward_col}' column; cannot compare")
else:
    # Drop NaN rows: they would otherwise propagate to a NaN t and p
    baseline_data = baseline_df.tail(n_last)[reward_col].dropna()

    print("Statistical Comparisons (vs. Baseline):\n")

    for name, df in runs.items():
        if name == 'Baseline' or reward_col not in df.columns:
            continue

        treatment_data = df.tail(n_last)[reward_col].dropna()

        print(f"{name}:")

        # A standard deviation (and hence the test) needs >= 2 samples per group
        if len(baseline_data) < 2 or len(treatment_data) < 2:
            print("  Skipped: fewer than 2 valid reward samples in one of the groups")
            print()
            continue

        # Independent samples t-test
        t_stat, p_value = stats.ttest_ind(baseline_data, treatment_data)

        # Cohen's d (effect size); undefined when both groups have zero variance
        pooled_std = np.sqrt((baseline_data.std()**2 + treatment_data.std()**2) / 2)
        if pooled_std > 0:
            cohens_d = (treatment_data.mean() - baseline_data.mean()) / pooled_std
        else:
            cohens_d = float('nan')

        print(f"  t-statistic: {t_stat:.3f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Cohen's d: {cohens_d:.3f}")

        if np.isnan(p_value):
            # NaN compares False with everything; do not report it as
            # "no significant difference"
            print(f"  ? Test inconclusive (p-value is undefined)")
        elif p_value < 0.05:
            print(f"  ✓ Significant difference (p < 0.05)")
        else:
            print(f"  ✗ No significant difference (p >= 0.05)")
        print()

## 6. Training Stability

Analyze the variability of the mean episode reward over time, using a rolling standard deviation.

In [None]:
# Training stability: rolling standard deviation of the logged mean reward.
reward_col = 'rollout/ep_rew_mean'
step_col = 'time/total_timesteps'
std_window = 50  # number of logged rows per rolling window

fig, ax = plt.subplots(figsize=(12, 6))

for name, df in runs.items():
    if not {reward_col, step_col}.issubset(df.columns):
        continue  # this run lacks a column the plot needs

    # Calculate rolling standard deviation
    rolling_std = df[reward_col].rolling(window=std_window, min_periods=1).std()
    ax.plot(df[step_col], rolling_std, label=name, linewidth=2)

ax.set_xlabel('Timesteps', fontsize=12)
ax.set_ylabel('Episode Reward Std Dev (rolling)', fontsize=12)
ax.set_title('Training Stability (lower is more stable)', fontsize=14)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_stability.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Summary Table

Create a comprehensive summary table for the EE report.

In [None]:
# Summary table: one row per run with final reward, sample efficiency,
# final-window standard deviation and total timesteps.
reward_col = 'rollout/ep_rew_mean'
step_col = 'time/total_timesteps'
summary_data = []

for name, df in runs.items():
    if reward_col not in df.columns:
        continue

    last_data = df.tail(n_last)[reward_col]

    # Take the last recorded timestep if the column exists and has a value;
    # indexing it unconditionally raised KeyError for runs without it.
    if step_col in df.columns:
        steps = df[step_col].dropna()
    else:
        steps = pd.Series(dtype=float)
    total_steps = f"{int(steps.iloc[-1]):,}" if not steps.empty else "n/a"

    # Show "n/a" rather than 0.000 when no efficiency was computed, so a
    # missing value cannot be mistaken for a real zero.
    sample_eff = f"{efficiency[name]:.3f}" if name in efficiency else "n/a"

    summary_data.append({
        'Treatment': name,
        'Final Reward': f"{last_data.mean():.2f} ± {last_data.std():.2f}",
        'Sample Efficiency': sample_eff,
        'Stability (Std)': f"{last_data.std():.2f}",
        'Total Timesteps': total_steps,
    })

summary_df = pd.DataFrame(summary_data)
# "\n=" * 80 repeated the newline too and printed 80 one-character lines;
# the intent is a blank line followed by a single 80-character rule.
print("\n" + "=" * 80)
print("SUMMARY TABLE")
print("=" * 80)
print(summary_df.to_string(index=False))
print("=" * 80)

# Save to CSV for LaTeX/Word
summary_df.to_csv('results_summary.csv', index=False)
print("\n✓ Summary saved to results_summary.csv")

## 8. Export All Figures

The figures are written at 300 dpi by the cells above; this cell only lists the output files. To regenerate them, re-run the notebook from the top.

In [None]:
# Recap of the output files; this cell only prints their names
saved_outputs = [
    "learning_curves.png",
    "sample_efficiency.png",
    "final_performance.png",
    "training_stability.png",
    "results_summary.csv",
]

print("All figures have been saved:")
for filename in saved_outputs:
    print(f"  - {filename}")
print("\n✓ Ready for use in your IB Extended Essay!")