# Optimizer Comparison Analysis
## CSE 493S Project - Results Analysis and Visualization

**Team Members:** Jinghao Liu, Xuan Zhang, Yuzheng Zhang

This notebook analyzes and visualizes results from multiple optimizer experiments.

**Contents:**
1. Load and compare results from multiple experiments
2. Generate comparison plots
3. Statistical analysis
4. Hypothesis testing (H1, H2, H3)
5. Convergence analysis
6. Export of summary tables (LaTeX and CSV)

## Setup

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import glob

# Notebook-wide plotting defaults: seaborn grid theme, figure DPI and font size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.dpi': 100, 'font.size': 10})

# Visible confirmation that the setup cell ran to completion.
print("✓ Libraries imported")

## Load Experiment Results

In [None]:
def load_results(results_dir='results', dataset='cifar10'):
    """Load all experiment results for a dataset, grouped by optimizer.

    Every file matching ``{results_dir}/{dataset}_*_metrics.json`` is read.
    The number of matching files is printed.

    Args:
        results_dir: Directory containing the metrics JSON files.
        dataset: Dataset prefix used in the file names (e.g. ``'cifar10'``).

    Returns:
        A dict mapping the optimizer name (``config['optimizer']`` in the
        file) to a list with one dict per run. Each run dict has the keys
        ``'seed'``, ``'history'``, ``'best_accuracy'``, ``'final_accuracy'``
        and ``'config'``. The dict is empty when no file matches.

    Raises:
        KeyError: If a file lacks one of the keys read below.
    """
    pattern = f"{results_dir}/{dataset}_*_metrics.json"
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the order of runs (and of optimizers) reproducible across machines.
    files = sorted(glob.glob(pattern))

    print(f"Found {len(files)} experiment results for {dataset.upper()}")

    results = {}
    for file in files:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        config = data['config']
        results.setdefault(config['optimizer'], []).append({
            'seed': config['seed'],
            'history': data['history'],
            'best_accuracy': data['best_accuracy'],
            'final_accuracy': data['final_accuracy'],
            'config': config,
        })

    return results

# Read the metrics of both benchmarks (CIFAR-10 first, then CIFAR-100).
cifar10_results = load_results(results_dir='results', dataset='cifar10')
cifar100_results = load_results(results_dir='results', dataset='cifar100')

# Show which optimizers have at least one run per dataset.
print(f"\nCIFAR-10 optimizers: {list(cifar10_results)}")
print(f"CIFAR-100 optimizers: {list(cifar100_results)}")

## Summary Statistics

In [None]:
def compute_statistics(results):
    """Summarise best and final accuracy per optimizer.

    Args:
        results: Mapping of optimizer name to a list of run dicts, each
            providing ``'best_accuracy'`` and ``'final_accuracy'``.

    Returns:
        A DataFrame with one row per optimizer that has at least one run.
        Columns: ``'Optimizer'`` (upper-cased name), ``'Runs'``, the
        formatted ``'Best Acc (%)'`` / ``'Final Acc (%)'`` strings
        ("mean ± std", two decimals) and the numeric ``'Best (mean)'`` /
        ``'Best (std)'``. The std is NumPy's default (population) std.
    """
    def _summary_row(name, run_list):
        # One table row: formatted strings for display, raw numbers for reuse.
        best = [run['best_accuracy'] for run in run_list]
        final = [run['final_accuracy'] for run in run_list]
        best_mean, best_std = np.mean(best), np.std(best)
        return {
            'Optimizer': name.upper(),
            'Runs': len(run_list),
            'Best Acc (%)': f"{best_mean:.2f} ± {best_std:.2f}",
            'Final Acc (%)': f"{np.mean(final):.2f} ± {np.std(final):.2f}",
            'Best (mean)': best_mean,
            'Best (std)': best_std,
        }

    # Optimizers without any run are left out of the table.
    rows = [
        _summary_row(name, run_list)
        for name, run_list in results.items()
        if run_list
    ]
    return pd.DataFrame(rows)

# CIFAR-10 summary table (only when runs were loaded)
if cifar10_results:
    print("\n" + "=" * 80, "CIFAR-10 Results", "=" * 80, sep="\n")
    df_c10 = compute_statistics(cifar10_results)
    # Show the formatted columns only; the numeric mean/std columns stay in df_c10.
    print(df_c10.loc[:, ['Optimizer', 'Runs', 'Best Acc (%)', 'Final Acc (%)']].to_string(index=False))

# CIFAR-100 summary table (only when runs were loaded)
if cifar100_results:
    print("\n" + "=" * 80, "CIFAR-100 Results", "=" * 80, sep="\n")
    df_c100 = compute_statistics(cifar100_results)
    print(df_c100.loc[:, ['Optimizer', 'Runs', 'Best Acc (%)', 'Final Acc (%)']].to_string(index=False))

## Comparison Plots

In [None]:
def _plot_mean_curves(ax, results, colors, metric, ylabel, title):
    """Draw one mean ± std curve per optimizer for a per-epoch history metric."""
    for (opt_name, runs), color in zip(results.items(), colors):
        series = [r['history'][metric] for r in runs]
        max_len = max(len(s) for s in series)

        # Runs may differ in length: extend shorter ones with their last value
        # so that all runs stack into one rectangular array.
        padded = [list(s) + [s[-1]] * (max_len - len(s)) for s in series]
        stacked = np.array(padded)

        mean_curve = np.mean(stacked, axis=0)
        std_curve = np.std(stacked, axis=0)
        epochs = range(1, len(mean_curve) + 1)

        ax.plot(epochs, mean_curve, label=opt_name.upper(), linewidth=2, color=color)
        ax.fill_between(epochs, mean_curve - std_curve, mean_curve + std_curve,
                        alpha=0.2, color=color)

    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


def _plot_accuracy_bars(ax, results, colors, key, title):
    """Draw one bar per optimizer: mean of a scalar run metric with std error bars."""
    optimizers = list(results.keys())
    means = [np.mean([r[key] for r in results[opt]]) for opt in optimizers]
    stds = [np.std([r[key] for r in results[opt]]) for opt in optimizers]

    x_pos = np.arange(len(optimizers))
    ax.bar(x_pos, means, yerr=stds, capsize=5, alpha=0.7, color=colors)
    ax.set_xticks(x_pos)
    ax.set_xticklabels([opt.upper() for opt in optimizers])
    ax.set_ylabel('Test Accuracy (%)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='y')

    # Numeric label slightly above the top of each error bar.
    for i, (mean, std) in enumerate(zip(means, stds)):
        ax.text(i, mean + std + 0.5, f'{mean:.2f}±{std:.2f}', ha='center', fontsize=9)


def plot_comparison(results, dataset_name, save=True):
    """Create a 2x2 figure comparing optimizers on one dataset.

    Panels: training-loss curves, test-accuracy curves (both mean ± std over
    runs), and bar charts of final and best test accuracy.

    Args:
        results: Mapping of optimizer name to a list of run dicts. Each run
            provides ``history['train_loss']``, ``history['test_acc']``,
            ``'final_accuracy'`` and ``'best_accuracy'``.
        dataset_name: Used in the figure title and the output file name.
        save: When true, the figure is written to
            ``{dataset_name}_comparison.png`` at 300 dpi.

    Returns:
        None. Prints a message and returns early when there is nothing to plot.
    """
    # Optimizers without runs cannot be plotted (max() over no runs raises),
    # so drop them up front, as compute_statistics does.
    results = {opt: runs for opt, runs in results.items() if runs} if results else {}
    if not results:
        print(f"No results found for {dataset_name}")
        return

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    # One colour per optimizer, shared by all four panels.
    colors = sns.color_palette("husl", len(results))

    _plot_mean_curves(axes[0, 0], results, colors, 'train_loss',
                      'Training Loss', 'Training Loss')
    _plot_mean_curves(axes[0, 1], results, colors, 'test_acc',
                      'Test Accuracy (%)', 'Test Accuracy')
    _plot_accuracy_bars(axes[1, 0], results, colors, 'final_accuracy',
                        'Final Test Accuracy')
    _plot_accuracy_bars(axes[1, 1], results, colors, 'best_accuracy',
                        'Best Test Accuracy')

    fig.suptitle(f'Optimizer Comparison on {dataset_name}', fontsize=14, fontweight='bold')
    fig.tight_layout()

    if save:
        fig.savefig(f'{dataset_name}_comparison.png', dpi=300, bbox_inches='tight')
        print(f"✓ Plot saved as {dataset_name}_comparison.png")

    plt.show()

# Render (and, by default, save) the comparison figure for each dataset.
plot_comparison(results=cifar10_results, dataset_name='CIFAR-10')
plot_comparison(results=cifar100_results, dataset_name='CIFAR-100')

## Hypothesis Testing

### H1: RAdam Early Training Stability

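Test whether RAdam's training loss varies less than Adam's over the first 10 epochs. Variability is measured as the coefficient of variation (std / mean) of each run's training loss, averaged over runs; H1 is considered supported if RAdam reduces it by at least 20% relative to Adam.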

In [None]:
def test_h1_early_stability(results, n_epochs=10, min_reduction=20.0):
    """Test H1: RAdam early training stability.

    For every Adam and RAdam run, the coefficient of variation (std / mean)
    of the training loss over the first ``n_epochs`` epochs is computed and
    averaged per optimizer. H1 counts as supported when RAdam's average is at
    least ``min_reduction`` percent lower than Adam's. The verdict is printed
    and the early loss curves are plotted and saved to
    ``H1_early_stability.png``.

    Args:
        results: Mapping of optimizer name to a list of run dicts providing
            ``history['train_loss']``; needs the keys ``'adam'`` and
            ``'radam'``.
        n_epochs: Number of initial epochs to analyse.
        min_reduction: Relative CV reduction (in percent) required for H1.

    Returns:
        None. Prints a warning and returns early when either optimizer has
        no runs.
    """
    # Treat an optimizer with an empty run list like a missing one.
    if not results.get('adam') or not results.get('radam'):
        print("⚠️  Need both Adam and RAdam results for H1")
        return

    print("\n" + "=" * 70)
    print(f"H1: RAdam Early Training Stability (First {n_epochs} Epochs)")
    print("=" * 70)

    # Training loss restricted to the early-training window.
    adam_losses = [r['history']['train_loss'][:n_epochs] for r in results['adam']]
    radam_losses = [r['history']['train_loss'][:n_epochs] for r in results['radam']]

    # Coefficient of variation per run, averaged over runs.
    adam_cv = np.mean([np.std(l) / np.mean(l) for l in adam_losses])
    radam_cv = np.mean([np.std(l) / np.mean(l) for l in radam_losses])

    reduction = (adam_cv - radam_cv) / adam_cv * 100

    print(f"Adam CV:  {adam_cv:.4f}")
    print(f"RAdam CV: {radam_cv:.4f}")
    print(f"Reduction: {reduction:.1f}%")

    if reduction >= min_reduction:
        print(f"\n✓ H1 SUPPORTED: RAdam reduces variance by ≥{min_reduction:g}%")
    else:
        print(f"\n✗ H1 NOT SUPPORTED: Reduction is {reduction:.1f}% < {min_reduction:g}%")

    # Plot
    plt.figure(figsize=(10, 5))

    # The x-axis follows each run's actual length: a run shorter than
    # n_epochs would not match a fixed-length epoch range.
    for losses in adam_losses:
        plt.plot(range(1, len(losses) + 1), losses, 'o-', alpha=0.5, color='blue')
    for losses in radam_losses:
        plt.plot(range(1, len(losses) + 1), losses, 's-', alpha=0.5, color='red')

    # Empty proxy lines give one legend entry per optimizer instead of one per run.
    plt.plot([], [], 'o-', color='blue', label='Adam', linewidth=2)
    plt.plot([], [], 's-', color='red', label='RAdam', linewidth=2)
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('H1: Early Training Stability Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('H1_early_stability.png', dpi=300)
    plt.show()

# Prefer the CIFAR-100 runs; fall back to CIFAR-10 when none were loaded.
test_h1_early_stability(cifar100_results or cifar10_results)

### H2: AdamW Regularization Effect

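Test whether, under high weight decay, AdamW ends training with a smaller train–test accuracy gap than Adam. The gap is measured at the final epoch of each run and averaged over runs.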

In [None]:
def test_h2_regularization(results):
    """Test H2: AdamW regularization effect.

    Compares the final-epoch train-minus-test accuracy gap of Adam and AdamW
    (mean and std over runs) and prints whether AdamW's mean gap is smaller.

    Args:
        results: Mapping of optimizer name to a list of run dicts providing
            ``history['train_acc']`` and ``history['test_acc']``; needs the
            keys ``'adam'`` and ``'adamw'``.

    Returns:
        None. Prints a warning and returns early when either key is missing.
    """
    if not ('adam' in results and 'adamw' in results):
        print("⚠️  Need both Adam and AdamW results for H2")
        return

    rule = "=" * 70
    print("\n" + rule)
    print("H2: AdamW Regularization Effect")
    print(rule)

    # Mean and std of the last-epoch train/test gap, per optimizer.
    gap_stats = {}
    for name in ('adam', 'adamw'):
        gaps = [
            run['history']['train_acc'][-1] - run['history']['test_acc'][-1]
            for run in results[name]
        ]
        gap_stats[name] = (np.mean(gaps), np.std(gaps))

    adam_gap_mean, adam_gap_std = gap_stats['adam']
    adamw_gap_mean, adamw_gap_std = gap_stats['adamw']

    print(f"Adam gap:  {adam_gap_mean:.2f} ± {adam_gap_std:.2f}%")
    print(f"AdamW gap: {adamw_gap_mean:.2f} ± {adamw_gap_std:.2f}%")
    print(f"Reduction: {adam_gap_mean - adamw_gap_mean:.2f}%")

    if adamw_gap_mean < adam_gap_mean:
        verdict = "\n✓ H2 SUPPORTED: AdamW has smaller train-test gap"
    else:
        verdict = "\n✗ H2 NOT SUPPORTED: AdamW gap not smaller"
    print(verdict)

# H2 is evaluated on the CIFAR-10 runs.
test_h2_regularization(results=cifar10_results)

### H3: Lion Robustness to Label Noise

Test if Lion maintains better accuracy than Adam under label noise.

In [None]:
def test_h3_noise_robustness():
    """Test H3: Lion robustness to label noise.

    No results are analysed here: the function only prints the training
    commands for the Adam and Lion label-noise runs and a reminder to
    compare their final accuracies afterwards.
    """
    rule = "=" * 70
    lines = (
        "\n" + rule,
        "H3: Lion Robustness to Label Noise",
        rule,
        "\nTo test H3, run experiments with label_noise=0.2:",
        "  python src/train.py --optimizer adam --label-noise 0.2 --seed 42",
        "  python src/train.py --optimizer lion --label-noise 0.2 --seed 42",
        "\nThen load results and compare final accuracies.",
    )
    for line in lines:
        print(line)

# Print the H3 instructions; this call performs no analysis of loaded results.
test_h3_noise_robustness()

## Convergence Analysis

In [None]:
def analyze_convergence(results, target_acc=90.0):
    """Analyze convergence speed (epochs to reach target accuracy).

    For each optimizer, prints the mean ± std over runs of the first
    (1-based) epoch whose test accuracy is at least ``target_acc``.

    A run that never reaches the target is counted with its full number of
    epochs, so the printed mean is then a lower bound. Whenever this happens,
    the line states how many runs were affected, so that such a value is not
    mistaken for a run that converged in its last epoch.

    Args:
        results: Mapping of optimizer name to a list of run dicts providing
            ``history['test_acc']`` (one value per epoch).
        target_acc: Accuracy threshold, in the same units as ``test_acc``.

    Returns:
        None; the report is printed.
    """
    print("\n" + "=" * 70)
    print(f"Convergence Speed (Epochs to reach {target_acc}% accuracy)")
    print("=" * 70)

    for opt_name, runs in results.items():
        label = f"{opt_name.upper():10s}"
        if not runs:
            # Nothing to average; avoid a NaN from the mean of an empty list.
            print(f"{label}: no runs")
            continue

        epochs_to_target = []
        never_reached = 0

        for r in runs:
            test_accs = r['history']['test_acc']
            first_hit = next(
                (epoch for epoch, acc in enumerate(test_accs, start=1) if acc >= target_acc),
                None,
            )
            if first_hit is None:
                never_reached += 1
                first_hit = len(test_accs)  # Never reached: count the whole run
            epochs_to_target.append(first_hit)

        mean_epochs = np.mean(epochs_to_target)
        std_epochs = np.std(epochs_to_target)

        line = f"{label}: {mean_epochs:.1f} ± {std_epochs:.1f} epochs"
        if never_reached:
            line += (f" ({never_reached}/{len(runs)} runs never reached target;"
                     " counted as full run length)")
        print(line)

# Per-dataset accuracy targets: 90% for CIFAR-10, 70% for CIFAR-100.
if cifar10_results:
    analyze_convergence(results=cifar10_results, target_acc=90.0)

if cifar100_results:
    analyze_convergence(results=cifar100_results, target_acc=70.0)

## Export Summary Table

In [None]:
# Create LaTeX table for report
if cifar10_results:
    print("\n" + "=" * 70)
    print("LaTeX Table (CIFAR-10)")
    print("=" * 70)
    print()
    # escape=True is passed explicitly: the column headers contain '%', which
    # LaTeX treats as the start of a comment, and pandas >= 2.0 no longer
    # escapes special characters by default.
    print(df_c10[['Optimizer', 'Runs', 'Best Acc (%)', 'Final Acc (%)']].to_latex(index=False, escape=True))

# Save to CSV (all columns, including the numeric mean/std of the best accuracy)
if cifar10_results:
    df_c10.to_csv('cifar10_summary.csv', index=False)
    print("\n✓ CIFAR-10 summary saved to cifar10_summary.csv")

if cifar100_results:
    df_c100.to_csv('cifar100_summary.csv', index=False)
    print("✓ CIFAR-100 summary saved to cifar100_summary.csv")

## Summary

This notebook provides:
1. Statistical summary of all experiments
2. Comprehensive comparison plots
3. Hypothesis testing (H1 and H2 on the loaded results; for H3, instructions for the additional label-noise runs)
4. Convergence analysis
5. Export capabilities for reports

**For your report:**
- Use the comparison plots
- Include the summary statistics table
- Report hypothesis testing results
- Analyze convergence speed differences