## Packages


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from typing import Tuple
import sys
from pathlib import Path
from datetime import datetime
import os
import pyro

# Make the project root importable so the `Models` and `utils` packages resolve.
# When the notebook runs from a folder named 'Experiments', the root is its parent;
# otherwise the current working directory is taken to be the root.
project_root = Path.cwd().parent if Path.cwd().name == 'Experiments' else Path.cwd()
sys.path.insert(0, str(project_root))

# Create <project_root>/results/ood_parameter_comparison with `plots` and
# `statistics` sub-folders; existing directories are reused (exist_ok=True).
results_dir = project_root / "results" / "ood_parameter_comparison"
results_dir.mkdir(parents=True, exist_ok=True)
plots_dir = results_dir / "plots"
plots_dir.mkdir(exist_ok=True)
stats_dir = results_dir / "statistics"
stats_dir.mkdir(exist_ok=True)

print(f"Results will be saved to: {results_dir}")

# Import from Models folder
from Models.MC_Dropout import (
    MCDropoutRegressor,
    train_model,
    mc_dropout_predict,
    gaussian_nll,
    beta_nll,
    plot_toy_data,
    plot_uncertainties,
    normalize_x,
    normalize_x_data
)

from Models.Deep_Ensemble import (
    train_ensemble_deep,
    ensemble_predict_deep
)

from utils.device import get_device
# NOTE(review): `plot_toy_data` is also imported from Models.MC_Dropout above;
# this later import rebinds the name, so the utils.plotting version is the one in scope.
from utils.plotting import plot_toy_data, plot_uncertainties_ood
import utils.results_save as results_save_module
from utils.results_save import save_plot, save_statistics

# Import OOD helper functions
from utils.ood_experiments import (
    generate_data_with_ood,
    compute_and_save_statistics_ood
)

# Point the results_save module at this notebook's output folders by overwriting
# its module-level `plots_dir` / `stats_dir` attributes.
# Presumably save_plot / save_statistics read these attributes - TODO confirm.
results_save_module.plots_dir = plots_dir
results_save_module.stats_dir = stats_dir


Error importing in API mode: ImportError('On Windows, cffi mode "ANY" is only "ABI".')
Trying to import in ABI mode.


Results will be saved to: c:\Users\lukas\OneDrive\Desktop\Code-Masterarbeit\A-statistical-evaluation-of-uncertainty-disentanglement-methods-1\results\ood_parameter_comparison
CUDA not available. Using CPU.
CUDA not available. Using CPU.


## Device Setup


In [None]:
# Select the compute device through the project helper.
# NOTE(review): `device` is not passed to any call in the cells shown here - confirm
# whether the training helpers choose their device on their own.
device = get_device()


## Generate Toy Datasets


In [None]:
# Reproducibility: seed both the NumPy and the PyTorch global random generators
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# ----- Data generation for toy functions (linear / sin) with homo/heteroscedastic noise -----
def generate_toy_regression(n_train=1000, train_range=(0.0, 10.0), train_ranges=None,
                           ood_ranges=None, grid_points=1000, noise_type='heteroscedastic', type = "linear"):
    """
    Generate toy regression data with support for multiple training ranges and OOD regions.

    Random numbers are drawn from NumPy's global generator (``np.random``), so
    seed it beforehand for reproducible output. Invalid ``type`` / ``noise_type``
    values are rejected before any random number is drawn.

    Args:
        n_train: Number of training samples
        train_range: Single training range tuple (min, max) - for backward compatibility
        train_ranges: Sequence of training range tuples [(min1, max1), (min2, max2), ...]
                     If provided, overrides train_range. Samples are distributed
                     proportionally to the range widths; the last range receives
                     the remainder so the total is exactly n_train.
        ood_ranges: Sequence of OOD range tuples [(min1, max1), (min2, max2), ...]
                   They extend the evaluation grid and are always marked as OOD.
                   Grid points outside every training range are OOD regardless.
        grid_points: Number of grid points for evaluation
        noise_type: 'homoscedastic' (sigma = 2) or
                    'heteroscedastic' (sigma(x) = |2.5 * sin(0.5 * x + 5)|)
        type: 'linear' (f(x) = 0.7x + 0.5) or 'sin' (f(x) = x * sin(x) + x)

    Returns:
        (x_train, y_train, x_grid, y_grid_clean, ood_mask) where the first four
        are float32 arrays with a trailing dimension of size 1 and ood_mask is a
        1-D boolean array over the grid (True = out of distribution).

    Raises:
        ValueError: If type or noise_type is not one of the supported values,
                    or if train_ranges is empty.
    """
    # Validate the string options up front so a typo fails fast and does not
    # advance the global RNG state.
    if type == "linear":
        # Linear function: f(x) = 0.7x + 0.5
        def f_clean(x):
            return 0.7 * x + 0.5
    elif type == "sin":
        def f_clean(x):
            return x * np.sin(x) + x
    else:
        raise ValueError("type must be 'linear', 'sin'")

    if noise_type not in ('homoscedastic', 'heteroscedastic'):
        raise ValueError("noise_type must be 'homoscedastic' or 'heteroscedastic'")

    # Normalise the range arguments to lists so tuples (or other sequences) of
    # ranges work too; train_ranges takes precedence over train_range.
    train_ranges = [train_range] if train_ranges is None else list(train_ranges)
    ood_ranges = [] if ood_ranges is None else list(ood_ranges)
    if not train_ranges:
        raise ValueError("train_ranges must contain at least one (min, max) range")

    # Sample training inputs from each range, proportionally to its width
    total_width = sum(high - low for low, high in train_ranges)
    last_idx = len(train_ranges) - 1
    x_train_list = []
    samples_allocated = 0
    for idx, (low, high) in enumerate(train_ranges):
        if idx == last_idx:
            # Last range gets remaining samples to ensure exact total
            n_samples = n_train - samples_allocated
        else:
            n_samples = int(n_train * (high - low) / total_width)
            samples_allocated += n_samples
        x_train_list.append(np.random.uniform(low, high, size=(n_samples, 1)))

    x_train = np.vstack(x_train_list)
    # Shuffle to mix samples from different ranges
    x_train = x_train[np.random.permutation(len(x_train))]

    y_clean_train = f_clean(x_train)

    # Noise standard deviation sigma(x) at every training input
    if noise_type == 'homoscedastic':
        # Homoscedastic: sigma(x) = 2
        sigma = 2
        sigma_train = np.full_like(x_train, sigma)
    else:
        # Heteroscedastic: sigma(x) = |2.5 * sin(0.5 * x + 5)|
        sigma_train = np.abs(2.5 * np.sin(0.5*x_train +5))

    # Generate noise: eps | x ~ N(0, sigma(x)^2)
    epsilon = np.random.normal(0.0, sigma_train, size=(n_train, 1))
    y_train = y_clean_train + epsilon

    # Dense evaluation grid spanning all training and OOD regions
    all_ranges = train_ranges + ood_ranges
    grid_start = min(r[0] for r in all_ranges)
    grid_end = max(r[1] for r in all_ranges)
    x_grid = np.linspace(grid_start, grid_end, grid_points).reshape(-1, 1)
    y_grid_clean = f_clean(x_grid)

    # Start with everything OOD, then clear the mask inside each training range
    # (gaps between training ranges therefore stay OOD).
    grid_values = x_grid[:, 0]
    ood_mask = np.ones(len(x_grid), dtype=bool)
    for train_start, train_end in train_ranges:
        ood_mask[(grid_values >= train_start) & (grid_values <= train_end)] = False

    # Explicit OOD ranges are always marked as OOD, even where they overlap a
    # training range.
    for ood_start, ood_end in ood_ranges:
        ood_mask |= (grid_values >= ood_start) & (grid_values <= ood_end)

    return (x_train.astype(np.float32), y_train.astype(np.float32),
            x_grid.astype(np.float32), y_grid_clean.astype(np.float32), ood_mask)


### Set Parameters


In [None]:
# Common parameters (shared by every run in this notebook)
n_train = 1000
train_range = (-5, 10)
ood_ranges = [(30, 40)]  # List of (min, max) tuples for OOD regions
grid_points = 1000
seed = 42  # same value as in the data-generation cell; re-declared here
noise_type = 'heteroscedastic'
func_type = 'sin'  # or 'linear'
# Human-readable name used in plot titles and output file names
function_name = "Sinusoidal" if func_type == 'sin' else "Linear"

# Parameters to vary
mc_samples_values = [10, 20, 50, 100]  # MC Dropout forward passes
dropout_p_values = [0.2]  # Can add more: [0.1, 0.2, 0.3]
K_values = [3, 5, 10, 20]  # Deep Ensemble number of nets

# Fixed training parameters
# `beta` is forwarded together with loss_type='beta_nll' to both training helpers.
# `epochs` and `lr` are only passed to the MC Dropout runs; the Deep Ensemble
# call in this notebook does not receive them.
beta = 0.5
epochs = 700
lr = 1e-3
batch_size = 32

# Re-seed PyTorch's global generator
torch.manual_seed(seed)


## Helper Functions for Parameter Comparison


In [None]:
def run_single_mc_dropout_ood(generate_toy_regression_func, x_train, y_train, x_grid, y_grid_clean, ood_mask,
                              p, mc_samples, beta, epochs, lr, batch_size, seed,
                              function_name, noise_type, func_type, date, save_results=True):
    """Train one MC Dropout regressor and summarise its predictions inside and outside the training region.

    Args:
        generate_toy_regression_func: Not used inside this function.
        x_train, y_train: NumPy training arrays (converted with torch.from_numpy).
        x_grid: Evaluation inputs handed to mc_dropout_predict.
        y_grid_clean: Noise-free targets on the grid, used for the MSE values.
        ood_mask: Boolean mask over the grid; True marks OOD points.
        p: Dropout probability passed to MCDropoutRegressor.
        mc_samples: Number of stochastic forward passes (argument M of mc_dropout_predict).
        beta, epochs, lr: Forwarded to train_model (loss_type is fixed to 'beta_nll').
        batch_size: Batch size of the shuffled training DataLoader.
        seed: Seed applied to NumPy and PyTorch before training.
        function_name, noise_type, func_type, date: Labels forwarded to
            compute_and_save_statistics_ood.
        save_results: When True, statistics are saved via compute_and_save_statistics_ood.

    Returns:
        dict with the ID / OOD / combined uncertainty dictionaries (keys 'ale',
        'epi', 'tot'), the three MSE values, and the raw prediction arrays.
    """
    def _masked(values, mask):
        # Keep the selected entries; anything that is not 1-D gets flattened
        chosen = values[mask]
        return chosen if values.ndim == 1 else chosen.flatten()

    def _flattened(values):
        return values.flatten() if values.ndim > 1 else values

    def _squeezed(values):
        return values.squeeze() if values.ndim > 1 else values

    def _mse(prediction, target):
        return np.mean((prediction - target)**2)

    np.random.seed(seed)
    torch.manual_seed(seed)

    train_loader = DataLoader(
        TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train)),
        batch_size=batch_size,
        shuffle=True,
    )

    net = MCDropoutRegressor(p=p)
    train_model(net, train_loader, epochs=epochs, lr=lr, loss_type='beta_nll', beta=beta)

    # Predictive mean plus aleatoric / epistemic / total uncertainty on the grid
    mu_pred, ale_var, epi_var, tot_var = mc_dropout_predict(net, x_grid, M=mc_samples)

    in_dist_mask = ~ood_mask
    components = {'ale': ale_var, 'epi': epi_var, 'tot': tot_var}

    uncertainties_id = {name: _masked(values, in_dist_mask) for name, values in components.items()}
    uncertainties_ood = {name: _masked(values, ood_mask) for name, values in components.items()}
    uncertainties_combined = {name: _flattened(values) for name, values in components.items()}

    # Mean squared error against the clean function, per region and overall
    prediction = _squeezed(mu_pred)
    target = _squeezed(y_grid_clean)
    mse_id = _mse(prediction[in_dist_mask], target[in_dist_mask])
    mse_ood = _mse(prediction[ood_mask], target[ood_mask])
    mse_combined = _mse(prediction, target)

    if save_results:
        compute_and_save_statistics_ood(
            uncertainties_id, uncertainties_ood, uncertainties_combined,
            mse_id, mse_ood, mse_combined,
            function_name, noise_type, func_type, 'MC_Dropout',
            date=date, dropout_p=p, mc_samples=mc_samples
        )

    return {
        'uncertainties_id': uncertainties_id,
        'uncertainties_ood': uncertainties_ood,
        'uncertainties_combined': uncertainties_combined,
        'mse_id': mse_id,
        'mse_ood': mse_ood,
        'mse_combined': mse_combined,
        'mu_pred': mu_pred,
        'ale_var': ale_var,
        'epi_var': epi_var,
        'tot_var': tot_var
    }


def run_single_deep_ensemble_ood(generate_toy_regression_func, x_train, y_train, x_grid, y_grid_clean, ood_mask,
                                 K, beta, batch_size, seed,
                                 function_name, noise_type, func_type, date, save_results=True):
    """Train one Deep Ensemble and summarise its predictions inside and outside the training region.

    Args:
        generate_toy_regression_func: Not used inside this function.
        x_train, y_train: Training arrays; x_train is normalised with the
            statistics returned by normalize_x before training.
        x_grid: Evaluation inputs, normalised with the same statistics.
        y_grid_clean: Noise-free targets on the grid, used for the MSE values.
        ood_mask: Boolean mask over the grid; True marks OOD points.
        K: Number of ensemble members passed to train_ensemble_deep.
        beta: Forwarded to train_ensemble_deep (loss_type is fixed to 'beta_nll').
        batch_size: Batch size forwarded to train_ensemble_deep.
        seed: Seed applied to NumPy and PyTorch before training.
        function_name, noise_type, func_type, date: Labels forwarded to
            compute_and_save_statistics_ood.
        save_results: When True, statistics are saved via compute_and_save_statistics_ood.

    Returns:
        dict with the ID / OOD / combined uncertainty dictionaries (keys 'ale',
        'epi', 'tot'), the three MSE values, and the raw prediction arrays.
    """
    def _masked(values, mask):
        # Keep the selected entries; anything that is not 1-D gets flattened
        chosen = values[mask]
        return chosen if values.ndim == 1 else chosen.flatten()

    def _flattened(values):
        return values.flatten() if values.ndim > 1 else values

    def _squeezed(values):
        return values.squeeze() if values.ndim > 1 else values

    def _mse(prediction, target):
        return np.mean((prediction - target)**2)

    np.random.seed(seed)
    torch.manual_seed(seed)

    # Normalise training and grid inputs with the training-set statistics
    x_mean, x_std = normalize_x(x_train)
    train_inputs = normalize_x_data(x_train, x_mean, x_std)
    grid_inputs = normalize_x_data(x_grid, x_mean, x_std)

    ensemble = train_ensemble_deep(
        train_inputs, y_train,
        batch_size=batch_size, K=K,
        loss_type='beta_nll', beta=beta, parallel=True
    )

    # Predictive mean plus aleatoric / epistemic / total uncertainty on the grid
    mu_pred, ale_var, epi_var, tot_var = ensemble_predict_deep(ensemble, grid_inputs)

    in_dist_mask = ~ood_mask
    components = {'ale': ale_var, 'epi': epi_var, 'tot': tot_var}

    uncertainties_id = {name: _masked(values, in_dist_mask) for name, values in components.items()}
    uncertainties_ood = {name: _masked(values, ood_mask) for name, values in components.items()}
    uncertainties_combined = {name: _flattened(values) for name, values in components.items()}

    # Mean squared error against the clean function, per region and overall
    prediction = _squeezed(mu_pred)
    target = _squeezed(y_grid_clean)
    mse_id = _mse(prediction[in_dist_mask], target[in_dist_mask])
    mse_ood = _mse(prediction[ood_mask], target[ood_mask])
    mse_combined = _mse(prediction, target)

    if save_results:
        compute_and_save_statistics_ood(
            uncertainties_id, uncertainties_ood, uncertainties_combined,
            mse_id, mse_ood, mse_combined,
            function_name, noise_type, func_type, 'Deep_Ensemble',
            date=date, n_nets=K
        )

    return {
        'uncertainties_id': uncertainties_id,
        'uncertainties_ood': uncertainties_ood,
        'uncertainties_combined': uncertainties_combined,
        'mse_id': mse_id,
        'mse_ood': mse_ood,
        'mse_combined': mse_combined,
        'mu_pred': mu_pred,
        'ale_var': ale_var,
        'epi_var': epi_var,
        'tot_var': tot_var
    }


## Generate Data (Once)


In [None]:
# Generate data once (same for all parameter variations).
# Both global RNGs are re-seeded first so the cell gives the same data when re-run.
np.random.seed(seed)
torch.manual_seed(seed)

# generate_data_with_ood is a project helper; it receives the generator function
# defined in this notebook plus the data settings.
# Presumably it forwards them to generate_toy_regression - TODO confirm in utils.ood_experiments.
x_train, y_train, x_grid, y_grid_clean, ood_mask = generate_data_with_ood(
    generate_toy_regression, n_train, train_range, ood_ranges,
    grid_points, noise_type, func_type, seed
)

# Quick sanity report: grid extent and how many grid points fall in each region
print(f"Training range: {train_range}")
print(f"OOD ranges: {ood_ranges}")
print(f"Grid spans: [{x_grid[0, 0]:.2f}, {x_grid[-1, 0]:.2f}]")
print(f"ID points: {np.sum(~ood_mask)}, OOD points: {np.sum(ood_mask)}")
print(f"Function type: {function_name} ({func_type})")
print(f"Noise type: {noise_type}\n")


## MC Dropout - Vary mc_samples


In [None]:
# Date stamp (YYYYMMDD) attached to every result saved by this experiment batch
date = datetime.now().strftime('%Y%m%d')

# One entry per (p, mc_samples) combination, keyed like "p0.2_M20"
results_mc_dropout = {}

print(f"\n{'='*80}")
print(f"MC Dropout Parameter Comparison - Varying mc_samples")
print(f"{'='*80}\n")

# Arguments that are identical for every run of the sweep
mc_data_args = (generate_toy_regression, x_train, y_train, x_grid, y_grid_clean, ood_mask)
mc_fixed_kwargs = dict(
    beta=beta, epochs=epochs, lr=lr, batch_size=batch_size,
    seed=seed, function_name=function_name, noise_type=noise_type, func_type=func_type,
    date=date, save_results=True,
)

# NOTE(review): each call trains a new model from the same seed, data and p; only
# mc_samples differs between the runs of one p.
for p in dropout_p_values:
    for mc_samples in mc_samples_values:
        param_key = f"p{p}_M{mc_samples}"
        print(f"\n{'='*60}")
        print(f"Testing: p={p}, mc_samples={mc_samples}")
        print(f"{'='*60}")

        result = run_single_mc_dropout_ood(
            *mc_data_args, p=p, mc_samples=mc_samples, **mc_fixed_kwargs
        )
        results_mc_dropout[param_key] = result

        # Short per-run report: mean uncertainties and MSE for each region
        print(f"  ID - Avg Ale: {np.mean(result['uncertainties_id']['ale']):.6f}, "
              f"Avg Epi: {np.mean(result['uncertainties_id']['epi']):.6f}, "
              f"MSE: {result['mse_id']:.6f}")
        print(f"  OOD - Avg Ale: {np.mean(result['uncertainties_ood']['ale']):.6f}, "
              f"Avg Epi: {np.mean(result['uncertainties_ood']['epi']):.6f}, "
              f"MSE: {result['mse_ood']:.6f}")

print(f"\n{'='*80}")
print("MC Dropout experiments completed!")
print(f"{'='*80}\n")


## MC Dropout - Comparison Plots


In [None]:
# Gather the per-run summary values in the insertion order of results_mc_dropout
mc_run_results = list(results_mc_dropout.values())

# Keys look like "p0.2_M20"; the integer after "_M" is the number of MC samples
mc_samples_list = [int(run_key.split('_M')[1]) for run_key in results_mc_dropout]

avg_ale_id_list = [np.mean(run['uncertainties_id']['ale']) for run in mc_run_results]
avg_epi_id_list = [np.mean(run['uncertainties_id']['epi']) for run in mc_run_results]
avg_tot_id_list = [np.mean(run['uncertainties_id']['tot']) for run in mc_run_results]

avg_ale_ood_list = [np.mean(run['uncertainties_ood']['ale']) for run in mc_run_results]
avg_epi_ood_list = [np.mean(run['uncertainties_ood']['epi']) for run in mc_run_results]
avg_tot_ood_list = [np.mean(run['uncertainties_ood']['tot']) for run in mc_run_results]

mse_id_list = [run['mse_id'] for run in mc_run_results]
mse_ood_list = [run['mse_ood'] for run in mc_run_results]

# Reorder every column by ascending mc_samples
sorted_indices = np.argsort(mc_samples_list)
(mc_samples_list,
 avg_ale_id_list, avg_epi_id_list, avg_tot_id_list,
 avg_ale_ood_list, avg_epi_ood_list, avg_tot_ood_list,
 mse_id_list, mse_ood_list) = (
    [column[i] for i in sorted_indices]
    for column in (mc_samples_list,
                   avg_ale_id_list, avg_epi_id_list, avg_tot_id_list,
                   avg_ale_ood_list, avg_epi_ood_list, avg_tot_ood_list,
                   mse_id_list, mse_ood_list)
)

# 2x2 figure: three line panels (ID uncertainties, OOD uncertainties, MSE) and one bar panel
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Each line panel: (axis, [(values, format, label, color), ...], y-label, title, log-scaled y?)
mc_line_panels = [
    (axes[0, 0],
     [(avg_ale_id_list, 'o-', 'Aleatoric (ID)', 'green'),
      (avg_epi_id_list, 's-', 'Epistemic (ID)', 'orange'),
      (avg_tot_id_list, '^-', 'Total (ID)', 'blue')],
     'Average Uncertainty',
     f'MC Dropout: Average Uncertainties (ID) vs MC Samples\n{function_name} Function ({noise_type.capitalize()})',
     False),
    (axes[0, 1],
     [(avg_ale_ood_list, 'o-', 'Aleatoric (OOD)', 'green'),
      (avg_epi_ood_list, 's-', 'Epistemic (OOD)', 'orange'),
      (avg_tot_ood_list, '^-', 'Total (OOD)', 'blue')],
     'Average Uncertainty',
     f'MC Dropout: Average Uncertainties (OOD) vs MC Samples\n{function_name} Function ({noise_type.capitalize()})',
     False),
    (axes[1, 0],
     [(mse_id_list, 'o-', 'MSE (ID)', 'blue'),
      (mse_ood_list, 's-', 'MSE (OOD)', 'red')],
     'MSE',
     f'MC Dropout: MSE vs MC Samples\n{function_name} Function ({noise_type.capitalize()})',
     True),
]

for panel_ax, curves, y_label, panel_title, log_y in mc_line_panels:
    for curve_values, curve_fmt, curve_label, curve_color in curves:
        panel_ax.plot(mc_samples_list, curve_values, curve_fmt, label=curve_label,
                      color=curve_color, linewidth=2, markersize=8)
    panel_ax.set_xlabel('MC Samples', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.legend(fontsize=10)
    panel_ax.grid(True, alpha=0.3)
    if log_y:
        panel_ax.set_yscale('log')
    panel_ax.set_xticks(mc_samples_list)

# Bar panel: epistemic uncertainty, ID next to OOD, one group per mc_samples value
x_pos = np.arange(len(mc_samples_list))
width = 0.35
bar_ax = axes[1, 1]
bar_ax.bar(x_pos - width/2, avg_epi_id_list, width, label='Epistemic (ID)', color='orange', alpha=0.7)
bar_ax.bar(x_pos + width/2, avg_epi_ood_list, width, label='Epistemic (OOD)', color='red', alpha=0.7)
bar_ax.set_xlabel('MC Samples', fontsize=12)
bar_ax.set_ylabel('Average Epistemic Uncertainty', fontsize=12)
bar_ax.set_title(f'MC Dropout: Epistemic Uncertainty - ID vs OOD\n{function_name} Function ({noise_type.capitalize()})', fontsize=13, fontweight='bold')
bar_ax.set_xticks(x_pos)
bar_ax.set_xticklabels(mc_samples_list)
bar_ax.legend(fontsize=10)
bar_ax.grid(True, alpha=0.3, axis='y')

# NOTE(review): the title reports only the first dropout rate, while the curves
# contain every entry of results_mc_dropout.
plt.suptitle(f'MC Dropout Parameter Comparison: Varying MC Samples (p={dropout_p_values[0]})', 
             fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()

# Persist the figure, then display and release it
save_plot(fig, f"MC_Dropout_mc_samples_comparison_{function_name}_{noise_type}", 
          subfolder=f"comparisons/{noise_type}/{func_type}")
plt.show()
plt.close(fig)

# Summary table with one row per mc_samples value
comparison_df = pd.DataFrame({
    'MC_Samples': mc_samples_list,
    'Avg_Ale_ID': avg_ale_id_list,
    'Avg_Epi_ID': avg_epi_id_list,
    'Avg_Tot_ID': avg_tot_id_list,
    'Avg_Ale_OOD': avg_ale_ood_list,
    'Avg_Epi_OOD': avg_epi_ood_list,
    'Avg_Tot_OOD': avg_tot_ood_list,
    'MSE_ID': mse_id_list,
    'MSE_OOD': mse_ood_list
})

print("\nSummary Table:")
print(comparison_df.to_string(index=False))

# Store the table next to the comparison plot
save_statistics(comparison_df, f"MC_Dropout_mc_samples_comparison_{function_name}_{noise_type}",
                subfolder=f"comparisons/{noise_type}/{func_type}")


## Deep Ensemble - Vary K


In [None]:
# One entry per ensemble size, keyed like "K5"
results_deep_ensemble = {}

print(f"\n{'='*80}")
print(f"Deep Ensemble Parameter Comparison - Varying K")
print(f"{'='*80}\n")

# Arguments that are identical for every ensemble size.
# `date` is the stamp created in the MC Dropout sweep cell.
de_data_args = (generate_toy_regression, x_train, y_train, x_grid, y_grid_clean, ood_mask)
de_fixed_kwargs = dict(
    beta=beta, batch_size=batch_size, seed=seed,
    function_name=function_name, noise_type=noise_type, func_type=func_type,
    date=date, save_results=True,
)

for K in K_values:
    param_key = f"K{K}"
    print(f"\n{'='*60}")
    print(f"Testing: K={K}")
    print(f"{'='*60}")

    result = run_single_deep_ensemble_ood(*de_data_args, K=K, **de_fixed_kwargs)
    results_deep_ensemble[param_key] = result

    # Short per-run report: mean uncertainties and MSE for each region
    print(f"  ID - Avg Ale: {np.mean(result['uncertainties_id']['ale']):.6f}, "
          f"Avg Epi: {np.mean(result['uncertainties_id']['epi']):.6f}, "
          f"MSE: {result['mse_id']:.6f}")
    print(f"  OOD - Avg Ale: {np.mean(result['uncertainties_ood']['ale']):.6f}, "
          f"Avg Epi: {np.mean(result['uncertainties_ood']['epi']):.6f}, "
          f"MSE: {result['mse_ood']:.6f}")

print(f"\n{'='*80}")
print("Deep Ensemble experiments completed!")
print(f"{'='*80}\n")


## Deep Ensemble - Comparison Plots


In [None]:
# Gather the per-run summary values in the insertion order of results_deep_ensemble
de_run_results = list(results_deep_ensemble.values())

# Keys look like "K5"; everything after the leading "K" is the ensemble size
K_list = [int(run_key[1:]) for run_key in results_deep_ensemble]

avg_ale_id_list_de = [np.mean(run['uncertainties_id']['ale']) for run in de_run_results]
avg_epi_id_list_de = [np.mean(run['uncertainties_id']['epi']) for run in de_run_results]
avg_tot_id_list_de = [np.mean(run['uncertainties_id']['tot']) for run in de_run_results]

avg_ale_ood_list_de = [np.mean(run['uncertainties_ood']['ale']) for run in de_run_results]
avg_epi_ood_list_de = [np.mean(run['uncertainties_ood']['epi']) for run in de_run_results]
avg_tot_ood_list_de = [np.mean(run['uncertainties_ood']['tot']) for run in de_run_results]

mse_id_list_de = [run['mse_id'] for run in de_run_results]
mse_ood_list_de = [run['mse_ood'] for run in de_run_results]

# Reorder every column by ascending K
sorted_indices = np.argsort(K_list)
(K_list,
 avg_ale_id_list_de, avg_epi_id_list_de, avg_tot_id_list_de,
 avg_ale_ood_list_de, avg_epi_ood_list_de, avg_tot_ood_list_de,
 mse_id_list_de, mse_ood_list_de) = (
    [column[i] for i in sorted_indices]
    for column in (K_list,
                   avg_ale_id_list_de, avg_epi_id_list_de, avg_tot_id_list_de,
                   avg_ale_ood_list_de, avg_epi_ood_list_de, avg_tot_ood_list_de,
                   mse_id_list_de, mse_ood_list_de)
)

# 2x2 figure: three line panels (ID uncertainties, OOD uncertainties, MSE) and one bar panel
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Each line panel: (axis, [(values, format, label, color), ...], y-label, title, log-scaled y?)
de_line_panels = [
    (axes[0, 0],
     [(avg_ale_id_list_de, 'o-', 'Aleatoric (ID)', 'green'),
      (avg_epi_id_list_de, 's-', 'Epistemic (ID)', 'orange'),
      (avg_tot_id_list_de, '^-', 'Total (ID)', 'blue')],
     'Average Uncertainty',
     f'Deep Ensemble: Average Uncertainties (ID) vs K\n{function_name} Function ({noise_type.capitalize()})',
     False),
    (axes[0, 1],
     [(avg_ale_ood_list_de, 'o-', 'Aleatoric (OOD)', 'green'),
      (avg_epi_ood_list_de, 's-', 'Epistemic (OOD)', 'orange'),
      (avg_tot_ood_list_de, '^-', 'Total (OOD)', 'blue')],
     'Average Uncertainty',
     f'Deep Ensemble: Average Uncertainties (OOD) vs K\n{function_name} Function ({noise_type.capitalize()})',
     False),
    (axes[1, 0],
     [(mse_id_list_de, 'o-', 'MSE (ID)', 'blue'),
      (mse_ood_list_de, 's-', 'MSE (OOD)', 'red')],
     'MSE',
     f'Deep Ensemble: MSE vs K\n{function_name} Function ({noise_type.capitalize()})',
     True),
]

for panel_ax, curves, y_label, panel_title, log_y in de_line_panels:
    for curve_values, curve_fmt, curve_label, curve_color in curves:
        panel_ax.plot(K_list, curve_values, curve_fmt, label=curve_label,
                      color=curve_color, linewidth=2, markersize=8)
    panel_ax.set_xlabel('Number of Nets (K)', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.legend(fontsize=10)
    panel_ax.grid(True, alpha=0.3)
    if log_y:
        panel_ax.set_yscale('log')
    panel_ax.set_xticks(K_list)

# Bar panel: epistemic uncertainty, ID next to OOD, one group per ensemble size
x_pos = np.arange(len(K_list))
width = 0.35
bar_ax = axes[1, 1]
bar_ax.bar(x_pos - width/2, avg_epi_id_list_de, width, label='Epistemic (ID)', color='orange', alpha=0.7)
bar_ax.bar(x_pos + width/2, avg_epi_ood_list_de, width, label='Epistemic (OOD)', color='red', alpha=0.7)
bar_ax.set_xlabel('Number of Nets (K)', fontsize=12)
bar_ax.set_ylabel('Average Epistemic Uncertainty', fontsize=12)
bar_ax.set_title(f'Deep Ensemble: Epistemic Uncertainty - ID vs OOD\n{function_name} Function ({noise_type.capitalize()})', fontsize=13, fontweight='bold')
bar_ax.set_xticks(x_pos)
bar_ax.set_xticklabels(K_list)
bar_ax.legend(fontsize=10)
bar_ax.grid(True, alpha=0.3, axis='y')

plt.suptitle(f'Deep Ensemble Parameter Comparison: Varying K', 
             fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()

# Persist the figure, then display and release it
save_plot(fig, f"Deep_Ensemble_K_comparison_{function_name}_{noise_type}", 
          subfolder=f"comparisons/{noise_type}/{func_type}")
plt.show()
plt.close(fig)

# Summary table with one row per ensemble size
comparison_df_de = pd.DataFrame({
    'K': K_list,
    'Avg_Ale_ID': avg_ale_id_list_de,
    'Avg_Epi_ID': avg_epi_id_list_de,
    'Avg_Tot_ID': avg_tot_id_list_de,
    'Avg_Ale_OOD': avg_ale_ood_list_de,
    'Avg_Epi_OOD': avg_epi_ood_list_de,
    'Avg_Tot_OOD': avg_tot_ood_list_de,
    'MSE_ID': mse_id_list_de,
    'MSE_OOD': mse_ood_list_de
})

print("\nSummary Table:")
print(comparison_df_de.to_string(index=False))

# Store the table next to the comparison plot
save_statistics(comparison_df_de, f"Deep_Ensemble_K_comparison_{function_name}_{noise_type}",
                subfolder=f"comparisons/{noise_type}/{func_type}")


## Overall Comparison Summary


In [None]:
# Text summary: for each method, the setting with the lowest OOD MSE.
# Uses the sorted lists built in the two comparison-plot cells.
print(f"\n{'='*80}")
print("OVERALL COMPARISON SUMMARY")
print(f"{'='*80}\n")

best_mc_idx = np.argmin(mse_ood_list)
best_mc_samples = mc_samples_list[best_mc_idx]
print("MC Dropout - Best Parameters (lowest OOD MSE):")
print(f"  MC Samples: {best_mc_samples}")
print(f"  OOD MSE: {mse_ood_list[best_mc_idx]:.6f}")
print(f"  OOD Epistemic Uncertainty: {avg_epi_ood_list[best_mc_idx]:.6f}")

best_de_idx = np.argmin(mse_ood_list_de)
best_K = K_list[best_de_idx]
print("\nDeep Ensemble - Best Parameters (lowest OOD MSE):")
print(f"  K: {best_K}")
print(f"  OOD MSE: {mse_ood_list_de[best_de_idx]:.6f}")
print(f"  OOD Epistemic Uncertainty: {avg_epi_ood_list_de[best_de_idx]:.6f}")

# Side-by-side figure: ID vs OOD epistemic uncertainty, one panel per method
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Each panel: (axis, x values, ID curve, OOD curve, x-label, title)
epistemic_panels = (
    (axes[0], mc_samples_list, avg_epi_id_list, avg_epi_ood_list, 'MC Samples',
     f'MC Dropout: Epistemic Uncertainty\n{function_name} Function ({noise_type.capitalize()})'),
    (axes[1], K_list, avg_epi_id_list_de, avg_epi_ood_list_de, 'Number of Nets (K)',
     f'Deep Ensemble: Epistemic Uncertainty\n{function_name} Function ({noise_type.capitalize()})'),
)

for panel_ax, x_values, epi_id_values, epi_ood_values, x_label, panel_title in epistemic_panels:
    panel_ax.plot(x_values, epi_id_values, 'o-', label='Epistemic (ID)', color='blue', linewidth=2, markersize=8)
    panel_ax.plot(x_values, epi_ood_values, 's-', label='Epistemic (OOD)', color='red', linewidth=2, markersize=8)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Average Epistemic Uncertainty', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.legend(fontsize=10)
    panel_ax.grid(True, alpha=0.3)
    panel_ax.set_xticks(x_values)

plt.suptitle('Parameter Comparison: ID vs OOD Epistemic Uncertainty', 
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()

# Persist the figure, then display and release it
save_plot(fig, f"Overall_comparison_{function_name}_{noise_type}", 
          subfolder=f"comparisons/{noise_type}/{func_type}")
plt.show()
plt.close(fig)
