# Undersampling Experiments

This notebook runs experiments to evaluate uncertainty quantification methods when training data is non-uniformly distributed (spatially undersampled).

The experiments train models on data with different sampling densities across regions (e.g., well-sampled in the middle, undersampled on the sides) and evaluate uncertainties across these regions.


## Packages


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from typing import Tuple
import sys
from pathlib import Path
from datetime import datetime
import os
import pyro

# Make the project root importable so `Models` (and the other project
# packages) can be imported below. If the notebook is started from the
# Experiments folder the root is its parent; otherwise the current working
# directory is used as the root.
if Path.cwd().name == 'Experiments':
    project_root = Path.cwd().parent
else:
    project_root = Path.cwd()
sys.path.insert(0, str(project_root))

# Results tree: <project_root>/results/undersampling/{plots,statistics,outputs}
results_dir = project_root.joinpath("results", "undersampling")
results_dir.mkdir(parents=True, exist_ok=True)

plots_dir, stats_dir, outputs_dir = (
    results_dir / sub_name for sub_name in ("plots", "statistics", "outputs")
)
plots_dir.mkdir(exist_ok=True)
stats_dir.mkdir(exist_ok=True)
outputs_dir.mkdir(exist_ok=True)

print(f"Results will be saved to: {results_dir}")

# Import from Models folder
from Models.MC_Dropout import (
    MCDropoutRegressor,
    train_model,
    mc_dropout_predict,
    gaussian_nll,
    beta_nll,
    plot_toy_data,
    plot_uncertainties,
    normalize_x,
    normalize_x_data
)

from Models.Deep_Ensemble import (
    train_ensemble_deep,
    ensemble_predict_deep
)

from Models.BNN import (
    train_bnn,
    bnn_predict,
    normalize_x as bnn_normalize_x,
    normalize_x_data as bnn_normalize_x_data
)

from Models.BAMLSS import (
    fit_bamlss,
    bamlss_predict
)

from utils.device import get_device
from utils.plotting import plot_uncertainties_undersampling
import utils.results_save as results_save_module
from utils.results_save import save_plot, save_statistics, save_summary_text, save_summary_statistics
from utils.plotting import plot_data_with_ood_regions

# Import helper functions for undersampling experiments
from utils.undersampling_experiments import (
    run_mc_dropout_undersampling_experiment,
    run_deep_ensemble_undersampling_experiment,
    run_bnn_undersampling_experiment,
    run_bamlss_undersampling_experiment
)

# Redirect utils.results_save to this notebook's output folders by overriding
# its module-level directory attributes.
for _attr_name, _target_dir in (
    ("plots_dir", plots_dir),
    ("stats_dir", stats_dir),
    ("outputs_dir", outputs_dir),
):
    setattr(results_save_module, _attr_name, _target_dir)


## Device Setup


In [None]:
# Select the compute device through the project helper utils.device.get_device.
# NOTE(review): none of the visible cells pass `device` on explicitly —
# presumably the experiment helpers resolve the device themselves; confirm.
device = get_device()


## Generate Toy Datasets


In [None]:
# Fix the global random state of both PyTorch and NumPy so that repeated runs
# of the notebook draw the same random numbers.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# ----- Data generation for linear / sinusoidal functions with homo/heteroscedastic noise -----
# f(x) = 0.7x + 0.5 (type='linear') or f(x) = x·sin(x) + x (type='sin')
# noise_type: 'homoscedastic' (σ(x) = 2) or 'heteroscedastic' (σ(x) = |2.5·sin(0.5x + 5)|)
def generate_toy_regression(n_train=1000, train_range=(-5, 10.0), grid_points=1000, noise_type='heteroscedastic', type = "linear"):
    """
    Generate 1-D toy regression data with Gaussian observation noise.

    Training inputs are drawn uniformly from ``train_range`` using the global
    NumPy random state; targets are ``f(x) + eps`` with ``eps ~ N(0, sigma(x)^2)``.

    Clean functions:
        'linear': f(x) = 0.7 * x + 0.5
        'sin':    f(x) = x * sin(x) + x

    Noise standard deviations:
        'homoscedastic':   sigma(x) = 2
        'heteroscedastic': sigma(x) = |2.5 * sin(0.5 * x + 5)|

    Args:
        n_train: Number of training samples
        train_range: Training range tuple (min, max); also used for the grid
        grid_points: Number of equally spaced grid points for evaluation
        noise_type: 'homoscedastic' or 'heteroscedastic'
        type: 'linear' or 'sin'

    Returns:
        (x_train, y_train, x_grid, y_grid_clean) as float32 arrays of shape
        (n_train, 1), (n_train, 1), (grid_points, 1), (grid_points, 1).

    Raises:
        ValueError: If ``type`` or ``noise_type`` is not one of the supported
            values. Raised before any random numbers are drawn, so a failed
            call leaves the global NumPy random state untouched.
    """
    # Resolve both selectors first: an invalid argument must not consume
    # random numbers and silently shift the stream of later experiments.
    if type == "linear":
        def f_clean(x):
            return 0.7 * x + 0.5
    elif type == "sin":
        def f_clean(x):
            return x * np.sin(x) + x
    else:
        raise ValueError("type must be 'linear', 'sin'")

    if noise_type == 'homoscedastic':
        def sigma_of(x):
            # Constant standard deviation, broadcast to the shape of x.
            return np.full_like(x, 2)
    elif noise_type == 'heteroscedastic':
        def sigma_of(x):
            # abs() keeps the standard deviation non-negative.
            return np.abs(2.5 * np.sin(0.5 * x + 5))
    else:
        raise ValueError("noise_type must be 'homoscedastic' or 'heteroscedastic'")

    # Draw order (uniform inputs, then normal noise) is kept fixed so that
    # results for a given seed stay reproducible.
    x_train = np.random.uniform(train_range[0], train_range[1], size=(n_train, 1))
    y_clean_train = f_clean(x_train)
    sigma_train = sigma_of(x_train)

    # Noise: eps | x ~ N(0, sigma(x)^2); sigma_train is a standard deviation.
    epsilon = np.random.normal(0.0, sigma_train, size=(n_train, 1))
    y_train = y_clean_train + epsilon

    # Dense, noise-free evaluation grid over the same range
    x_grid = np.linspace(train_range[0], train_range[1], grid_points).reshape(-1, 1)
    y_grid_clean = f_clean(x_grid)

    return (x_train.astype(np.float32), y_train.astype(np.float32),
            x_grid.astype(np.float32), y_grid_clean.astype(np.float32))


### Set parameters


In [None]:
# --- Configuration shared by all experiment cells below ---
seed = 42
n_train = 1000
grid_points = 1500
train_range = (-5, 10)

# Sampling layout: a list of ((x_min, x_max), density_factor) pairs, where
# density_factor (0.0-1.0+) is the relative sampling density of that region.
# Example: [((-5, 0), 0.2), ((0, 5), 1.0), ((5, 10), 0.2)] undersamples the
# two outer regions (density 0.2) and samples the middle region well (1.0).
# The layout used here is the reverse: the outer regions get density 1 and
# the middle region (0, 5) gets density 0.2.
sampling_regions = [((-5, 0), 1), ((0, 5), 0.2), ((5, 10), 1)]

torch.manual_seed(seed)


## MC Dropout - Homoscedastic


In [None]:
# Model/training keywords for MC Dropout, kept apart from the shared data
# settings. NOTE(review): p is presumably the dropout probability and
# mc_samples the number of stochastic forward passes — confirm in the helper.
mc_dropout_homo_settings = dict(
    p=0.2,
    beta=0.5,
    epochs=700,
    lr=1e-3,
    batch_size=32,
    mc_samples=20,
    entropy_method='analytical',
)

# MC Dropout on homoscedastic noise, for both target functions.
run_mc_dropout_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='homoscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **mc_dropout_homo_settings,
)


In [None]:
import numpy as np
from pathlib import Path
from Models.BNN import decompose_uncertainty
from utils.entropy_uncertainty import entropy_uncertainty_analytical, entropy_uncertainty_numerical
from utils.plotting import plot_uncertainties_undersampling, plot_uncertainties_entropy_undersampling

# Purpose: reload the raw per-sample outputs of a finished run and recompute
# the variance- and entropy-based uncertainty decompositions from them.

# 1. Load the saved outputs
# The path is built from `outputs_dir` (defined in the setup cell) instead of
# an absolute, machine-specific location so the cell works on any checkout.
# Adjust the sub-folders / file name to the run you want to inspect.
filepath = (
    outputs_dir / "undersampling" / "homoscedastic" / "sin"
    / "20260103_MC_Dropout_p0.2_M20_raw_outputs.npz"
)

# NOTE(review): allow_pickle=True can execute arbitrary code while loading —
# only open .npz files produced by this project. It is kept because the
# metadata entries are presumably stored as object arrays; confirm.
# The context manager closes the underlying archive once the arrays are read.
with np.load(filepath, allow_pickle=True) as data:
    # 2. Extract arrays
    mu_samples = data['mu_samples']  # Shape: [M, N] for MC Dropout, [K, N] for Deep Ensemble, [S, N] for BNN/BAMLSS
    sigma2_samples = data['sigma2_samples']  # Same shape
    x_grid = data['x_grid']
    y_grid_clean = data['y_grid_clean']

    # Extract metadata (they're stored as arrays, so use [0] to get the value)
    model_name = str(data['model_name'][0]) if 'model_name' in data else None
    noise_type = str(data['noise_type'][0]) if 'noise_type' in data else None
    func_type = str(data['func_type'][0]) if 'func_type' in data else None

    # Optional: Extract training data if saved
    x_train = data['x_train_subset'] if 'x_train_subset' in data else None
    y_train = data['y_train_subset'] if 'y_train_subset' in data else None

# 3. Recompute variance-based uncertainties
# decompose_uncertainty is called with standard deviations, hence the sqrt of
# the stored variances.
mu_pred, ale_var, epi_var, tot_var = decompose_uncertainty(mu_samples, np.sqrt(sigma2_samples))

# 4. Recompute entropy-based uncertainties
entropy_results = entropy_uncertainty_analytical(mu_samples, sigma2_samples)
ale_entropy = entropy_results['aleatoric']
epi_entropy = entropy_results['epistemic']
tot_entropy = entropy_results['total']

# 5. Create plots (both plot helpers need the training subset)
if x_train is not None and y_train is not None:
    # The regions are not stored in the file; they must match the
    # configuration the loaded run was trained with.
    sampling_regions = [((-5, 0), 1), ((0, 5), 0.2), ((5, 10), 1)]
    # One boolean mask over the grid per region (both bounds inclusive).
    region_masks = []
    for region_tuple, _ in sampling_regions:
        mask = (x_grid[:, 0] >= region_tuple[0]) & (x_grid[:, 0] <= region_tuple[1])
        region_masks.append(mask)

    # Plot variance-based uncertainties
    plot_uncertainties_undersampling(
        x_train, y_train, x_grid, y_grid_clean,
        mu_pred, ale_var, epi_var, tot_var,
        region_masks, sampling_regions,
        title=f"{model_name} - Recomputed - Variance",
        noise_type=noise_type,
        func_type=func_type
    )

    # Plot entropy-based uncertainties
    plot_uncertainties_entropy_undersampling(
        x_train, y_train, x_grid, y_grid_clean,
        mu_pred, ale_entropy, epi_entropy, tot_entropy,
        region_masks, sampling_regions,
        title=f"{model_name} - Recomputed - Entropy",
        noise_type=noise_type,
        func_type=func_type
    )
else:
    # Say so explicitly instead of finishing without any output.
    print("Training subset not found in the saved outputs; skipping plots.")

## MC Dropout - Heteroscedastic


In [None]:
# Model/training keywords for MC Dropout, kept apart from the shared data
# settings. NOTE(review): p is presumably the dropout probability and
# mc_samples the number of stochastic forward passes — confirm in the helper.
mc_dropout_hetero_settings = dict(
    p=0.2,
    beta=0.5,
    epochs=700,
    lr=1e-3,
    batch_size=32,
    mc_samples=20,
    entropy_method='analytical',
)

# MC Dropout on heteroscedastic noise, for both target functions.
run_mc_dropout_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='heteroscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **mc_dropout_hetero_settings,
)


## Deep Ensemble - Homoscedastic


In [None]:
# Model keywords for the Deep Ensemble, kept apart from the shared data
# settings. NOTE(review): K is presumably the number of ensemble members —
# confirm in the helper.
deep_ensemble_homo_settings = dict(
    beta=0.5,
    batch_size=32,
    K=5,
    entropy_method='analytical',
)

# Deep Ensemble on homoscedastic noise, for both target functions.
run_deep_ensemble_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='homoscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **deep_ensemble_homo_settings,
)


## Deep Ensemble - Heteroscedastic


In [None]:
# Model keywords for the Deep Ensemble, kept apart from the shared data
# settings. NOTE(review): K is presumably the number of ensemble members —
# confirm in the helper.
deep_ensemble_hetero_settings = dict(
    beta=0.5,
    batch_size=32,
    K=5,
    entropy_method='analytical',
)

# Deep Ensemble on heteroscedastic noise, for both target functions.
run_deep_ensemble_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='heteroscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **deep_ensemble_hetero_settings,
)


## BNN - Homoscedastic


In [None]:
# Model/inference keywords for the BNN, kept apart from the shared data
# settings. NOTE(review): warmup/samples/chains look like MCMC sampler
# settings — confirm in the helper.
bnn_homo_settings = dict(
    hidden_width=16,
    weight_scale=1.0,
    warmup=200,
    samples=200,
    chains=1,
    entropy_method='analytical',
)

# BNN on homoscedastic noise, for both target functions.
run_bnn_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='homoscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **bnn_homo_settings,
)


## BNN - Heteroscedastic


In [None]:
# Model/inference keywords for the BNN, kept apart from the shared data
# settings. NOTE(review): warmup/samples/chains look like MCMC sampler
# settings — confirm in the helper.
bnn_hetero_settings = dict(
    hidden_width=16,
    weight_scale=1.0,
    warmup=200,
    samples=200,
    chains=1,
    entropy_method='analytical',
)

# BNN on heteroscedastic noise, for both target functions.
run_bnn_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='heteroscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **bnn_hetero_settings,
)


## BAMLSS - Homoscedastic


In [None]:
# Sampler keywords for BAMLSS, kept apart from the shared data settings.
# NOTE(review): n_iter/burnin/thin look like MCMC chain settings and nsamples
# the number of posterior draws used — confirm in the helper.
bamlss_homo_settings = dict(
    n_iter=12000,
    burnin=2000,
    thin=10,
    nsamples=1000,
    entropy_method='analytical',
)

# BAMLSS on homoscedastic noise, for both target functions.
run_bamlss_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='homoscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **bamlss_homo_settings,
)


## BAMLSS - Heteroscedastic


In [None]:
# Sampler keywords for BAMLSS, kept apart from the shared data settings.
# NOTE(review): n_iter/burnin/thin look like MCMC chain settings and nsamples
# the number of posterior draws used — confirm in the helper.
bamlss_hetero_settings = dict(
    n_iter=12000,
    burnin=2000,
    thin=10,
    nsamples=1000,
    entropy_method='analytical',
)

# BAMLSS on heteroscedastic noise, for both target functions.
run_bamlss_undersampling_experiment(
    generate_toy_regression_func=generate_toy_regression,
    function_types=['linear', 'sin'],
    noise_type='heteroscedastic',
    train_range=train_range,
    sampling_regions=sampling_regions,
    n_train=n_train,
    grid_points=grid_points,
    seed=seed,
    **bamlss_hetero_settings,
)
