### _Dataset Creation Sensitivity Analysis - Label Noise_

In [None]:
# Packages
import pandas as pd
import numpy as np

In [None]:
# Data
# Load the base dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('data.csv')

In [None]:
# Parameters
RANDOM_SEED = 42  # seed for the NumPy random generator that draws the label noise
NOISE_LEVELS = [0.5, 1.0, 1.5]  # standard deviations of the Gaussian noise added to the labels
MIN_BRIX = 2  # lower clipping bound applied to the noisy labels
MAX_BRIX = None  # upper clipping bound; None leaves the upper side unclipped

In [None]:
# Seeded generator so the same noise is drawn on every run
rng = np.random.default_rng(RANDOM_SEED)

# In-memory names and CSV suffixes, listed in the same order as NOISE_LEVELS
df_names = ['df_05noise', 'df_10noise', 'df_15noise']
file_suffixes = ['0_5', '1_0', '1_5']
dfs = {}

# Build one noisy, clipped copy of the dataset per noise level
for noise_sd, frame_name, csv_suffix in zip(NOISE_LEVELS, df_names, file_suffixes):
    # Zero-mean Gaussian perturbation, one draw per row
    perturbation = rng.normal(loc=0.0, scale=noise_sd, size=len(df))
    perturbed_brix = df['Brix (Position)'] + perturbation

    # Work on a copy so the original DataFrame keeps its clean labels
    noisy_df = df.copy()
    noisy_df['Brix (Position)'] = np.clip(perturbed_brix, MIN_BRIX, MAX_BRIX)
    dfs[frame_name] = noisy_df

    # Persist the noisy dataset
    # NOTE(review): this is an absolute path under /data, whereas the
    # training-set-size section writes to ../../data/ - confirm it is intended.
    noisy_df.to_csv(f"/data/data_sensitivityanalysis_noise_{csv_suffix}.csv", index=False)

# Expose each noisy DataFrame under its own variable name
df_05noise, df_10noise, df_15noise = (dfs[key] for key in df_names)

### _Dataset Creation Sensitivity Analysis - Training Set Size_

In [None]:
# Packages
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Data
# Re-load the base dataset for this section (path relative to the working directory).
df = pd.read_csv('data.csv')

In [None]:
# Parameters
RANDOM_SEED = 42  # passed as random_state to train_test_split so the subsampling is repeatable

In [None]:
# Row-level flags for the two "always keep" criteria
is_2025 = df['Scan Date Year'] == 2025
is_test_variety = df['Variety'] == 'TestVariety'

# Subsampling pool: rows that are neither from 2025 nor of the test variety
df_filtered = df[~is_2025 & ~is_test_variety]

# Always-include rows (2025 and/or TestVariety)
df_always = df[is_2025 | is_test_variety]

# Fractions of the pool to keep, and the matching output-file suffixes
fractions = [0.8, 0.6]
output_suffixes = ['_80', '_60']

# Each fraction is drawn independently from the full pool (same seed), stratified
# by variety; the always-include rows are then placed in front of the sample.
for keep_fraction, file_suffix in zip(fractions, output_suffixes):
    split_parts = train_test_split(
        df_filtered,
        train_size=keep_fraction,
        stratify=df_filtered['Variety'],
        random_state=RANDOM_SEED
    )
    df_sampled = split_parts[0]  # the second part (the discarded rows) is not used

    df_subset = pd.concat([df_always, df_sampled], ignore_index=True)

    output_path = f'../../data/data_sensitivityanalysis_data_size{file_suffix}.csv'
    df_subset.to_csv(output_path, index=False)


### _Model Comparison - MCS Procedure_

In [None]:
# Packages
import os
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import resample
from arch.bootstrap import MCS

In [None]:
# Results paths

# Path in which all csv predictions and observations are stored
base_path = "../run/run_results"

# Display name -> file-name stem shared by both result sets
_model_file_stems = {
    "PLS": "pls",
    "irPLS": "irpls",
    "lwPLS": "lwpls",
    "CNN": "cnn",
    "BayesConv-CNN": "bayesconvcnn",
    "BayesFC-CNN": "bayesfccnn",
    "SpecTransformer": "spectransformer",
}

# File names for OOC results
model_files_variety = {
    display_name: f"{stem}_variety.csv"
    for display_name, stem in _model_file_stems.items()
}

# File names for OOS results
model_files_season = {
    display_name: f"{stem}_season.csv"
    for display_name, stem in _model_file_stems.items()
}

In [None]:
# Functions
def load_predictions(file_dict, base_path):
    """
    Load prediction files and ensure consistent ground truth values.

    Column names are matched case-insensitively and ignoring surrounding
    whitespace. Each file must provide a 'predicted' column together with
    either a 'true' or an 'observed' column ('true' wins if both exist).

    Parameters:
        file_dict : dict
            Mapping of model names to CSV filenames.
        base_path : str
            Directory containing the prediction CSVs.

    Returns:
        y_true_ref : np.ndarray or None
            Ground truth values (shared across all models), taken from the
            first file. None when file_dict is empty.
        preds_dict : dict
            Dictionary mapping model names to their predicted values.

    Raises:
        ValueError
            If a file lacks the required columns, or if its ground truth
            differs in length or in value from that of the first file.
    """
    preds_dict = {}
    y_true_ref = None  # Ground truth of the first file, used as the reference

    for label, filename in file_dict.items():
        path = os.path.join(base_path, filename)
        df = pd.read_csv(path)

        # Standardize column names
        df.columns = [col.strip().lower() for col in df.columns]

        # Locate the ground-truth column under either accepted name
        true_col = next((c for c in ("true", "observed") if c in df.columns), None)
        if true_col is None or "predicted" not in df.columns:
            raise ValueError(f"{filename} must contain 'True' or 'Observed' with 'Predicted'.")

        y_true = df[true_col].values
        y_pred = df["predicted"].values

        # Validate that all models share the same ground truth
        if y_true_ref is None:
            y_true_ref = y_true
        elif y_true.shape != y_true_ref.shape:
            # np.allclose broadcasts, so a shorter-but-broadcastable column
            # (e.g. a single row) would otherwise pass the value check below.
            raise ValueError(
                f"{filename} has mismatched true values "
                f"(expected {y_true_ref.shape[0]} rows, found {y_true.shape[0]})."
            )
        elif not np.allclose(y_true, y_true_ref):
            raise ValueError(f"{filename} has mismatched true values.")

        # Store model predictions
        preds_dict[label] = y_pred

    return y_true_ref, preds_dict

def bootstrap_r2_matrix(y_true, preds_dict, n_bootstrap=5000, random_state=None):
    """
    Compute a bootstrap distribution of R² scores for each model.

    Every bootstrap iteration draws one set of row indices (with
    replacement) and scores all models on that same resample, so the
    per-row differences between model columns are paired.

    Parameters:
        y_true : array-like
            Ground truth values.
        preds_dict : dict
            Dictionary of model predictions (array-like, same length as y_true).
        n_bootstrap : int
            Number of bootstrap resamples.
        random_state : None, int or np.random.RandomState
            Seed or generator for the resampling. None (default) keeps the
            previous behaviour of drawing from NumPy's global random state.

    Returns:
        pd.DataFrame : DataFrame with model names as columns and R² scores as rows.

    Raises:
        ValueError
            If a model's predictions do not have the same length as y_true.
    """
    model_names = list(preds_dict.keys())
    y_true = np.asarray(y_true)

    # Coerce once so list-like predictions support integer-array indexing
    preds = {name: np.asarray(preds_dict[name]) for name in model_names}
    for name, values in preds.items():
        if values.shape[0] != y_true.shape[0]:
            raise ValueError(
                f"Predictions for '{name}' have {values.shape[0]} rows, "
                f"expected {y_true.shape[0]}."
            )

    # Build the generator once: handing the same integer seed to resample()
    # on every iteration would reproduce the identical resample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rs = random_state
    else:
        rs = np.random.RandomState(random_state)

    positions = np.arange(len(y_true))
    r2_matrix = {name: [] for name in model_names}

    for _ in range(n_bootstrap):
        idx = resample(positions, replace=True, random_state=rs)
        y_boot = y_true[idx]
        for name in model_names:
            r2_matrix[name].append(r2_score(y_boot, preds[name][idx]))

    return pd.DataFrame(r2_matrix)

def get_r2_mcs(y_true, preds_dict, alpha=0.05, n_bootstrap=5000):
    """
    Select the models whose bootstrapped R² is not distinguishable from the best one.

    The model with the highest mean bootstrap R² is taken as the reference.
    Every other model is kept when the percentile interval (level 1 - alpha)
    of the bootstrap difference "reference R² minus model R²" contains zero.
    This is a pairwise comparison against the reference model; it does not
    call the `MCS` class used for the per-sample loss metrics.

    Parameters:
        y_true : np.ndarray
            Ground truth values.
        preds_dict : dict
            Dictionary of model predictions.
        alpha : float
            Significance level.
        n_bootstrap : int
            Number of bootstrap resamples.

    Returns:
        tuple : (models in MCS, best model, R² bootstrap DataFrame)
    """
    r2_df = bootstrap_r2_matrix(y_true, preds_dict, n_bootstrap=n_bootstrap)

    # Reference model: highest average R² over the bootstrap resamples
    best_model = r2_df.mean().idxmax()
    interval_percentiles = [100 * alpha / 2, 100 * (1 - alpha / 2)]

    def _is_retained(candidate):
        # The reference model is always part of the set
        if candidate == best_model:
            return True
        # Percentile interval of the R² gap (best - candidate)
        gap = r2_df[best_model] - r2_df[candidate]
        ci_low, ci_high = np.percentile(gap, interval_percentiles)
        return ci_low <= 0 <= ci_high

    mcs_models = [name for name in r2_df.columns if _is_retained(name)]

    return mcs_models, best_model, r2_df

def get_loss_matrix(y_true, preds_dict, metric):
    """
    Build the per-sample loss table for the requested metric.

    Parameters:
        y_true : np.ndarray
            Ground truth values.
        preds_dict : dict
            Dictionary of model predictions.
        metric : str
            Evaluation metric: 'rmsep', 'practical_accuracy', or 'r2'.
            'rmsep' yields squared errors; 'practical_accuracy' yields 0 for a
            prediction within 20% of the true value's magnitude and 1 otherwise.

    Returns:
        pd.DataFrame : One column per model, one row per sample
        (None if metric is 'r2', which has no per-sample loss).

    Raises:
        ValueError
            If metric is not one of the three supported names.
    """
    # R² is handled separately using bootstrap R²
    if metric == "r2":
        return None

    if metric not in ("rmsep", "practical_accuracy"):
        raise ValueError("Metric must be one of: 'r2', 'rmsep', 'practical_accuracy'")

    truth = np.array(y_true)

    if metric == "rmsep":
        # Squared error per sample
        per_sample = {
            model: (truth - predicted) ** 2
            for model, predicted in preds_dict.items()
        }
    else:
        # Hit/miss within the ±20% band, flipped so that a miss is the loss
        tolerance = 0.2 * np.abs(truth)
        per_sample = {
            model: 1 - (np.abs(truth - predicted) <= tolerance).astype(int)
            for model, predicted in preds_dict.items()
        }

    return pd.DataFrame(per_sample)

def run_mcs(y_true, preds_dict, metric, alpha=0.05, reps=5000):
    """
    Return the models retained by the confidence-set procedure for one metric.

    'r2' is routed to the bootstrap-based `get_r2_mcs`; every other metric is
    turned into a per-sample loss matrix and passed to the `MCS` class.

    Parameters:
        y_true : np.ndarray
            Ground truth values.
        preds_dict : dict
            Dictionary of model predictions.
        metric : str
            Evaluation metric: 'rmsep', 'practical_accuracy', or 'r2'.
        alpha : float
            Significance level (forwarded as `size` to `MCS`).
        reps : int
            Number of bootstrap repetitions.

    Returns:
        list : Model names included in the MCS.
    """
    # R² has no per-sample loss, so it uses the dedicated bootstrap routine
    if metric == "r2":
        included_models, _best_model, _r2_samples = get_r2_mcs(
            y_true, preds_dict, alpha=alpha, n_bootstrap=reps
        )
        return included_models

    # Sample-level losses feed the external MCS test
    per_sample_losses = get_loss_matrix(y_true, preds_dict, metric)
    procedure = MCS(per_sample_losses.values, size=alpha, reps=reps)
    procedure.compute()

    # `included` is used to index the loss-matrix columns back to model names
    retained = procedure.included
    return per_sample_losses.columns[retained].tolist()


In [None]:
# Run
split_files = {"Variety": model_files_variety, "Season": model_files_season}
metrics_to_run = ("rmsep", "r2", "practical_accuracy")
results = {}

# Process both dataset splits: 'Variety' and 'Season'
for label, model_files in split_files.items():

    # Ground truth and per-model predictions for this split
    y_true, preds_dict = load_predictions(model_files, base_path)
    split_results = results.setdefault(label, {})

    # One confidence set per metric
    for metric in metrics_to_run:
        print(f"\nRunning MCS for {label} - {metric.upper()}")

        mcs_models = run_mcs(y_true, preds_dict, metric, alpha=0.05, reps=5000)
        split_results[metric] = mcs_models

        print(f"Models in 95% MCS ({label}, {metric}): {mcs_models}")

# Print summary
print("\nSummary of MCS Results:")
for dataset, per_metric in results.items():
    for metric, retained_models in per_metric.items():
        print(f"{dataset} - {metric.upper()}: {retained_models}")


### _Uncertainty Estimation - 100 MC Forward Passes_

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

# === Load model predictions and uncertainties ===
base_path = "../run/run_results"


def _read_result(file_name):
    """Read one results CSV located under ``base_path``."""
    return pd.read_csv(os.path.join(base_path, file_name))


# Load BayesConv-CNN and BayesFC-CNN results
df_bayesconv_season = _read_result("bayesconvcnn_season.csv")
df_bayesconv_variety = _read_result("bayesconvcnn_variety.csv")
df_bayesfc_season = _read_result("bayesfccnn_season.csv")
df_bayesfc_variety = _read_result("bayesfccnn_variety.csv")

# Models whose plotting key is also their file-name stem
_plain_models = ("pls", "irpls", "lwpls", "cnn")

# Load all model predictions for variety split (Bayesian frames are reused, not re-read)
model_dfs_variety = {
    **{key: _read_result(f"{key}_variety.csv") for key in _plain_models},
    "bayesconv": df_bayesconv_variety,
    "bayesfc": df_bayesfc_variety,
    "spectransformer": _read_result("spectransformer_variety.csv"),
}

# Load all model predictions for season split (Bayesian frames are reused, not re-read)
model_dfs_season = {
    **{key: _read_result(f"{key}_season.csv") for key in _plain_models},
    "bayesconv": df_bayesconv_season,
    "bayesfc": df_bayesfc_season,
    "spectransformer": _read_result("spectransformer_season.csv"),
}

# Human-readable labels for plotting
label_map = dict(
    pls="PLS",
    irpls="irPLS",
    lwpls="lwPLS",
    cnn="CNN",
    bayesconv="BayesConv-CNN",
    bayesfc="BayesFC-CNN",
    spectransformer="SpecTransformer",
)

# List of models to include in plots (same order as label_map)
models_to_plot = list(label_map)

# === Spearman correlation test between uncertainty and performance metric ===
def test_spearman_correlation_between_uncertainty_and_metric(
    metric_results,
    fractions,
    metric_name
):
    """
    Print, per model, the Spearman correlation between `fractions` and a metric curve.

    Parameters:
        metric_results : dict
            Mapping of model name to a sequence of metric values, one per fraction.
        fractions : array-like
            Fractions at which the metric values were computed.
        metric_name : str
            Metric name shown (upper-cased) in the printed header.

    Returns:
        None. One line per model is printed with the correlation, its p-value,
        the trend direction and whether p < 0.05.

    NOTE(review): callers in this file pass metrics computed on nested
    (cumulative) subsets, so consecutive points are not independent; the
    p-values are probably best read as descriptive - confirm with the authors.
    """
    print(f"\n=== Spearman Correlation: Uncertainty vs. {metric_name.upper()} ===")
    for model, values in metric_results.items():
        r, p = spearmanr(fractions, values)

        if np.isnan(r):
            # A NaN correlation (e.g. a constant metric curve) has no direction
            # and must not be reported as "decreasing".
            direction = "undefined"
            significance = "not significant"
        else:
            if r > 0:
                direction = "increasing"
            elif r < 0:
                direction = "decreasing"
            else:
                direction = "flat"
            significance = "significant" if p < 0.05 else "not significant"

        print(f"{model:<15} | Spearman r = {r:>6.3f} | p = {p:>7.4f} | {direction}, {significance}")

# === Evaluate model performance over uncertain subsets and plot ===
def evaluate_and_plot_all_models_by_uncertainty(
    df_uncertainty,
    model_dfs,
    model_names,
    model_label_map,
    test_type,
    backbone_model_name
):
    """
    Evaluate every model on the most uncertain samples and plot the resulting curves.

    Uncertainty is the per-sample standard deviation over all 'mc_pass_*'
    columns of `df_uncertainty`. Samples are ranked from most to least
    uncertain, and RMSEP and practical accuracy (prediction within 20% of the
    true value's magnitude) are computed for each model on the top 1%, 2%,
    ..., 100% of that ranking. Both curves are plotted and passed to the
    Spearman correlation report.

    Parameters:
        df_uncertainty : pd.DataFrame
            Frame holding the 'mc_pass_*' columns (matched case-insensitively).
            It is not modified.
        model_dfs : dict
            Mapping of model key to a frame with a 'predicted' column and a
            'true' or 'observed' column, row-aligned with `df_uncertainty`.
        model_names : list
            Model keys to evaluate, in plotting order.
        model_label_map : dict
            Mapping of model key to legend label.
        test_type : str
            Split name, used in the printed progress messages only.
        backbone_model_name : str
            Name of the model that produced the uncertainties (messages only).

    Raises:
        ValueError
            If there are no 'mc_pass_*' columns, no samples, or a model frame
            whose row count differs from `df_uncertainty`.
    """
    # Normalise column names on a copy so the caller's frame is left untouched
    unc = df_uncertainty.copy()
    unc.columns = [c.lower().strip() for c in unc.columns]
    mc_cols = [c for c in unc.columns if c.startswith("mc_pass_")]
    if not mc_cols:
        raise ValueError("df_uncertainty contains no 'mc_pass_*' columns.")

    mc_preds = unc[mc_cols].values
    n_samples = mc_preds.shape[0]
    if n_samples == 0:
        raise ValueError("df_uncertainty contains no samples.")

    mc_std = np.std(mc_preds, axis=1)  # Uncertainty per sample
    sorted_indices = np.argsort(-mc_std)  # Sort by decreasing uncertainty

    # Extract each model's truth/prediction once, already ordered by uncertainty,
    # instead of re-copying the DataFrame for every fraction.
    ranked_truth = {}
    ranked_pred = {}
    for model in model_names:
        df = model_dfs[model].copy()
        df.columns = [c.lower().strip() for c in df.columns]
        y = df["true"].values if "true" in df.columns else df["observed"].values
        y_pred = df["predicted"].values
        if len(y) != n_samples:
            raise ValueError(
                f"Model '{model}' has {len(y)} rows, expected {n_samples}."
            )
        ranked_truth[model] = y[sorted_indices]
        ranked_pred[model] = y_pred[sorted_indices]

    # Initialize evaluation results for each model
    fractions = np.linspace(0.01, 1.0, 100)
    rmsep_results = {model: [] for model in model_names}
    acc_results = {model: [] for model in model_names}

    # Loop over increasing fractions of most uncertain samples
    for frac in fractions:
        # At least one sample, so small datasets never yield an empty subset
        k = max(1, int(n_samples * frac))

        for model in model_names:
            y_sub = ranked_truth[model][:k]
            p_sub = ranked_pred[model][:k]

            rmsep = np.sqrt(mean_squared_error(y_sub, p_sub))
            # abs() on the tolerance matches the practical-accuracy loss used
            # in get_loss_matrix; identical for non-negative targets.
            acc = np.mean(np.abs(p_sub - y_sub) <= 0.2 * np.abs(y_sub))

            rmsep_results[model].append(rmsep)
            acc_results[model].append(acc)

    # === Plot RMSEP over uncertainty fractions ===
    print(f"\n--> Plotting RMSEP for {backbone_model_name} ({test_type})")
    _plot_uncertainty_curves(fractions, rmsep_results, model_names, model_label_map, "RMSEP")

    # Correlation between uncertainty and RMSEP
    test_spearman_correlation_between_uncertainty_and_metric(
        metric_results=rmsep_results,
        fractions=fractions,
        metric_name="rmsep"
    )

    # === Plot Practical Accuracy over uncertainty fractions ===
    print(f"\n--> Plotting Practical Accuracy for {backbone_model_name} ({test_type})")
    _plot_uncertainty_curves(fractions, acc_results, model_names, model_label_map, "Practical Accuracy")

    # Correlation between uncertainty and accuracy
    test_spearman_correlation_between_uncertainty_and_metric(
        metric_results=acc_results,
        fractions=fractions,
        metric_name="accuracy"
    )


def _plot_uncertainty_curves(fractions, curves, model_names, model_label_map, ylabel):
    """Draw one line per model of a metric against the share of most uncertain samples."""
    plt.figure(figsize=(5.5, 3))
    for model in model_names:
        plt.plot(fractions * 100, curves[model], label=model_label_map[model])
    plt.xlabel("Top % Most Uncertain Samples", fontsize=9)
    plt.ylabel(ylabel, fontsize=9)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    plt.legend(fontsize=8, frameon=False)
    plt.tight_layout()
    plt.show()

# === Run evaluation for variety split, then for season split ===
# Both runs rank samples by the BayesFC-CNN uncertainties.
for split_name, split_uncertainty, split_model_dfs in (
    ("variety", df_bayesfc_variety, model_dfs_variety),
    ("season", df_bayesfc_season, model_dfs_season),
):
    evaluate_and_plot_all_models_by_uncertainty(
        df_uncertainty=split_uncertainty,
        model_dfs=split_model_dfs,
        model_names=models_to_plot,
        model_label_map=label_map,
        test_type=split_name,
        backbone_model_name="BayesFC-CNN"
    )


### _Uncertainty Estimation - 10 MC Forward Passes_

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

# === Load model predictions and BayesFC-CNN uncertainties ===
base_path = "../run/run_results"

# Load prediction files with Monte Carlo dropout results
df_bayesfc_variety = pd.read_csv(os.path.join(base_path, "bayesfccnn_variety.csv"))
df_bayesfc_season = pd.read_csv(os.path.join(base_path, "bayesfccnn_season.csv"))


def _load_split(split, df_bayesfc):
    """Read every model's CSV for one split, reusing the already-loaded BayesFC-CNN frame."""
    # (model key, file-name stem); None marks the frame that is passed in
    key_and_stem = (
        ("pls", "pls"),
        ("irpls", "irpls"),
        ("lwpls", "lwpls"),
        ("cnn", "cnn"),
        ("bayesconv", "bayesconvcnn"),
        ("bayesfc", None),
        ("spectransformer", "spectransformer"),
    )
    return {
        key: df_bayesfc if stem is None
        else pd.read_csv(os.path.join(base_path, f"{stem}_{split}.csv"))
        for key, stem in key_and_stem
    }


# Load all model predictions (variety split)
model_dfs_variety = _load_split("variety", df_bayesfc_variety)

# Load all model predictions (season split)
model_dfs_season = _load_split("season", df_bayesfc_season)

# Human-readable model labels
label_map = dict(
    pls="PLS",
    irpls="irPLS",
    lwpls="lwPLS",
    cnn="CNN",
    bayesconv="BayesConv-CNN",
    bayesfc="BayesFC-CNN",
    spectransformer="SpecTransformer",
)

# Models to include in analysis (same order as label_map)
models_to_plot = list(label_map)

# === Bootstrapped Evaluation Function ===
def run_bootstrapped_uncertainty_eval(
    df_uncertainty,
    model_dfs,
    model_names,
    metric: str,
    n_mc_passes: int = 10,
    n_bootstrap: int = 5000,
    random_state=None
):
    """
    Bootstrap the metric-vs-uncertainty curves using random subsets of MC passes.

    In each repetition, `n_mc_passes` of the 'mc_pass_*' columns are drawn
    without replacement, the per-sample standard deviation over them is used
    to rank samples from most to least uncertain, and the metric is computed
    for every model on the top 1%, 2%, ..., 100% of that ranking.

    Parameters:
        df_uncertainty : pd.DataFrame
            Frame holding the 'mc_pass_*' columns (matched case-insensitively).
            It is not modified.
        model_dfs : dict
            Mapping of model key to a frame with a 'predicted' column and a
            'true' or 'observed' column, row-aligned with `df_uncertainty`.
        model_names : list
            Model keys to evaluate.
        metric : str
            'rmsep' or 'accuracy' (prediction within 20% of the true value's
            magnitude).
        n_mc_passes : int
            Number of MC passes drawn per repetition.
        n_bootstrap : int
            Number of repetitions.
        random_state : None, int or np.random.Generator
            Seed or generator for drawing the MC passes. None (default) keeps
            the previous behaviour of using NumPy's global random state.

    Returns:
        tuple : (dict of model key -> array of shape (n_bootstrap, 100),
                 array of the 100 evaluated fractions)

    Raises:
        ValueError
            If `metric` is unknown, there are no samples, more MC passes are
            requested than available, or a model frame has a different row
            count than `df_uncertainty`.
    """
    # Validate up front rather than deep inside the bootstrap loop
    if metric not in ("rmsep", "accuracy"):
        raise ValueError("Metric must be 'rmsep' or 'accuracy'.")

    # Normalise column names on a copy so the caller's frame is left untouched
    unc = df_uncertainty.copy()
    unc.columns = [c.lower().strip() for c in unc.columns]
    mc_cols_all = [c for c in unc.columns if c.startswith("mc_pass_")]
    mc_preds_all = unc[mc_cols_all].values

    n_samples, n_available = mc_preds_all.shape
    if n_samples == 0:
        raise ValueError("df_uncertainty contains no samples.")
    if n_mc_passes > n_available:
        raise ValueError(
            f"Requested {n_mc_passes} MC passes but only {n_available} "
            "'mc_pass_*' columns are available."
        )

    fractions = np.linspace(0.01, 1.0, 100)
    # Subset size per fraction; at least one sample so small datasets never
    # produce an empty subset.
    subset_sizes = np.maximum(1, (n_samples * fractions).astype(int))

    # Per-sample contribution to the metric, computed once per model. The mean
    # over any subset is then a cumulative sum over the ranked samples, which
    # avoids re-copying DataFrames for every repetition and fraction.
    per_sample = {}
    for model in model_names:
        df = model_dfs[model].copy()
        df.columns = [c.lower().strip() for c in df.columns]
        y = df["true"].values if "true" in df.columns else df["observed"].values
        y_pred = df["predicted"].values
        if len(y) != n_samples:
            raise ValueError(
                f"Model '{model}' has {len(y)} rows, expected {n_samples}."
            )
        if metric == "rmsep":
            per_sample[model] = ((y - y_pred) ** 2).astype(float)
        else:
            # abs() on the tolerance matches the practical-accuracy loss used
            # in get_loss_matrix; identical for non-negative targets.
            per_sample[model] = (np.abs(y_pred - y) <= 0.2 * np.abs(y)).astype(float)

    rng = np.random if random_state is None else np.random.default_rng(random_state)
    metric_curves = {model: np.zeros((n_bootstrap, len(fractions))) for model in model_names}

    # Perform bootstrapping
    for b in range(n_bootstrap):
        # Randomly sample MC passes
        sampled_cols = rng.choice(n_available, size=n_mc_passes, replace=False)

        # Compute uncertainty and rank samples from most to least uncertain
        uncertainty_std = np.std(mc_preds_all[:, sampled_cols], axis=1)
        sorted_indices = np.argsort(-uncertainty_std)

        for model in model_names:
            running_sum = np.cumsum(per_sample[model][sorted_indices])
            subset_mean = running_sum[subset_sizes - 1] / subset_sizes
            metric_curves[model][b] = np.sqrt(subset_mean) if metric == "rmsep" else subset_mean

    return metric_curves, fractions

# === Spearman correlation evaluation ===
def compute_bootstrap_spearman_correlations(
    metric_curves: dict,
    fractions: np.ndarray,
    metric_name: str
):
    """
    Print a per-model summary of Spearman correlations over bootstrap runs.

    For every model, the Spearman correlation between `fractions` and each
    row of its curve matrix is computed. The printed line reports the mean
    and standard deviation of the correlations and the share of runs whose
    p-value is below 0.05, 0.01 and 0.001.

    Parameters:
        metric_curves : dict
            Mapping of model key to a 2-D array with one row per bootstrap run.
        fractions : np.ndarray
            Fractions at which each row was evaluated.
        metric_name : str
            Metric name shown (upper-cased) in the printed header.

    Returns:
        None. The summary is printed.
    """
    print(f"\n=== Bootstrap Spearman Correlation: Uncertainty vs. {metric_name.upper()} ===")

    for model, runs in metric_curves.items():
        # One (correlation, p-value) pair per bootstrap run
        per_run = [spearmanr(fractions, curve) for curve in runs]
        correlations = np.array([corr for corr, _ in per_run])
        p_values = np.array([p_val for _, p_val in per_run])

        # Share of runs below each significance threshold
        share_05 = (p_values < 0.05).mean()
        share_01 = (p_values < 0.01).mean()
        share_001 = (p_values < 0.001).mean()

        print(f"{model:<15} | r = {correlations.mean():>6.3f} ± {correlations.std():>5.3f} | p < .05: {share_05:.1%} | p < .01: {share_01:.1%} | p < .001: {share_001:.1%}")

# === Plotting function ===
def plot_bootstrap_metric(
    metric_curves,
    fractions,
    title,
    ylabel
):
    """
    Plot the bootstrap-averaged metric curve of every model.

    The set of models and their legend labels come from the module-level
    `models_to_plot` and `label_map`, not from the keys of `metric_curves`.

    Parameters:
        metric_curves : dict
            Mapping of model key to a 2-D array with one row per bootstrap run.
        fractions : np.ndarray
            Fractions at which the curves were evaluated (shown as percentages).
        title : str
            Printed in the progress message; it also decides whether the
            legend is drawn (only when it contains both "RMSEP" and "Variety").
        ylabel : str
            Label of the y axis.
    """
    print(f"--> Plotting: {title}")

    percent_axis = fractions * 100
    plt.figure(figsize=(5.5, 3))
    for model_key in models_to_plot:
        # Average over bootstrap runs (rows) to get one curve per model
        averaged_curve = metric_curves[model_key].mean(axis=0)
        plt.plot(percent_axis, averaged_curve, label=label_map[model_key])

    plt.xlabel("Top % Most Uncertain Samples", fontsize=9)
    plt.ylabel(ylabel, fontsize=9)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)

    # Only show legend on one key plot
    show_legend = "RMSEP" in title and "Variety" in title
    if show_legend:
        plt.legend(fontsize=8, frameon=False)

    plt.tight_layout()
    plt.show()

# === Evaluation settings ===
# Passed explicitly to every run and reused in the plot titles, so the titles
# always state what was actually computed. The values equal the function
# defaults that the runs relied on before (10 MC passes, 5000 bootstraps);
# the titles previously claimed 1000 bootstraps.
N_MC_PASSES = 10
N_BOOTSTRAP = 5000

# === Run evaluations for variety ===
rmsep_variety, fractions = run_bootstrapped_uncertainty_eval(
    df_uncertainty=df_bayesfc_variety,
    model_dfs=model_dfs_variety,
    model_names=models_to_plot,
    metric="rmsep",
    n_mc_passes=N_MC_PASSES,
    n_bootstrap=N_BOOTSTRAP
)

acc_variety, _ = run_bootstrapped_uncertainty_eval(
    df_uncertainty=df_bayesfc_variety,
    model_dfs=model_dfs_variety,
    model_names=models_to_plot,
    metric="accuracy",
    n_mc_passes=N_MC_PASSES,
    n_bootstrap=N_BOOTSTRAP
)

# === Run evaluations for season ===
rmsep_season, _ = run_bootstrapped_uncertainty_eval(
    df_uncertainty=df_bayesfc_season,
    model_dfs=model_dfs_season,
    model_names=models_to_plot,
    metric="rmsep",
    n_mc_passes=N_MC_PASSES,
    n_bootstrap=N_BOOTSTRAP
)

acc_season, _ = run_bootstrapped_uncertainty_eval(
    df_uncertainty=df_bayesfc_season,
    model_dfs=model_dfs_season,
    model_names=models_to_plot,
    metric="accuracy",
    n_mc_passes=N_MC_PASSES,
    n_bootstrap=N_BOOTSTRAP
)

# === Plot all metrics ===
# The title is printed and also controls the legend (drawn only for the
# RMSEP / Variety plot), so "RMSEP" and "Variety" must stay in the text.
run_settings = f"{N_MC_PASSES} MC, {N_BOOTSTRAP} Bootstraps"
plot_bootstrap_metric(rmsep_variety, fractions, f"RMSEP vs. Uncertainty (Variety, {run_settings})", "RMSEP")
plot_bootstrap_metric(acc_variety,  fractions, f"Practical Accuracy vs. Uncertainty (Variety, {run_settings})", "Practical Accuracy")
plot_bootstrap_metric(rmsep_season, fractions, f"RMSEP vs. Uncertainty (Season, {run_settings})", "RMSEP")
plot_bootstrap_metric(acc_season,  fractions, f"Practical Accuracy vs. Uncertainty (Season, {run_settings})", "Practical Accuracy")

# === Print summary of uncertainty-performance correlation ===
compute_bootstrap_spearman_correlations(rmsep_variety, fractions, "rmsep")
compute_bootstrap_spearman_correlations(acc_variety,  fractions, "accuracy")
compute_bootstrap_spearman_correlations(rmsep_season, fractions, "rmsep")
compute_bootstrap_spearman_correlations(acc_season,  fractions, "accuracy")
