In [1]:
import sys
import os
from PIL import Image

# Add the absolute path to the root directory of the project
sys.path.append("/cs/cs_groups/cliron_group/Calibrato")
import time
import io

import numpy as np
import tensorflow as tf
import argparse
import logging
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from calibrators.geometric_calibrators import GeometricCalibrator, GeometricCalibratorTrust
from utils.logging_config import setup_logging
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import torch
from torchvision.models import DenseNet
import joblib
from models.model_factory import get_model  # Import the factory function
from tqdm import tqdm
import pandas as pd
import time
from utils.metrics import CalibrationMetrics
from utils.utils import StabilitySpace, Compression
import random
from skimage.transform import rotate
from scipy.ndimage import shift
# from calibrators.calibrators import *
from calibrators.ensemble_calibrators import *
from calibrators.non_parametric_calibrators import *
from calibrators.parametric_calibrators import *
from calibrators.specialized_calibrators import *
from calibrators.trust_score_calibration import *
from sklearn.calibration import CalibratedClassifierCV
from filelock import FileLock


2024-12-10 09:22:19.438760: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-10 09:22:19.463538: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-10 09:22:19.471153: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-10 09:22:19.490623: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Initialize logging through the project helper imported from
# utils.logging_config, then create the module-level logger that the
# functions defined below write their progress and error messages to.
setup_logging()
logger = logging.getLogger(__name__)

def transform_test_set(X_test, transform_ratios=(0.3, 0.3, 0.4), random_state=None):
    """
    Return a copy of the test set in which every image is perturbed by exactly
    one of three transformations: rotation, translation, or additive noise.

    Args:
        X_test (numpy.ndarray): Test images, indexed along axis 0. The noise
            step clips to [0, 1], so pixel values are expected in that range.
            Each image may be 2-D (H, W) or have trailing axes (e.g. H, W, C).
        transform_ratios (tuple): Ratios for (rotation, shift, noise). Only the
            first two entries are used; every sample not assigned to rotation
            or shift receives noise.
        random_state (int): Seed for reproducibility. Seeds the global NumPy
            generator AND the stdlib ``random`` generator, because both are
            used below. ``None`` leaves both generators untouched.

    Returns:
        numpy.ndarray: Transformed test set (same shape and dtype as X_test).
    """
    if random_state is not None:
        # np.random drives the shuffle and the noise; stdlib `random` drives
        # the rotation angles and shift offsets. Seeding only one of them
        # leaves the result non-reproducible.
        np.random.seed(random_state)
        random.seed(random_state)

    n_samples = len(X_test)
    n_rotate = int(n_samples * transform_ratios[0])
    n_shift = int(n_samples * transform_ratios[1])

    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    # Disjoint subsets; the remainder after rotation and shift gets noise.
    rotate_indices = indices[:n_rotate]
    shift_indices = indices[n_rotate:n_rotate + n_shift]
    noise_indices = indices[n_rotate + n_shift:]

    # Work on a copy so the caller's array is left untouched.
    transformed_X_test = X_test.copy()

    # Rotation
    for idx in rotate_indices:
        angle = random.uniform(-30, 30)  # Rotate by a random angle between -30 and 30 degrees
        transformed_X_test[idx] = rotate(transformed_X_test[idx], angle, mode='wrap')

    # Shifting
    for idx in shift_indices:
        shift_x = random.uniform(-5, 5)  # Shift up to ±5 pixels
        shift_y = random.uniform(-5, 5)
        image = transformed_X_test[idx]
        # Move only the two leading (spatial) axes; any trailing axes such as
        # channels get a zero offset. This also supports 2-D images.
        offsets = (shift_x, shift_y) + (0,) * (image.ndim - 2)
        transformed_X_test[idx] = shift(image, shift=offsets, mode='wrap')

    # Noise
    for idx in noise_indices:
        noise = np.random.random(transformed_X_test[idx].shape) * 0.4  # Add up to 40% noise
        transformed_X_test[idx] = np.clip(transformed_X_test[idx] + noise, 0, 1)

    return transformed_X_test


def build_densenet40_model(num_classes=10):
    """
    Build a torchvision ``DenseNet`` with a fixed small configuration.

    Args:
        num_classes (int): Size of the classification head.

    Returns:
        The ``DenseNet`` instance (growth rate 12, three dense blocks of six
        layers each, 24 initial features, bottleneck size 4, no dropout).
    """
    densenet_settings = {
        "growth_rate": 12,
        "block_config": (6, 6, 6),
        "num_init_features": 24,
        "bn_size": 4,
        "drop_rate": 0,
        "num_classes": num_classes,
    }
    return DenseNet(**densenet_settings)


def load_and_split_data(dataset_name, random_state):
    """
    Load a dataset, pool its original train and test parts, and re-split the
    pool into train (60%), validation (20%) and test (20%) sets.

    Args:
        dataset_name (str): Case-insensitive dataset key: 'mnist',
            'fashion_mnist', 'cifar10', 'cifar100' or 'tiny_imagenet'.
            'gtsrb' is recognised but not implemented.
        random_state (int): Random state for reproducibility. Used for the
            Tiny ImageNet subsampling (seeds the global NumPy generator) and
            for both ``train_test_split`` calls.

    Returns:
        Tuple: (X_train, X_val, X_test, y_train, y_val, y_test). Images are
        float32 arrays divided by 255; grayscale datasets get a trailing
        channel axis. Labels are squeezed to 1-D.

    Raises:
        NotImplementedError: If ``dataset_name`` is 'gtsrb' (any casing).
        ValueError: If ``dataset_name`` is not a known dataset key.
    """
    logging.info(f"Loading dataset: {dataset_name}")

    # Normalise once so every branch compares against a lower-case key.
    dataset_key = dataset_name.lower()

    # Load dataset based on the given name
    if dataset_key == "mnist":
        (train_X_original, train_y_original), (test_X_original, test_y_original) = tf.keras.datasets.mnist.load_data()
        input_shape = (28, 28, 1)

    elif dataset_key == "fashion_mnist":
        (train_X_original, train_y_original), (test_X_original, test_y_original) = tf.keras.datasets.fashion_mnist.load_data()
        input_shape = (28, 28, 1)

    elif dataset_key == "cifar10":
        (train_X_original, train_y_original), (test_X_original, test_y_original) = tf.keras.datasets.cifar10.load_data()
        input_shape = (32, 32, 3)

    elif dataset_key == "cifar100":
        (train_X_original, train_y_original), (test_X_original, test_y_original) = tf.keras.datasets.cifar100.load_data()
        input_shape = (32, 32, 3)

    elif dataset_key == "tiny_imagenet":
        logging.info("Loading Tiny ImageNet dataset")
        splits = {
            'train': 'data/train-00000-of-00001-1359597a978bc4fa.parquet',
            'valid': 'data/valid-00000-of-00001-70d52db3c749a935.parquet'
        }

        # Load training and validation data
        train_df = pd.read_parquet("hf://datasets/zh-plus/tiny-imagenet/" + splits['train'])
        valid_df = pd.read_parquet("hf://datasets/zh-plus/tiny-imagenet/" + splits['valid'])

        logging.info(f"Train DataFrame head:\n{train_df.head()}")
        logging.info(f"Validation DataFrame head:\n{valid_df.head()}")

        # Decode images from binary data
        def decode_image(row):
            binary_data = row['bytes']  # Extract binary data
            try:
                image = Image.open(io.BytesIO(binary_data))  # Decode the image
                image = image.convert("RGB")  # Ensure RGB
                image = image.resize((64, 64))  # Resize to 64x64
                return np.array(image)  # Convert to numpy array
            except Exception as e:
                logging.error(f"Failed to decode or resize image: {e}")
                return None

        def decode_split(frame):
            # Keep images and labels in lock-step: a record whose image fails
            # to decode is dropped together with its label, so the two arrays
            # cannot drift out of alignment.
            images, labels = [], []
            for image_record, label in zip(frame['image'], frame['label']):
                decoded = decode_image(image_record)
                if decoded is not None:
                    images.append(decoded)
                    labels.append(label)
            return np.stack(images), np.array(labels)

        # Decode and filter valid images (labels are filtered identically)
        train_X_original, train_y_original = decode_split(train_df)
        test_X_original, test_y_original = decode_split(valid_df)
        logging.info(f"Shape of train_X_original: {train_X_original.shape}")
        logging.info(f"Shape of test_X_original: {test_X_original.shape}")

        # Set random seed for reproducibility (note: seeds the global NumPy generator)
        np.random.seed(random_state)

        # Subsample: keep a quarter of the training images and half of the
        # validation images.
        num_samples_train = train_X_original.shape[0] // 4
        num_samples_test = test_X_original.shape[0] // 2

        # Generate random indices to sample
        random_indices_train = np.random.choice(train_X_original.shape[0], size=num_samples_train, replace=False)
        random_indices_test = np.random.choice(test_X_original.shape[0], size=num_samples_test, replace=False)

        # Sample the images and labels using the random indices
        train_X_original = train_X_original[random_indices_train]
        train_y_original = train_y_original[random_indices_train]
        test_X_original = test_X_original[random_indices_test]
        test_y_original = test_y_original[random_indices_test]

        logging.info(f"Shape of sampled train_X: {train_X_original.shape}, train_y: {train_y_original.shape}")
        logging.info(f"Shape of sampled test_X: {test_X_original.shape}, test_y: {test_y_original.shape}")

        input_shape = (64, 64, 3)  # Tiny ImageNet images are 64x64 RGB
        logging.info("Tiny ImageNet loaded successfully")

    elif dataset_key == "gtsrb":
        # Placeholder for loading GTSRB (German Traffic Sign Benchmark)
        # Replace this with actual GTSRB loading code, e.g., from a local or custom dataset loader.
        # Example:
        # (train_X_original, train_y_original), (test_X_original, test_y_original) = load_GTSRB()
        raise NotImplementedError("GTSRB dataset loading not implemented. Use an appropriate data loader.")

    else:
        raise ValueError(
            f"Dataset '{dataset_name}' not recognized. Choose from 'mnist', 'fashion_mnist', "
            "'cifar10', 'cifar100', or 'tiny_imagenet'."
        )

    logging.info("Combining and splitting data")

    # Combine train and test data for further splitting
    data = np.concatenate((train_X_original, test_X_original), axis=0)
    labels = np.concatenate((train_y_original, test_y_original), axis=0).squeeze()  # Ensure labels are 1D for compatibility
    logging.debug(f"Data shape: {data.shape}, Labels shape: {labels.shape}")

    # Split data into train, validation, and test sets:
    # 20% test, then 25% of the remaining 80% (= 20% overall) for validation.
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=random_state)
    logging.debug(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

    # Expand dimensions if necessary for grayscale images
    if input_shape[-1] == 1:  # Grayscale datasets (MNIST, Fashion MNIST)
        logging.info("Expanding dimensions for grayscale images")
        X_train = np.expand_dims(X_train, axis=-1)
        X_test = np.expand_dims(X_test, axis=-1)
        X_val = np.expand_dims(X_val, axis=-1)

    # Ensure data is cast to float32 for neural network compatibility
    X_train, X_val, X_test = X_train.astype("float32"), X_val.astype("float32"), X_test.astype("float32")
    X_train /= 255.0
    X_val /= 255.0
    X_test /= 255.0
    logging.info("Data normalization completed")

    return X_train, X_val, X_test, y_train, y_val, y_test



# Adjusted function to train or load the specified model type
def train_or_load_model(X_train, y_train, X_val, y_val, dataset_name, random_state, model_type="cnn", file_format="keras", epochs=None, epochs_dict=None):
    """
    Train or load a specified model type, with dynamic number of epochs based on dataset size.

    The model is cached at ``{dataset_name}/{random_state}/saved_models/
    {model_type}_model.{file_format}`` (relative to the current working
    directory). Loading and training both happen under a file lock so that
    concurrent runs do not train or write the same model at once.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_val: Validation data.
        y_val: Validation labels.
        dataset_name (str): Name of the dataset, used for directory structure.
        random_state (int): Random state number, used for directory structure.
        model_type (str): Type of model (e.g., "cnn"). "cnn" and
            "pretrained_resnet" are trained/saved/loaded through Keras; every
            other type is fitted on flattened inputs and persisted with joblib.
        file_format (str): File extension for the saved model, e.g. "keras" or "h5".
        epochs (int, optional): Number of epochs. Used only when
            ``epochs_dict`` has no entry for this dataset; if it is also
            ``None``, ``max(1, len(X_train) // 800)`` is used.
        epochs_dict (dict, optional): Dictionary mapping lower-case dataset
            names to specific epochs. Takes precedence over ``epochs``.

    Returns:
        model: Trained or loaded model.
    """
    model_directory = f"{dataset_name}/{random_state}/saved_models"
    model_path = os.path.join(model_directory, f"{model_type}_model.{file_format}")
    lock_path = f"{model_path}.lock"
    num_classes = len(np.unique(y_train))
    input_shape = X_train.shape[1:]

    # NOTE(review): main() also treats "densenet" as a Keras model, while this
    # function routes it through the joblib/flattened branch - confirm which
    # is intended.
    keras_model_types = ["cnn", "pretrained_resnet"]

    # Determine the number of epochs
    if epochs_dict and dataset_name.lower() in epochs_dict:
        epochs = epochs_dict[dataset_name.lower()]
    elif epochs is None:
        # Dynamically calculate epochs based on the dataset size
        epochs = max(1, len(X_train) // 800)  # At least 1 epoch for very small datasets
    epochs = int(epochs)  # Ensure epochs is an integer

    logger.info(f"Using {epochs} epochs for training.")

    # The pretrained ResNet is trained with a smaller batch size.
    if model_type == "pretrained_resnet":
        batch = 16
    else:
        batch = 32

    # The lock file lives next to the model, so the directory must exist
    # BEFORE the lock is acquired: not every filelock release creates missing
    # parent directories, and a first run would otherwise fail here.
    os.makedirs(model_directory, exist_ok=True)

    # Use a file lock to handle concurrent access
    with FileLock(lock_path):
        # Check if the model already exists
        if os.path.exists(model_path):
            logger.info(f"Loading pre-trained {model_type} model from {model_path}.")
            if model_type in keras_model_types:
                model = tf.keras.models.load_model(model_path)
            else:
                # joblib.load unpickles the file: only load model files this
                # project wrote itself.
                model = joblib.load(model_path)
        else:
            # Train the model
            logger.info(f"Training new {model_type} model.")
            model = get_model(dataset_name, model_type, input_shape=input_shape, num_classes=num_classes)

            if model_type in keras_model_types:
                model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch, verbose=2)
                model.save(model_path)
            else:
                # Non-Keras estimators are fitted on one flat feature vector per sample.
                model.fit(X_train.reshape(X_train.shape[0], -1), y_train)
                joblib.dump(model, model_path)

            logger.info(f"Model saved at {model_path}.")

    return model



def calibrate_with_geometric(model, X_train, y_train, X_val, y_val, X_test, y_test, library, metric='l2'):
    """
    Fit a GeometricCalibrator on the validation set and calibrate the test set.

    Args:
        model: The model to calibrate
        X_train, y_train: Training data handed to the calibrator
        X_val, y_val: Validation data the calibrator is fitted on
        X_test, y_test: Test data; y_test is used only for the logged accuracy
        library: Library to use for stability calculation
        metric: Distance metric to use (default: 'l2')

    Returns:
        Tuple of (calibrated probabilities for X_test, predicted classes
        obtained by argmax over those probabilities).
    """
    calibrator_options = dict(
        model=model,
        X_train=X_train,
        y_train=y_train,
        library=library,
        metric=metric,
    )
    geo_calibrator = GeometricCalibrator(**calibrator_options)
    geo_calibrator.fit(X_val, y_val)

    # Calibrated test probabilities and the classes they imply
    calibrated_probs = geo_calibrator.calibrate(X_test)
    y_test_pred = np.argmax(calibrated_probs, axis=1)

    accuracy = accuracy_score(y_test, y_test_pred)
    logger.info(f"Accuracy after calibration with {library} using {metric} metric: {accuracy}")

    return calibrated_probs, y_test_pred


def calculate_ece(probs, y_pred, y_true, n_bins=20):
    """
    Calculate Expected Calibration Error (ECE).

    Samples are grouped into ``n_bins`` equal-width confidence bins over
    [0, 1]; the ECE is the sample-weighted mean of |accuracy - mean
    confidence| over the non-empty bins.

    Args:
        probs: 2-D array of class probabilities, one row per sample.
        y_pred: Predicted class per sample.
        y_true: True class per sample.
        n_bins (int): Number of equal-width confidence bins.

    Returns:
        float: The ECE (0.0 when no bin contains any sample).
    """
    probs = np.asarray(probs)
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    confidence_of_pred_class = np.max(probs, axis=1)
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    # np.digitize assigns a confidence of exactly 1.0 to index n_bins, one past
    # the last bin. Clip so fully confident predictions land in the top bin
    # instead of being silently left out of the error.
    bin_indices = np.clip(np.digitize(confidence_of_pred_class, bin_boundaries) - 1, 0, n_bins - 1)

    total_error = 0.0
    for i in range(n_bins):
        bin_mask = bin_indices == i
        bin_confidences = confidence_of_pred_class[bin_mask]
        bin_real = y_true[bin_mask]
        bin_pred = y_pred[bin_mask]

        if len(bin_confidences) > 0:
            bin_acc = np.mean(bin_real == bin_pred)
            bin_conf = np.mean(bin_confidences)
            # Weight each bin by the fraction of all samples it holds.
            bin_weight = len(bin_confidences) / len(probs)
            total_error += bin_weight * np.abs(bin_acc - bin_conf)

    logger.info(f"Final ECE value: {total_error}")
    return total_error


def initialize_directories(base_dir, transformed, dataset_name, random_state, model_type, metric, trust_alpha):
    """
    Create the output directory tree for one experiment run.

    Args:
        base_dir (str): Root directory for this run's results.
        transformed (bool): If true, results go into a "transformed"
            sub-directory of ``base_dir``.
        dataset_name, random_state, model_type, metric, trust_alpha: Accepted
            for interface compatibility; not used when building the paths.

    Returns:
        Tuple: (base_dir, technique_dirs, all_results_dir) where ``base_dir``
        is the effective root, ``technique_dirs`` maps each calibration
        technique name to its own (already created) directory, and
        ``all_results_dir`` is the "all" directory for the combined results.
    """
    if transformed:
        base_dir = os.path.join(base_dir, "transformed")
    os.makedirs(base_dir, exist_ok=True)

    # One directory per technique. The binned and approximate geometric
    # variants need their own entries: without them, looking up their output
    # directory raises KeyError and the computed results are lost.
    technique_names = (
        "faiss_exact",
        "faiss_binned",
        "faiss_approximate",
        "knn",
        "knn_binned",
        "separation",
        "isotonic",
        "platt",
        "temperature",
        "trust_score_filtered",
        "trust_score_unfiltered",
    )
    technique_dirs = {name: os.path.join(base_dir, name) for name in technique_names}

    all_results_dir = os.path.join(base_dir, "all")
    os.makedirs(all_results_dir, exist_ok=True)

    for directory in technique_dirs.values():
        os.makedirs(directory, exist_ok=True)

    return base_dir, technique_dirs, all_results_dir


def prepare_data(dataset_name, random_state, transformed):
    """
    Load and optionally transform dataset.

    Args:
        dataset_name (str): Dataset key passed to ``load_and_split_data``.
        random_state (int): Seed for the split and, when ``transformed`` is
            true, for the test-set transformation.
        transformed (bool): If true, replace the test images with their
            transformed versions; train and validation data are left as loaded.

    Returns:
        Tuple: (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(dataset_name, random_state)
    if transformed:
        # Forward the seed so the transformed test set is the same on every
        # run with the same random_state.
        X_test = transform_test_set(X_test, random_state=random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test


def prepare_model(X_train, y_train, X_val, y_val, dataset_name, random_state, model_type, dataset_epochs):
    """
    Thin wrapper around ``train_or_load_model``.

    Forwards the data positionally and the run settings by keyword;
    ``dataset_epochs`` is passed on as the per-dataset epoch table
    (``epochs_dict``). Returns whatever ``train_or_load_model`` returns.
    """
    run_settings = {
        "dataset_name": dataset_name,
        "random_state": random_state,
        "model_type": model_type,
        "epochs_dict": dataset_epochs,
    }
    return train_or_load_model(X_train, y_train, X_val, y_val, **run_settings)

def calibrate_geometric(model, X_train, y_train, X_val, y_val, X_test, y_test, calibrator, name, technique_dirs, metric, use_binning, compression=None):
    """
    Perform geometric calibrations (FAISS, KNN, Separation) with optional binning.

    Args:
        model: The model to calibrate.
        X_train, y_train: Training data used to build the stability space.
        X_val, y_val: Validation data the calibrator is fitted on.
        X_test, y_test: Test data the calibrated metrics are computed on.
        calibrator (dict): Settings for this run; must contain "library" and
            may contain "mode" (forwarded as ``faiss_mode``).
        name (str): Technique name; selects the output directory in
            ``technique_dirs`` and labels the result row.
        technique_dirs (dict): Maps technique names to output directories.
        metric: Distance metric forwarded to the stability space and calibrator.
        use_binning (bool): Forwarded to the calibrator.
        compression: Optional compression object forwarded to the stability space.

    Returns:
        dict with the metric values and the elapsed time, or ``None`` if the
        technique has no output directory or any step raised.
    """
    # Fail fast: without an output directory the results could not be saved,
    # so do not spend time on the calibration only to fail at the last step.
    if name not in technique_dirs:
        logger.error(f"Error calibrating with {name}: no output directory registered for '{name}'")
        return None

    try:
        start_time = time.time()

        # Apply compression if provided
        stability_space = StabilitySpace(
            X_train,
            y_train,
            compression=compression,
            library=calibrator["library"],
            faiss_mode=calibrator.get("mode"),
            metric=metric
        )

        geo_calibrator = GeometricCalibrator(
            model=model,
            X_train=X_train,
            y_train=y_train,
            stability_space=stability_space,
            library=calibrator["library"],
            metric=metric,
            use_binning=use_binning,  # Enable or disable binning
        )
        geo_calibrator.fit(X_val, y_val)
        calibrated_probs = geo_calibrator.calibrate(X_test)
        y_test_pred_cal = np.argmax(calibrated_probs, axis=1)

        metrics = CalibrationMetrics(calibrated_probs, y_test_pred_cal, y_test, n_bins=20)
        metrics_dict = metrics.calculate_all_metrics()
        # Covers building the stability space, fitting, calibrating and scoring.
        calibration_time = time.time() - start_time

        display_name = name.replace('_', ' ').capitalize()
        results = {
            "Metric": f"{display_name}",
            **metrics_dict,
            "Calibration Time (s)": calibration_time,
        }

        results_csv_file = os.path.join(technique_dirs[name], "results.csv")
        pd.DataFrame([results]).to_csv(results_csv_file, index=False)
        print(f"{display_name} Metrics saved to {results_csv_file}")
        return results
    except Exception as e:
        # Best-effort: one failing technique must not abort the whole run, but
        # keep the traceback in the log so the cause can be diagnosed.
        logger.exception(f"Error calibrating with {name}: {e}")
        return None
    
    
def calibrate_parametric(features_val, y_val, features_test, y_test, calibrator, name, technique_dirs):
    """
    Fit a parametric calibrator on validation outputs and score it on the test set.

    Args:
        features_val, y_val: Validation inputs/labels the calibrator is fitted on.
        features_test, y_test: Test inputs to calibrate and labels to score against.
        calibrator: Object exposing ``fit(features, y)`` and ``calibrate(features)``.
        name (str): Technique name; selects the output directory and labels the row.
        technique_dirs (dict): Maps technique names to output directories.

    Returns:
        dict with the metric values and the elapsed time, or ``None`` if any
        step raised (the error is logged).
    """
    try:
        started_at = time.time()
        logger.info(f"Starting parametric calibration with {name}.")

        calibrator.fit(features_val, y_val)
        calibrated_probs = calibrator.calibrate(features_test)

        # A 1-D result is treated as the positive-class probability of a
        # binary problem and expanded to two columns: [1 - p, p].
        if calibrated_probs.ndim == 1:
            negative_class = 1 - calibrated_probs
            calibrated_probs = np.vstack([negative_class, calibrated_probs]).T

        y_test_pred_cal = np.argmax(calibrated_probs, axis=1)

        metrics_dict = CalibrationMetrics(
            calibrated_probs, y_test_pred_cal, y_test, n_bins=20
        ).calculate_all_metrics()
        calibration_time = time.time() - started_at

        results = {"Metric": name.replace("_", " ").capitalize()}
        results.update(metrics_dict)
        results["Calibration Time (s)"] = calibration_time

        results_csv_file = os.path.join(technique_dirs[name], "results.csv")
        pd.DataFrame([results]).to_csv(results_csv_file, index=False)
        print(f"{name.replace('_', ' ').capitalize()} Metrics saved to {results_csv_file}")
        return results
    except Exception as e:
        logger.error(f"Error calibrating with {name}: {e}")
        return None



# def calibrate_trust_score(X_train, y_train, X_test, y_test_pred, y_test, trust_alpha, technique_dirs, use_filtering=True):
#     try:
#         start_time = time.time()
#         filter_mode = "density" if use_filtering else "none"
#         logger.info(f"Starting Trust Score calibration with filtering: {filter_mode}")

#         # Ensure consistent dimensions
#         X_train_flat = X_train.reshape(X_train.shape[0], -1) if len(X_train.shape) > 2 else X_train
#         X_test_flat = X_test.reshape(X_test.shape[0], -1) if len(X_test.shape) > 2 else X_test

#         # Fit Trust Score Calibrator
#         trust_score_calibrator = TrustScoreCalibrator(k=10, alpha=trust_alpha, filtering=filter_mode)
#         trust_score_calibrator.fit(X_train_flat, y_train)
#         calibrated_probs = trust_score_calibrator.calibrate(X_test_flat, y_test_pred)

#         # Evaluate calibrated results
#         y_test_pred_cal = np.argmax(calibrated_probs, axis=1)
#         metrics = CalibrationMetrics(calibrated_probs, y_test_pred_cal, y_test, n_bins=20)
#         metrics_dict = metrics.calculate_all_metrics()

#         calibration_time = time.time() - start_time
#         results = {
#             "Metric": f"Trust Score Geometric {'Filtered' if use_filtering else 'Unfiltered'}",
#             **metrics_dict,
#             "Calibration Time (s)": calibration_time,
#         }

#         # Save results
#         filter_tag = "filtered" if use_filtering else "unfiltered"
#         results_csv_file = os.path.join(technique_dirs[f"trust_score_{filter_tag}"], "results.csv")
#         pd.DataFrame([results]).to_csv(results_csv_file, index=False)
#         print(f"Trust Score {filter_tag.capitalize()} Metrics saved to {results_csv_file}")
#         return results
#     except Exception as e:
#         logger.error(f"Error during Trust Score calibration ({filter_mode}): {e}")
#         return None

def calibrate_trust_score(model, X_train, y_train, X_val, y_val, X_test, y_test, trust_alpha, 
                         technique_dirs, use_filtering=True, use_binning=True, n_bins=50):
    """
    Calibrate with GeometricCalibratorTrust and score the result on the test set.

    Args:
        model: The model to calibrate.
        X_train, y_train: Training data handed to the calibrator.
        X_val, y_val: Validation data the calibrator is fitted on.
        X_test, y_test: Test data the calibrated metrics are computed on.
        trust_alpha: Forwarded to the calibrator as ``alpha``.
        technique_dirs (dict): Maps technique names to output directories;
            must contain "trust_score_filtered" / "trust_score_unfiltered".
        use_filtering (bool): Forwarded to the calibrator; also selects the
            output directory and the result label.
        use_binning (bool): Forwarded to the calibrator.
        n_bins (int): Forwarded to the calibrator (the evaluation metrics
            always use 20 bins).

    Returns:
        dict with the metric values and the elapsed time, or ``None`` if any
        step raised (the error is logged with its traceback).
    """
    try:
        start_time = time.time()
        logger.info(f"Starting Geometric Trust Score calibration with binning: {use_binning}")

        # Ensure consistent dimensions: anything beyond 2-D is flattened to
        # one feature vector per sample.
        X_train_flat = X_train.reshape(X_train.shape[0], -1) if len(X_train.shape) > 2 else X_train
        X_val_flat = X_val.reshape(X_val.shape[0], -1) if len(X_val.shape) > 2 else X_val
        X_test_flat = X_test.reshape(X_test.shape[0], -1) if len(X_test.shape) > 2 else X_test

        # Initialize and fit Geometric Trust Score Calibrator
        geometric_trust = GeometricCalibratorTrust(
            model=model,
            X_train=X_train_flat,
            y_train=y_train,
            k=10,
            min_dist=1e-12,
            use_binning=use_binning,
            n_bins=n_bins,
            use_filtering=use_filtering,
            alpha=trust_alpha
        )

        # Fit calibrator using validation set
        geometric_trust.fit(X_val_flat, y_val)

        # Get calibrated probabilities
        calibrated_probs = geometric_trust.calibrate(X_test_flat)

        # Evaluate calibrated results
        y_test_pred_cal = np.argmax(calibrated_probs, axis=1)
        metrics = CalibrationMetrics(calibrated_probs, y_test_pred_cal, y_test, n_bins=20)
        metrics_dict = metrics.calculate_all_metrics()

        calibration_time = time.time() - start_time

        # Label the row by filtering AND binning so the filtered and
        # unfiltered runs stay distinguishable in the combined results.
        filter_tag = "filtered" if use_filtering else "unfiltered"
        binning_tag = "binned" if use_binning else "unbinned"
        results = {
            "Metric": f"Geometric Trust Score {filter_tag.capitalize()} {binning_tag.capitalize()}",
            **metrics_dict,
            "Calibration Time (s)": calibration_time,
        }

        # Save results under the directory registered for this variant
        # ("trust_score_filtered" / "trust_score_unfiltered"), so the two
        # variants do not overwrite each other's file.
        # NOTE(review): binned and unbinned runs of the same variant would
        # still share one results.csv - confirm that is acceptable.
        results_csv_file = os.path.join(technique_dirs[f"trust_score_{filter_tag}"], "results.csv")
        pd.DataFrame([results]).to_csv(results_csv_file, index=False)
        print(f"Geometric Trust Score {filter_tag.capitalize()} {binning_tag.capitalize()} Metrics saved to {results_csv_file}")
        return results

    except Exception as e:
        # Best-effort: keep the run going, but record the traceback.
        logger.exception(f"Error during Geometric Trust Score calibration: {e}")
        return None
    


def compute_uncalibrated_metrics(features_test, y_test_pred, y_test, train_size, val_size, test_size):
    """
    Compute metrics for the uncalibrated model.

    Scores the raw model outputs with ``CalibrationMetrics`` (20 bins), prints
    the metric values, and returns them as a result row labelled
    "Uncalibrated" that also records the train/validation/test set sizes.
    """
    metrics_dict = CalibrationMetrics(
        features_test, y_test_pred, y_test, n_bins=20
    ).calculate_all_metrics()

    # Build the row in the same key order as the calibrated result rows,
    # followed by the dataset sizes.
    results = {"Metric": "Uncalibrated"}
    results.update(metrics_dict)
    results["Calibration Time (s)"] = "N/A"
    results["Train Size"] = train_size
    results["Validation Size"] = val_size
    results["Test Size"] = test_size

    print(f"Uncalibrated Metrics: {metrics_dict}")
    return results


def save_results(results, all_results_dir):
    """
    Save all results to a CSV file.

    Writes ``results`` (a list of result rows) to ``all_results.csv`` inside
    ``all_results_dir`` without an index column and prints the file path.
    """
    results_csv_file = os.path.join(all_results_dir, "all_results.csv")
    pd.DataFrame(results).to_csv(results_csv_file, index=False)
    print(f"All results saved to {results_csv_file}")

In [10]:
def main(dataset_name, random_state, model_type="cnn", metric="L2", transformed=False, trust_alpha=0.1,
         compression_types=None, compression_params=2):
    """
    Run the full calibration benchmark for one dataset / seed / model type.

    Steps: create the output directories, load (and optionally transform) the
    data, train or load the model, score the uncalibrated outputs, run every
    configured calibration technique, and write the combined results CSV.

    Args:
        dataset_name (str): Dataset key (see ``load_and_split_data``).
        random_state (int): Seed used for the split and the model cache path.
        model_type (str): "cnn", "densenet" and "pretrained_resnet" are queried
            with ``model.predict``; any other type is treated as an sklearn
            estimator (``predict_proba`` / ``predict`` on flattened inputs).
        metric (str): Distance metric for the geometric calibrators; also part
            of the output path.
        transformed (bool): Evaluate on the transformed test set.
        trust_alpha (float): Forwarded to the trust-score calibration.
        compression_types: If truthy, a ``Compression`` object is built from
            it and passed to the geometric calibrations.
        compression_params: Parameters for that ``Compression`` object.
    """
    # Initialize directories
    # NOTE(review): the results root is a hard-coded absolute path.
    base_dir, technique_dirs, all_results_dir = initialize_directories(
        f"/cs/cs_groups/cliron_group/Calibrato/{dataset_name}/{random_state}/{model_type}/{metric}",
        transformed, dataset_name, random_state, model_type, metric, trust_alpha
    )
    compression = None
    if compression_types:
        logger.info(f"Initializing Compression with types: {compression_types}, params: {compression_params}")
        compression = Compression(compression_types=compression_types, compression_params=compression_params)

    # Load data
    X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(dataset_name, random_state, transformed)

    # Train or load model
    dataset_epochs = {
        "mnist": 10,
        "fashion_mnist": 20,
        "cifar10": 35,
        "cifar100": 100,
        "sign_language": 25,
        "tiny_imagenet": 30,
    }
    model = prepare_model(X_train, y_train, X_val, y_val, dataset_name, random_state, model_type, dataset_epochs)

    # Get dataset sizes
    train_size, val_size, test_size = len(X_train), len(X_val), len(X_test)

    # Preprocess data based on the model type
    if model_type in ["cnn", "densenet", "pretrained_resnet"]:
        # For TensorFlow/Keras models, no additional preprocessing is needed
        logger.info(f"Using TensorFlow/Keras model: {model_type}")
        features_test = model.predict(X_test)
        y_test_pred = np.argmax(features_test, axis=1)
        features_val = model.predict(X_val)
    else:
        # For sklearn models, flatten the input data to 2D
        logger.info(f"Using sklearn model: {model_type}")
        X_test = X_test.reshape(X_test.shape[0], -1) if len(X_test.shape) > 2 else X_test
        X_val = X_val.reshape(X_val.shape[0], -1) if len(X_val.shape) > 2 else X_val

        features_test = model.predict_proba(X_test)
        y_test_pred = model.predict(X_test)
        # Only the validation probabilities are needed downstream; hard
        # validation predictions were never used, so they are not computed.
        features_val = model.predict_proba(X_val)

    logger.info(f"Feature extraction complete for model type: {model_type}")
    # Uncalibrated metrics
    results = []
    uncalibrated_results = compute_uncalibrated_metrics(features_test, y_test_pred, y_test, train_size, val_size, test_size)
    results.append(uncalibrated_results)

    # Define calibration methods. Dict order is the execution order and the
    # row order of the combined results file.
    calibrations = {
        "isotonic": IsotonicCalibrator(),
        "platt": PlattCalibrator(),
        "temperature": TemperatureScalingCalibrator(),
        "trust_score_unfiltered": {"method": "trust_score", "use_filtering": False},
        "faiss_exact": {"library": "faiss", "mode": "exact"},
        "faiss_binned": {"library": "faiss", "mode": "exact", "binned": True},
        "knn": {"library": "knn", "mode": None},
        "knn_binned": {"library": "knn", "mode": None, "binned": True},
    }

    # Approximate FAISS and the filtered trust score are skipped for the
    # largest datasets.
    if dataset_name.lower() not in ["cifar100", "tiny_imagenet"]:
        calibrations["faiss_approximate"] = {"library": "faiss", "mode": "approximate"}
        calibrations["trust_score_filtered"] = {"method": "trust_score", "use_filtering": True}

    # Loop through calibration methods
    for name, calibrator in calibrations.items():
        if name.startswith("trust_score"):
            # Trust Score calibrations
            outcome = calibrate_trust_score(
                model=model,
                X_train=X_train,
                y_train=y_train,
                X_val=X_val,
                y_val=y_val,
                X_test=X_test,
                y_test=y_test,
                trust_alpha=trust_alpha,
                technique_dirs=technique_dirs,
                use_filtering=calibrator["use_filtering"]
            )
        elif name in ["isotonic", "platt", "temperature"]:
            # Parametric calibrations
            outcome = calibrate_parametric(features_val, y_val, features_test, y_test, calibrator, name, technique_dirs)
        else:
            # Geometric calibrations
            use_binning = calibrator.get("binned", False)  # Check if binning is enabled
            outcome = calibrate_geometric(
                model, X_train, y_train, X_val, y_val, X_test, y_test, calibrator, name, technique_dirs, metric, use_binning, compression
            )

        # A failed technique returns None and is simply left out of the results.
        if outcome:
            results.append(outcome)

    # Save all results
    save_results(results, all_results_dir)

In [11]:
# Run the benchmark on MNIST with seed 0, the "RF" model type (handled by the
# sklearn branch of main) and the cosine distance metric.
main(dataset_name="MNIST", random_state=0, model_type="RF", metric="cosine")

2024-12-10 09:34:51,379 - INFO - Loading dataset: MNIST
2024-12-10 09:34:51,668 - INFO - Combining and splitting data
2024-12-10 09:34:51,739 - INFO - Expanding dimensions for grayscale images
2024-12-10 09:34:51,838 - INFO - Data normalization completed
2024-12-10 09:34:51,842 - INFO - Using 10 epochs for training.
2024-12-10 09:34:51,845 - INFO - Loading pre-trained RF model from MNIST/0/saved_models/RF_model.keras.
2024-12-10 09:34:52,076 - INFO - Using sklearn model: RF
2024-12-10 09:34:54,557 - INFO - Feature extraction complete for model type: RF
2024-12-10 09:34:54,558 - INFO - Starting Expected Calibration Error (ECE) calculation.
2024-12-10 09:34:54,564 - INFO - Final ECE value: 0.1657791053902575
2024-12-10 09:34:54,564 - INFO - Calculating Maximum Calibration Error.
2024-12-10 09:34:54,569 - INFO - Final MCE value: 0.43140644562248853
2024-12-10 09:34:54,569 - INFO - Calculating Brier Score.
2024-12-10 09:34:54,571 - INFO - Brier Score: 0.011209931736558482
2024-12-10 09:34:

Uncalibrated Metrics: {'ECE': 0.1657791053902575, 'MCE': 0.43140644562248853, 'Brier Score': 0.011209931736558482, 'Log Loss': 0.28945081037284176, 'ACE': 0.2223295656591421, 'Binned Likelihood': -4244.225970238496, 'CRSP': 0.07265364971641872, 'Sharpness': 0.7990066088954567, 'Entropy': 0.707269091096669, 'NLL': 0.28945081037133324}
Isotonic Metrics saved to /cs/cs_groups/cliron_group/Calibrato/MNIST/0/RF/cosine/isotonic/results.csv


2024-12-10 09:34:54,878 - INFO - Platt Calibrator fitted successfully for all classes.
2024-12-10 09:34:54,879 - INFO - Calibrating probabilities using Platt Scaling.
2024-12-10 09:34:54,885 - INFO - Starting Expected Calibration Error (ECE) calculation.
2024-12-10 09:34:54,889 - INFO - Final ECE value: 0.03702463707063028
2024-12-10 09:34:54,889 - INFO - Calculating Maximum Calibration Error.
2024-12-10 09:34:54,893 - INFO - Final MCE value: 0.2472228804939257
2024-12-10 09:34:54,893 - INFO - Calculating Brier Score.
2024-12-10 09:34:54,895 - INFO - Brier Score: 0.00556554930124345
2024-12-10 09:34:54,895 - INFO - Calculating Log Loss.
2024-12-10 09:34:54,904 - INFO - Calculating Average Calibration Error (ACE).
2024-12-10 09:34:54,907 - INFO - Final ACE value: 0.06301385033090981
2024-12-10 09:34:54,908 - INFO - Calculating Binned Likelihood.
2024-12-10 09:34:54,912 - INFO - Final Binned Likelihood: -4358.335622419387
2024-12-10 09:34:54,913 - INFO - Calculating CRSP.
2024-12-10 09:3

Platt Metrics saved to /cs/cs_groups/cliron_group/Calibrato/MNIST/0/RF/cosine/platt/results.csv


2024-12-10 09:34:55,136 - INFO - Starting Expected Calibration Error (ECE) calculation.
2024-12-10 09:34:55,142 - INFO - Final ECE value: 0.027499796641780375
2024-12-10 09:34:55,144 - INFO - Calculating Maximum Calibration Error.
2024-12-10 09:34:55,150 - INFO - Final MCE value: 0.5611157343777573
2024-12-10 09:34:55,152 - INFO - Calculating Brier Score.
2024-12-10 09:34:55,155 - INFO - Brier Score: 0.0061748994326638095
2024-12-10 09:34:55,157 - INFO - Calculating Log Loss.
2024-12-10 09:34:55,170 - INFO - Calculating Average Calibration Error (ACE).
2024-12-10 09:34:55,175 - INFO - Final ACE value: 0.14901710344154728
2024-12-10 09:34:55,179 - INFO - Calculating Binned Likelihood.
2024-12-10 09:34:55,186 - INFO - Final Binned Likelihood: -8026.865556498447
2024-12-10 09:34:55,188 - INFO - Calculating CRSP.
2024-12-10 09:34:55,191 - INFO - Final CRSP value: 0.03009619272019163
2024-12-10 09:34:55,193 - INFO - Calculating Sharpness.
2024-12-10 09:34:55,196 - INFO - Final Sharpness val

Temperature Metrics saved to /cs/cs_groups/cliron_group/Calibrato/MNIST/0/RF/cosine/temperature/results.csv


2024-12-10 09:34:56,337 - INFO - Initialized GeometricCalibrator with n_classes=None, bins=15, temperature=1.0
2024-12-10 09:34:56,340 - INFO - GeometricCalibrator: Using custom StabilitySpace provided by user.
2024-12-10 09:34:56,340 - INFO - Initialized GeometricCalibrator with model RandomForestClassifier and fitting function IsotonicRegression.
2024-12-10 09:34:56,341 - INFO - GeometricCalibrator: Fitting with validation data using balanced accuracy and rounded stability.
2024-12-10 09:34:56,341 - INFO - Sklearn model detected
2024-12-10 09:34:57,595 - INFO - Calculating stability using faiss.
2024-12-10 09:34:57,596 - INFO - Calculating stability using FAISS.
Calculating Stability (FAISS):  14%|█▎        | 1915/14000 [00:21<02:18, 87.31sample/s]


KeyboardInterrupt: 