# $\alpha$=0.1

In [4]:
import numpy as np
import torch
import torchvision.transforms as transforms               # include image preprocess tools
from torchvision.datasets import CIFAR10        # for loading images from Pytorch CIFAR
from torch.utils.data import DataLoader
import detectors
import timm
from src.saps import split_data_set, saps_scores, saps_classification, eval_aps
from src.temperature_scaling import ModelWithTemperature

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained CIFAR-10 ResNet-34, created through timm and moved to `device`.
# NOTE(review): the "resnet34_cifar10" name is presumably registered with timm
# by the `detectors` import — confirm before removing that import.
model = timm.create_model("resnet34_cifar10", pretrained=True).to(device)

# Per-channel statistics used to normalize the input images.
CIFAR10_MEAN = [0.4914, 0.4822, 0.4465]
CIFAR10_STD = [0.2023, 0.1994, 0.2010]

# Preprocessing pipeline: convert each image to a tensor, then normalize it.
data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD),
])

# CIFAR-10 split selected by train=False (downloaded on first use).
dataset = CIFAR10(root="../../data", train=False, download=True, transform=data_transform)

# Temperature scaling: wrap the classifier (initial temperature 5.0) and fit
# the temperature on a shuffled loader over `dataset`.
# NOTE(review): the recorded output of this cell reports NLL 0.326 -> 0.881 and
# ECE 0.046 -> 0.465, i.e. both metrics get worse after scaling — verify the
# fitting procedure in ModelWithTemperature and the choice of initial value.
# NOTE(review): the temperature is fit on the same `dataset` that is later
# split into calibration and test parts — confirm this is intended.
temp_scal_loader = DataLoader(dataset, batch_size=32, shuffle=True)
model = ModelWithTemperature(model, temperature=5.0)
model = model.to(device)
model.set_temperature(temp_scal_loader)

# Switch to evaluation mode before scoring.
model.eval()

# Number of independent calibration/test splits evaluated per experiment.
num_runs = 10

# Target error rate: the calibration threshold below is the (1 - alpha)
# quantile of the calibration scores.
alpha = 0.1
# Weight forwarded to the SAPS helpers (saps_scores / saps_classification).
lambda_ = 2.5

# Per-run metrics, aggregated after the loop.
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # Split the dataset into calibration and test parts; the run index is
    # passed as the random seed, so every run gets its own split.
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # One loader per split, iterated in a fixed order (no shuffling).
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Calibration threshold: the (1 - alpha) empirical quantile of the
    # calibration scores. `alpha` is used instead of a hard-coded 0.1 so the
    # threshold always follows the configured error rate.
    # NOTE(review): split-conformal guarantees are usually stated for the
    # finite-sample level ceil((n + 1) * (1 - alpha)) / n; the plain quantile
    # is kept here so the recorded results stay comparable — consider switching.
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    t_cal = np.quantile(calib_scores, 1 - alpha)
    print(f"t_cal = {t_cal}")

    # Build the SAPS prediction sets for the test split with that threshold.
    aps, aps_labels, true_labels = saps_classification(model, test_loader, t_cal, lambda_, device)

    # Average set size and coverage of the prediction sets for this run.
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # Record this run's metrics.
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Aggregate over runs: mean and population standard deviation (ddof=0).
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

Files already downloaded and verified
Before temperature - NLL: 0.326, ECE: 0.046
Optimal temperature: 4.903
After temperature - NLL: 0.881, ECE: 0.465
SAPS Classification, Start!

Running experiment 1/10...
t_cal = 0.5171721637248993
Average Prediction Set Size After APS in runs 1: 0.9842
Average Coverage Rate in runs 1: 0.9

Running experiment 2/10...
t_cal = 0.510045975446701
Average Prediction Set Size After APS in runs 2: 0.9752
Average Coverage Rate in runs 2: 0.891

Running experiment 3/10...
t_cal = 0.5145092487335206
Average Prediction Set Size After APS in runs 3: 0.986
Average Coverage Rate in runs 3: 0.9004

Running experiment 4/10...
t_cal = 0.5145278453826905
Average Prediction Set Size After APS in runs 4: 0.9852
Average Coverage Rate in runs 4: 0.8946

Running experiment 5/10...
t_cal = 0.5236619710922242
Average Prediction Set Size After APS in runs 5: 0.995
Average Coverage Rate in runs 5: 0.9118

Running experiment 6/10...
t_cal = 0.5187820732593537
Average Predictio

# $\alpha$=0.05

In [5]:
# Experiment configuration: target error rate and the weight forwarded to the
# SAPS helpers (saps_scores / saps_classification).
alpha = 0.05
lambda_ = 2.2

# Accumulators for the per-run metrics.
all_avg_set_sizes, all_avg_coverages = [], []

print("SAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # Fresh calibration/test partition for this run; the run index is passed
    # as the random seed.
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # Build both loaders the same way: fixed iteration order, batches of 32.
    calib_loader, test_loader = (
        DataLoader(subset, batch_size=32, shuffle=False)
        for subset in (calib_dataset, test_dataset)
    )

    # Threshold = (1 - alpha) quantile of the calibration scores; the second
    # value returned by saps_scores is not needed here.
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    coverage_level = 1 - alpha
    t_cal = np.quantile(calib_scores, coverage_level)
    print(f"t_cal = {t_cal}")

    # Prediction sets for the test split, built with the calibrated threshold.
    aps, aps_labels, true_labels = saps_classification(model, test_loader, t_cal, lambda_, device)

    # Average set size and coverage of those sets for this run.
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # Keep this run's metrics for the final summary.
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Summary over all runs: mean and population standard deviation (ddof=0).
final_avg_set_size, final_set_size_std = (
    np.mean(all_avg_set_sizes),
    np.std(all_avg_set_sizes, ddof=0),
)
final_avg_coverage, final_coverage_std = (
    np.mean(all_avg_coverages),
    np.std(all_avg_coverages, ddof=0),
)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

SAPS Classification, Start!

Running experiment 1/10...
t_cal = 1.1640583932399766
Average Prediction Set Size After APS in runs 1: 1.3132
Average Coverage Rate in runs 1: 0.9492

Running experiment 2/10...
t_cal = 1.004845201969148
Average Prediction Set Size After APS in runs 2: 1.2354
Average Coverage Rate in runs 2: 0.9432

Running experiment 3/10...
t_cal = 1.0786385357379928
Average Prediction Set Size After APS in runs 3: 1.2734
Average Coverage Rate in runs 3: 0.9478

Running experiment 4/10...
t_cal = 0.9620541632175451
Average Prediction Set Size After APS in runs 4: 1.2194
Average Coverage Rate in runs 4: 0.9406

Running experiment 5/10...
t_cal = 1.3850198864936838
Average Prediction Set Size After APS in runs 5: 1.4116
Average Coverage Rate in runs 5: 0.9576

Running experiment 6/10...
t_cal = 1.2233445584774039
Average Prediction Set Size After APS in runs 6: 1.34
Average Coverage Rate in runs 6: 0.9524

Running experiment 7/10...
t_cal = 1.0936897814273838
Average Predic

# Result
$\alpha$=0.1
- Final Average **Prediction Set Size: 0.99**
- Final Average **Coverage: 90.07%**  

$\alpha$=0.05
- Final Average **Prediction Set Size: 1.30**
- Final Average **Coverage: 94.93%**