# $\alpha$=0.1

In [2]:
import numpy as np
import torch
import torchvision.transforms as transforms              
from torchvision.datasets import CIFAR100      
from torch.utils.data import DataLoader
import timm
import detectors
from src.saps import split_data_set, saps_scores, saps_classification, eval_aps
from src.temperature_scaling import ModelWithTemperature

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load fine-tuned model
model = timm.create_model("resnet34_cifar100", pretrained=True)
model = model.to(device)

# reprocess the images from CIFAR100
data_transform = transforms.Compose([
    transforms.ToTensor(),          # transfer to tensor
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762))  # normalize
])
# load images from CIFAR10
dataset = CIFAR100(root="../../data", train=False, download=True, transform=data_transform)

temp_scal_loader = DataLoader(dataset, batch_size=32, shuffle=True)
model = ModelWithTemperature(model, temperature=4.85).to(device)
model.set_temperature(temp_scal_loader)

model.eval()

# error rate
alpha = 0.1
lambda_ = 0.05
num_runs = 10

# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # splite dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    t_cal = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"t_cal = {t_cal}")

    # construct APS
    aps, aps_labels, true_labels = saps_classification(model, test_loader, t_cal, lambda_, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

Files already downloaded and verified
Before temperature - NLL: 1.178, ECE: 0.098
Optimal temperature: 4.703
After temperature - NLL: 3.092, ECE: 0.671
SAPS Classification, Start!

Running experiment 1/10...
t_cal = 0.18209821879863752
Average Prediction Set Size After APS in runs 1: 2.9368
Average Coverage Rate in runs 1: 0.9052

Running experiment 2/10...
t_cal = 0.17731191962957393
Average Prediction Set Size After APS in runs 2: 2.8474
Average Coverage Rate in runs 2: 0.902

Running experiment 3/10...
t_cal = 0.1811189100146295
Average Prediction Set Size After APS in runs 3: 2.9224
Average Coverage Rate in runs 3: 0.9054

Running experiment 4/10...
t_cal = 0.1817201212048531
Average Prediction Set Size After APS in runs 4: 2.94
Average Coverage Rate in runs 4: 0.906

Running experiment 5/10...
t_cal = 0.17928484529256838
Average Prediction Set Size After APS in runs 5: 2.8952
Average Coverage Rate in runs 5: 0.906

Running experiment 6/10...
t_cal = 0.1728466868400576
Average Pred

# $\alpha$=0.2

In [13]:
# error rate
alpha = 0.2
lambda_ = 0.4

# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # splite dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    t_cal = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"t_cal = {t_cal}")

    # construct APS
    aps, aps_labels, true_labels = saps_classification(model, test_loader, t_cal, lambda_, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

SAPS Classification, Start!

Running experiment 1/10...
t_cal = 0.248226168751717
Average Prediction Set Size After APS in runs 1: 1.3962
Average Coverage Rate in runs 1: 0.8038

Running experiment 2/10...
t_cal = 0.25175541639328014
Average Prediction Set Size After APS in runs 2: 1.4148
Average Coverage Rate in runs 2: 0.8076

Running experiment 3/10...
t_cal = 0.28140872120857285
Average Prediction Set Size After APS in runs 3: 1.5024
Average Coverage Rate in runs 3: 0.817

Running experiment 4/10...
t_cal = 0.2550368785858156
Average Prediction Set Size After APS in runs 4: 1.4318
Average Coverage Rate in runs 4: 0.8044

Running experiment 5/10...
t_cal = 0.26268626451492316
Average Prediction Set Size After APS in runs 5: 1.4578
Average Coverage Rate in runs 5: 0.8112

Running experiment 6/10...
t_cal = 0.22574977874755875
Average Prediction Set Size After APS in runs 6: 1.3632
Average Coverage Rate in runs 6: 0.7954

Running experiment 7/10...
t_cal = 0.25869715809822097
Average 

# $\alpha$=0.05

In [21]:
# error rate
alpha = 0.05
lambda_ = 0.01

# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # splite dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    t_cal = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"t_cal = {t_cal}")

    # construct APS
    aps, aps_labels, true_labels = saps_classification(model, test_loader, t_cal, lambda_, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

SAPS Classification, Start!

Running experiment 1/10...
t_cal = 0.1449894234538079
Average Prediction Set Size After APS in runs 1: 7.0826
Average Coverage Rate in runs 1: 0.9524

Running experiment 2/10...
t_cal = 0.14321246966719628
Average Prediction Set Size After APS in runs 2: 6.9744
Average Coverage Rate in runs 2: 0.948

Running experiment 3/10...
t_cal = 0.1449691101908684
Average Prediction Set Size After APS in runs 3: 7.1166
Average Coverage Rate in runs 3: 0.9488

Running experiment 4/10...
t_cal = 0.14595374986529353
Average Prediction Set Size After APS in runs 4: 7.2
Average Coverage Rate in runs 4: 0.9502

Running experiment 5/10...
t_cal = 0.14182690754532815
Average Prediction Set Size After APS in runs 5: 6.8434
Average Coverage Rate in runs 5: 0.9466

Running experiment 6/10...
t_cal = 0.14436650052666666
Average Prediction Set Size After APS in runs 6: 7.0986
Average Coverage Rate in runs 6: 0.948

Running experiment 7/10...
t_cal = 0.1471379734575749
Average Pred

# Result
  
$\alpha$=0.1
- Final Average **Prediction Set Size： 2.85**
- Final Average **Coverage: 90.12%**  

$\alpha$=0.2
- Final Average **Prediction Set Size： 1.40**
- Final Average **Coverage: 80.17%**  

$\alpha$=0.05
- Final Average **Prediction Set Size： 7.05**
- Final Average **Coverage: 94.85%**