# $\alpha$=0.1

In [19]:
import torch
import torchvision.transforms as transforms              
from torchvision.datasets import CIFAR100      
from torch.utils.data import DataLoader
from src.inception_cifar100 import inceptionv3
import numpy as np
from src.saps import split_data_set, saps_scores, saps_classification, eval_aps


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): machine-specific absolute path -- point this at your local copy
# of the trained checkpoint before running on another machine.
dict_path = "C:\\Users\\jiayang\\ipynb\\trainedModel\\Inception_CIFAR100.pth"
model = inceptionv3()
model.load_state_dict(torch.load(dict_path, map_location=device, weights_only=True))
model.to(device)

# preprocess the images from CIFAR100: convert to tensors, then normalize per channel
# (the mean/std constants are presumably the CIFAR-100 channel statistics -- TODO confirm)
data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762))
])

# load the CIFAR100 split selected by train=False (downloaded on first use)
dataset = CIFAR100(root="../../data", train=False, download=True, transform=data_transform)

# switch the model to evaluation mode before scoring
model.eval()

# The number of times the experiment is going to be repeated
num_runs = 10

# error rate
alpha = 0.1
# Hyperparameter passed to the SAPS helpers
lambda_ = 0.13

# construct and evaluate repeatedly; one metric entry is appended per run
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # split dataset; the run index doubles as the random seed so every run
    # uses a different but repeatable calibration/test partition
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat as the (1 - alpha) quantile of the calibration scores.
    # The level is derived from `alpha` (it used to be hard-coded as 1 - 0.1),
    # so editing `alpha` above can no longer leave the threshold stale.
    # NOTE(review): split conformal prediction usually takes the
    # ceil((n + 1) * (1 - alpha)) / n empirical quantile to obtain its
    # finite-sample coverage guarantee; the plain quantile is kept here so the
    # recorded results stay comparable -- confirm whether the correction is wanted.
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)
    print(f"q_hat = {q_hat}")

    # construct the prediction sets on the test split using q_hat
    aps, aps_labels, true_labels = saps_classification(model, test_loader, q_hat, lambda_, device)

    # evaluate the prediction sets: average set size and coverage
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result (mean and population std, ddof=0, over runs)
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

Files already downloaded and verified
SAPS Classification, Start!

Running experiment 1/10...
q_hat = 1.0964112997055058
Average Prediction Set Size After APS in runs 1: 3.1784
Average Coverage Rate in runs 1: 0.907

Running experiment 2/10...
q_hat = 1.0756803274154667
Average Prediction Set Size After APS in runs 2: 3.0276
Average Coverage Rate in runs 2: 0.8994

Running experiment 3/10...
q_hat = 1.0740899562835697
Average Prediction Set Size After APS in runs 3: 3.0244
Average Coverage Rate in runs 3: 0.8952

Running experiment 4/10...
q_hat = 1.074631273746491
Average Prediction Set Size After APS in runs 4: 3.0036
Average Coverage Rate in runs 4: 0.8986

Running experiment 5/10...
q_hat = 1.0885197997093201
Average Prediction Set Size After APS in runs 5: 3.1088
Average Coverage Rate in runs 5: 0.9028

Running experiment 6/10...
q_hat = 1.0753330588340764
Average Prediction Set Size After APS in runs 6: 3.0534
Average Coverage Rate in runs 6: 0.8976

Running experiment 7/10...
q_

# $\alpha$=0.3

In [6]:
# Target error rate for this experiment
alpha = 0.3
# Hyperparameter passed to the SAPS helpers for this alpha
lambda_ = 0.35

# Per-run metrics; one entry is appended for every repetition
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for seed in range(num_runs):
    run_no = seed + 1
    print(f"Running experiment {run_no}/{num_runs}...")

    # Partition the data (seeded by the run index) and wrap both parts in loaders
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=seed)
    loader_kwargs = dict(batch_size=32, shuffle=False)
    calib_loader = DataLoader(calib_dataset, **loader_kwargs)
    test_loader = DataLoader(test_dataset, **loader_kwargs)

    # q_hat is the (1 - alpha) quantile of the scores from the calibration loader
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    quantile_level = 1 - alpha
    q_hat = np.quantile(calib_scores, quantile_level)
    print(f"q_hat = {q_hat}")

    # Build the prediction sets on the test loader with that threshold
    aps, aps_labels, true_labels = saps_classification(
        model, test_loader, q_hat, lambda_, device
    )

    # Measure average set size and coverage for this run
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {run_no}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {run_no}: {avg_coverage}\n")

    # Keep this run's metrics for the final aggregate
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Aggregate across runs: mean and population standard deviation (ddof=0)
final_avg_set_size, final_set_size_std = (
    np.mean(all_avg_set_sizes),
    np.std(all_avg_set_sizes, ddof=0),
)
final_avg_coverage, final_coverage_std = (
    np.mean(all_avg_coverages),
    np.std(all_avg_coverages, ddof=0),
)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

Files already downloaded and verified
SAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.852007782459259
Average Prediction Set Size After APS in runs 1: 1.244
Average Coverage Rate in runs 1: 0.7052

Running experiment 2/10...
q_hat = 0.8474344193935394
Average Prediction Set Size After APS in runs 2: 1.248
Average Coverage Rate in runs 2: 0.6914

Running experiment 3/10...
q_hat = 0.8496175467967987
Average Prediction Set Size After APS in runs 3: 1.2398
Average Coverage Rate in runs 3: 0.6916

Running experiment 4/10...
q_hat = 0.8487005472183228
Average Prediction Set Size After APS in runs 4: 1.2414
Average Coverage Rate in runs 4: 0.697

Running experiment 5/10...
q_hat = 0.8636034548282622
Average Prediction Set Size After APS in runs 5: 1.2714
Average Coverage Rate in runs 5: 0.7132

Running experiment 6/10...
q_hat = 0.8448722898960113
Average Prediction Set Size After APS in runs 6: 1.2474
Average Coverage Rate in runs 6: 0.6886

Running experiment 7/10...
q_ha

# $\alpha$=0.2

In [8]:
# Target error rate for this experiment
alpha = 0.2
# Hyperparameter passed to the SAPS helpers for this alpha
lambda_ = 0.13

# Accumulators for the per-run metrics
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for seed in range(num_runs):
    run_no = seed + 1
    print(f"Running experiment {run_no}/{num_runs}...")

    # Seeded split of the dataset, then one non-shuffling loader per part
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=seed)
    loader_kwargs = dict(batch_size=32, shuffle=False)
    calib_loader = DataLoader(calib_dataset, **loader_kwargs)
    test_loader = DataLoader(test_dataset, **loader_kwargs)

    # Threshold: (1 - alpha) quantile of the scores from the calibration loader
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    quantile_level = 1 - alpha
    q_hat = np.quantile(calib_scores, quantile_level)
    print(f"q_hat = {q_hat}")

    # Prediction sets for the test loader, built with q_hat
    aps, aps_labels, true_labels = saps_classification(
        model, test_loader, q_hat, lambda_, device
    )

    # Average set size and coverage of this run
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {run_no}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {run_no}: {avg_coverage}\n")

    # Store the metrics of this run
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Summary statistics over all runs (population std, ddof=0)
final_avg_set_size, final_set_size_std = (
    np.mean(all_avg_set_sizes),
    np.std(all_avg_set_sizes, ddof=0),
)
final_avg_coverage, final_coverage_std = (
    np.mean(all_avg_coverages),
    np.std(all_avg_coverages, ddof=0),
)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

SAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.9985983371734624
Average Prediction Set Size After APS in runs 1: 1.5232
Average Coverage Rate in runs 1: 0.8034

Running experiment 2/10...
q_hat = 0.98546462059021
Average Prediction Set Size After APS in runs 2: 1.5106
Average Coverage Rate in runs 2: 0.7934

Running experiment 3/10...
q_hat = 0.9982336521148683
Average Prediction Set Size After APS in runs 3: 1.5332
Average Coverage Rate in runs 3: 0.8018

Running experiment 4/10...
q_hat = 0.9930525898933411
Average Prediction Set Size After APS in runs 4: 1.5204
Average Coverage Rate in runs 4: 0.8014

Running experiment 5/10...
q_hat = 1.0012694835662845
Average Prediction Set Size After APS in runs 5: 1.5426
Average Coverage Rate in runs 5: 0.8088

Running experiment 6/10...
q_hat = 0.988970482349396
Average Prediction Set Size After APS in runs 6: 1.5226
Average Coverage Rate in runs 6: 0.7958

Running experiment 7/10...
q_hat = 0.9901547908782959
Average Predic

# $\alpha$=0.05  

In [12]:
# Target error rate for this experiment
alpha = 0.05
# Hyperparameter passed to the SAPS helpers for this alpha
lambda_ = 0.06

# Lists collecting one metric value per repetition
all_avg_set_sizes = []
all_avg_coverages = []
print("SAPS Classification, Start!\n")
for seed in range(num_runs):
    run_no = seed + 1
    print(f"Running experiment {run_no}/{num_runs}...")

    # Split with the run index as seed; both parts are read in order, batch size 32
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=seed)
    loader_kwargs = dict(batch_size=32, shuffle=False)
    calib_loader = DataLoader(calib_dataset, **loader_kwargs)
    test_loader = DataLoader(test_dataset, **loader_kwargs)

    # q_hat: (1 - alpha) quantile of the scores computed on the calibration loader
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    quantile_level = 1 - alpha
    q_hat = np.quantile(calib_scores, quantile_level)
    print(f"q_hat = {q_hat}")

    # Construct the prediction sets for the test loader from q_hat
    aps, aps_labels, true_labels = saps_classification(
        model, test_loader, q_hat, lambda_, device
    )

    # Evaluate this run: average set size and coverage
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {run_no}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {run_no}: {avg_coverage}\n")

    # Remember the metrics of this run
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Final figures: mean and population standard deviation (ddof=0) over the runs
final_avg_set_size, final_set_size_std = (
    np.mean(all_avg_set_sizes),
    np.std(all_avg_set_sizes, ddof=0),
)
final_avg_coverage, final_coverage_std = (
    np.mean(all_avg_coverages),
    np.std(all_avg_coverages, ddof=0),
)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

SAPS Classification, Start!

Running experiment 1/10...
q_hat = 1.126237314939499
Average Prediction Set Size After APS in runs 1: 6.2174
Average Coverage Rate in runs 1: 0.9548

Running experiment 2/10...
q_hat = 1.120194458961487
Average Prediction Set Size After APS in runs 2: 6.1512
Average Coverage Rate in runs 2: 0.9516

Running experiment 3/10...
q_hat = 1.0986760258674622
Average Prediction Set Size After APS in runs 3: 5.7972
Average Coverage Rate in runs 3: 0.946

Running experiment 4/10...
q_hat = 1.1138765454292299
Average Prediction Set Size After APS in runs 4: 6.0104
Average Coverage Rate in runs 4: 0.9496

Running experiment 5/10...
q_hat = 1.125487130880356
Average Prediction Set Size After APS in runs 5: 6.1846
Average Coverage Rate in runs 5: 0.9534

Running experiment 6/10...
q_hat = 1.1125442326068884
Average Prediction Set Size After APS in runs 6: 6.0622
Average Coverage Rate in runs 6: 0.9518

Running experiment 7/10...
q_hat = 1.1186222314834597
Average Predict

# Results

$\alpha$=0.1  
The experiment above (averaged over 10 runs) gives the following results:
- Final Average Prediction Set Size: 3.06
- Final Average Coverage: 89.91%  

$\alpha$=0.3  
The experiment above (averaged over 10 runs) gives the following results:
- Final Average Prediction Set Size: 1.25
- Final Average Coverage: 69.57%  

$\alpha$=0.2  
The experiment above (averaged over 10 runs) gives the following results:
- Final Average Prediction Set Size: 1.52
- Final Average Coverage: 79.74%  

$\alpha$=0.05  
The experiment above (averaged over 10 runs) gives the following results:
- Final Average Prediction Set Size: 6.08
- Final Average Coverage: 95.08%