# $\alpha$=0.1

In [1]:
import torch
import torchvision.transforms as transforms              
from torchvision.datasets import ImageFolder
import torchvision.models as models
from torchvision.models import Inception_V3_Weights
from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data import Subset
from src.temperature_scaling import ModelWithTemperature
from src.saps import split_data_set, saps_scores, saps_classification, eval_aps

# Load the ImageNet-pretrained InceptionV3 and move it to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.inception_v3(weights=Inception_V3_Weights.IMAGENET1K_V1).to(device)

# Preprocess: center-crop to 299x299, resize to 299, convert to a tensor, then
# normalize each channel with the mean/std listed below.
# NOTE(review): CenterCrop(299) already yields a 299x299 image, so the following
# Resize(299) looks like a no-op — confirm this is the intended preprocessing.
data_transform = transforms.Compose([
    transforms.CenterCrop(299),
    transforms.Resize(299),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# NOTE(review): machine-specific absolute path; point this at your local copy of the
# class-sorted ImageNet validation folder before running.
sorted_val_path = "D:\\Download\\ImageNet-1K\\Validation_Set\\sorted_ImageNet_val"
dataset = ImageFolder(root=sorted_val_path, transform=data_transform)

# Temperature Scaling
model.eval()  # eval mode (original note: only the main logits of Inception's output are used)

# Fit the temperature on a random 10% subset of the dataset. The subset is drawn from a
# seeded generator so that Restart & Run All always selects the same images.
subset_size = len(dataset) // 10
subset_rng = np.random.default_rng(0)
indices = subset_rng.choice(len(dataset), subset_size, replace=False)
subset_dataset = Subset(dataset, indices)
train_loader = DataLoader(subset_dataset, batch_size=32, shuffle=False, num_workers=4)

# Wrap the model, start from temperature 1.0 and let set_temperature tune it on the subset
model_with_temp = ModelWithTemperature(model, temperature = 1.0).to(device)
model_with_temp.set_temperature(train_loader)
model_with_temp.eval()

# Number of times the calibrate/evaluate experiment is repeated
num_runs = 10

# Target error rate (the prediction sets aim for 1 - alpha coverage)
alpha = 0.1
# SAPS hyperparameter forwarded to saps_scores / saps_classification
lambda_ = 0.15

# Per-run metrics, appended to inside the loop and aggregated afterwards
all_avg_set_sizes = []
all_avg_coverages = []
print("\n")
print("SAPS Classification, Start!\n")


for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # Split the dataset into calibration and test parts; the run index is the seed,
    # so every run uses a different split
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # One loader per split
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False,num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

    # Calibration: q_hat is the (1 - alpha) quantile of the calibration scores.
    # Derived from `alpha` (previously hard-coded as 1 - 0.1) so that changing the
    # error rate above cannot leave a stale quantile level behind.
    # NOTE(review): the raw `model` is scored here, not the temperature-scaled
    # `model_with_temp` built earlier — confirm which one is intended.
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)
    print(f"q_hat = {q_hat}")

    # Build the prediction sets on the test split using the calibrated threshold
    aps, aps_labels, true_labels = saps_classification(model, test_loader, q_hat, lambda_, device)

    # Evaluate the prediction sets: average set size and coverage
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {100 * avg_coverage:.2f}%\n")

    # Record this run's result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Aggregate across runs: mean and population standard deviation (ddof=0)
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

Before temperature - NLL: 1.072, ECE: 0.023
Optimal temperature: 0.967
After temperature - NLL: 1.063, ECE: 0.030


SAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.9859709382057192
Average Prediction Set Size After APS in runs 1: 2.567
Average Coverage Rate in runs 1: 90.03%

Running experiment 2/10...
q_hat = 0.9852838039398194
Average Prediction Set Size After APS in runs 2: 2.5364
Average Coverage Rate in runs 2: 90.11%

Running experiment 3/10...
q_hat = 0.9877974450588226
Average Prediction Set Size After APS in runs 3: 2.57048
Average Coverage Rate in runs 3: 90.36%

Running experiment 4/10...
q_hat = 0.9778015196323395
Average Prediction Set Size After APS in runs 4: 2.52992
Average Coverage Rate in runs 4: 89.74%

Running experiment 5/10...
q_hat = 0.9825943648815155
Average Prediction Set Size After APS in runs 5: 2.54012
Average Coverage Rate in runs 5: 89.99%

Running experiment 6/10...
q_hat = 0.9856502950191502
Average Prediction Set Size After APS in run

# $\alpha$=0.2

In [4]:
# Experiment settings: target error rate and the SAPS hyperparameter
alpha = 0.2
lambda_ = 0.2

# Per-run metrics collected by the loop below
all_avg_set_sizes, all_avg_coverages = [], []

print("\n")
print("SAPS Classification, Start!\n")

for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # Calibration/test split for this run, seeded with the run index
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # Build both loaders with identical settings (calibration first, then test)
    calib_loader, test_loader = (
        DataLoader(part, batch_size=32, shuffle=False, num_workers=4)
        for part in (calib_dataset, test_dataset)
    )

    # Threshold q_hat: the (1 - alpha) quantile of the calibration scores
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)
    print(f"q_hat = {q_hat}")

    # Prediction sets for the test split, then their size/coverage metrics
    aps, aps_labels, true_labels = saps_classification(
        model, test_loader, q_hat, lambda_, device
    )
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {100 * avg_coverage:.2f}%\n")

    # Keep this run's numbers for the final summary
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Summary over all runs: mean and population standard deviation (ddof=0)
final_avg_set_size = np.mean(all_avg_set_sizes)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_avg_coverage = np.mean(all_avg_coverages)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")



SAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.8379756212234497
Average Prediction Set Size After APS in runs 1: 1.6626
Average Coverage Rate in runs 1: 79.98%

Running experiment 2/10...
q_hat = 0.832955825328827
Average Prediction Set Size After APS in runs 2: 1.62996
Average Coverage Rate in runs 2: 79.43%

Running experiment 3/10...
q_hat = 0.843045997619629
Average Prediction Set Size After APS in runs 3: 1.67088
Average Coverage Rate in runs 3: 80.70%

Running experiment 4/10...
q_hat = 0.8337469339370728
Average Prediction Set Size After APS in runs 4: 1.65036
Average Coverage Rate in runs 4: 79.50%

Running experiment 5/10...
q_hat = 0.8381602287292481
Average Prediction Set Size After APS in runs 5: 1.65888
Average Coverage Rate in runs 5: 80.01%

Running experiment 6/10...
q_hat = 0.8359057307243347
Average Prediction Set Size After APS in runs 6: 1.65256
Average Coverage Rate in runs 6: 79.84%

Running experiment 7/10...
q_hat = 0.8336599826812745
Averag

# $\alpha$=0.05

In [3]:
# Experiment settings: target error rate and the SAPS hyperparameter
alpha = 0.05
lambda_ = 0.2

# Per-run metrics collected by the loop below
all_avg_set_sizes, all_avg_coverages = [], []

print("\n")
print("SAPS Classification, Start!\n")

for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # Calibration/test split for this run, seeded with the run index
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # Shared loader settings; calibration loader is created first, then the test loader
    loader_options = dict(batch_size=32, shuffle=False, num_workers=4)
    calib_loader = DataLoader(calib_dataset, **loader_options)
    test_loader = DataLoader(test_dataset, **loader_options)

    # Threshold q_hat: the (1 - alpha) quantile of the calibration scores
    calib_scores, _ = saps_scores(model, calib_loader, alpha, lambda_, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)
    print(f"q_hat = {q_hat}")

    # Prediction sets for the test split, then their size/coverage metrics
    aps, aps_labels, true_labels = saps_classification(
        model, test_loader, q_hat, lambda_, device
    )
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {100 * avg_coverage:.2f}%\n")

    # Keep this run's numbers for the final summary
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Summary over all runs: mean and population standard deviation (ddof=0)
final_avg_set_size = np.mean(all_avg_set_sizes)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_avg_coverage = np.mean(all_avg_coverages)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")



SAPS Classification, Start!

Running experiment 1/10...
q_hat = 2.135695004463195
Average Prediction Set Size After APS in runs 1: 7.9186
Average Coverage Rate in runs 1: 94.96%

Running experiment 2/10...
q_hat = 2.138794517517089
Average Prediction Set Size After APS in runs 2: 7.91424
Average Coverage Rate in runs 2: 95.01%

Running experiment 3/10...
q_hat = 2.1840238094329814
Average Prediction Set Size After APS in runs 3: 8.1576
Average Coverage Rate in runs 3: 95.14%

Running experiment 4/10...
q_hat = 2.1064960718154904
Average Prediction Set Size After APS in runs 4: 7.78168
Average Coverage Rate in runs 4: 94.88%

Running experiment 5/10...
q_hat = 2.132099151611327
Average Prediction Set Size After APS in runs 5: 7.89776
Average Coverage Rate in runs 5: 94.93%

Running experiment 6/10...
q_hat = 2.1743104577064485
Average Prediction Set Size After APS in runs 6: 8.10788
Average Coverage Rate in runs 6: 95.10%

Running experiment 7/10...
q_hat = 2.081215882301328
Average P

## Result  
$\alpha$=0.1  
Averaged over the 10 runs above ($\lambda$ = 0.15), the results are:
- Final Average Prediction Set Size: 2.55
- Final Average Coverage: 90.05%  

$\alpha$=0.2  
Averaged over the 10 runs above ($\lambda$ = 0.2), the results are:
- Final Average Prediction Set Size: 1.65
- Final Average Coverage: 79.91%  

$\alpha$=0.05  
Averaged over the 10 runs above ($\lambda$ = 0.2), the results are:
- Final Average Prediction Set Size: 7.97
- Final Average Coverage: 95.00%