# $\alpha$=0.1

In [4]:
import torch
import numpy as np
import torchvision.transforms as transforms               # include image preprocess tools
from torchvision.datasets import CIFAR100        # for loading images from Pytorch CIFAR
from torch.utils.data import DataLoader
import timm
import detectors
from src.temperature_scaling import ModelWithTemperature
from src.aps import split_data_set, aps_scores, aps_classification, eval_aps

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load fine-tuned model
model = timm.create_model("resnet18_cifar100", pretrained=True)
model.to(device)

# reprocess the images from CIFAR
data_transform = transforms.Compose([
    transforms.ToTensor(),          # transfer to tensor
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762))  # normalize
])
# load images from CIFAR10
dataset = CIFAR100(root="../../data", train=False, download=True, transform=data_transform)

# Tempreture Scaling
temp_scal_loader = DataLoader(dataset, batch_size=32, shuffle=True)
model = ModelWithTemperature(model, temperature=4.85).to(device)
model.set_temperature(temp_scal_loader)

model.eval()

# The number of times the experiment is going to be repeated
num_runs = 10

# error rate
alpha = 0.1

# contruct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("APS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # splite dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = aps_scores(model, calib_loader, alpha, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"q_hat = {q_hat}")

    # construct APS
    aps, aps_labels, true_labels = aps_classification(model, test_loader, q_hat, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Ratein runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)

print(f"Final Average Prediction Set Size: {final_avg_set_size}")
print(f"Final Average Coverage: {final_avg_coverage}")

Files already downloaded and verified
Before temperature - NLL: 1.122, ECE: 0.047
Optimal temperature: 4.704
After temperature - NLL: 3.093, ECE: 0.667
APS Classification, Start!

Running experiment 1/10...
q_hat = 0.18287866264581684
Total set size: 31997
Total coverage sets: 4519
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 6.3994
Average Coverage Ratein runs 1: 0.9038

Running experiment 2/10...
q_hat = 0.17874894440174105
Total set size: 30640
Total coverage sets: 4503
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 6.128
Average Coverage Ratein runs 2: 0.9006

Running experiment 3/10...
q_hat = 0.181287394464016
Total set size: 31850
Total coverage sets: 4523
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 6.37
Average Coverage Ratein runs 3: 0.9046

Running experiment 4/10...
q_hat = 0.17564161121845256
Total set size: 29764
Total coverage sets: 4497
Total samples amount: 5000
Average Predictio

# $\alpha$=0.2

In [5]:
# error rate
alpha = 0.2

# contruct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("APS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # splite dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = aps_scores(model, calib_loader, alpha, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"q_hat = {q_hat}")

    # construct APS
    aps, aps_labels, true_labels = aps_classification(model, test_loader, q_hat, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Ratein runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)

print(f"Final Average Prediction Set Size: {final_avg_set_size}")
print(f"Final Average Coverage: {final_avg_coverage}")

APS Classification, Start!

Running experiment 1/10...
q_hat = 0.12326962649822239
Total set size: 16166
Total coverage sets: 3998
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 3.2332
Average Coverage Ratein runs 1: 0.7996

Running experiment 2/10...
q_hat = 0.12327495068311695
Total set size: 16175
Total coverage sets: 4016
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 3.235
Average Coverage Ratein runs 2: 0.8032

Running experiment 3/10...
q_hat = 0.12524716556072238
Total set size: 16796
Total coverage sets: 4059
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 3.3592
Average Coverage Ratein runs 3: 0.8118

Running experiment 4/10...
q_hat = 0.1242065027356148
Total set size: 16267
Total coverage sets: 4018
Total samples amount: 5000
Average Prediction Set Size After APS in runs 4: 3.2534
Average Coverage Ratein runs 4: 0.8036

Running experiment 5/10...
q_hat = 0.12496348023414615
Total set size:

# $\alpha$=0.05

In [6]:
# error rate
alpha = 0.05

# contruct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("APS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # splite dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = aps_scores(model, calib_loader, alpha, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"q_hat = {q_hat}")

    # construct APS
    aps, aps_labels, true_labels = aps_classification(model, test_loader, q_hat, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Ratein runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)

print(f"Final Average Prediction Set Size: {final_avg_set_size}")
print(f"Final Average Coverage: {final_avg_coverage}")

APS Classification, Start!

Running experiment 1/10...
q_hat = 0.2532627552747729
Total set size: 57123
Total coverage sets: 4765
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 11.4246
Average Coverage Ratein runs 1: 0.953

Running experiment 2/10...
q_hat = 0.24282747209072128
Total set size: 52935
Total coverage sets: 4737
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 10.587
Average Coverage Ratein runs 2: 0.9474

Running experiment 3/10...
q_hat = 0.24929274246096614
Total set size: 56205
Total coverage sets: 4742
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 11.241
Average Coverage Ratein runs 3: 0.9484

Running experiment 4/10...
q_hat = 0.24088426455855383
Total set size: 52392
Total coverage sets: 4721
Total samples amount: 5000
Average Prediction Set Size After APS in runs 4: 10.4784
Average Coverage Ratein runs 4: 0.9442

Running experiment 5/10...
q_hat = 0.2559720039367677
Total set size

# Result
  
$\alpha$=0.1
- Final Average **Prediction Set Size： 6.27**
- Final Average **Coverage: 90.17%**  

$\alpha$=0.2
- Final Average **Prediction Set Size： 3.27**
- Final Average **Coverage: 80.17%**  

$\alpha$=0.05
- Final Average **Prediction Set Size： 11.13**
- Final Average **Coverage: 94.83%**