# $\alpha$=0.1

In [14]:
import torch
import numpy as np
import torchvision.transforms as transforms               # include image preprocess tools
from torchvision.datasets import CIFAR100        # for loading images from Pytorch CIFAR
from torch.utils.data import DataLoader
import timm
import detectors
from src.raps import raps_scores, raps_classification, eval_aps, split_data_set
from src.temperature_scaling import ModelWithTemperature

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load fine-tuned model
model = timm.create_model("vgg16_bn_cifar100", pretrained=True)
model = model.to(device)

# reprocess the images from CIFAR
data_transform = transforms.Compose([
    transforms.ToTensor(),          # transfer to tensor
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762))  # normalize
])
# load images from CIFAR100
dataset = CIFAR100(root="../../data", train=False, download=True, transform=data_transform)

# temperature scaling
temp_scal_loader = DataLoader(dataset, batch_size=32, shuffle=True)
model = ModelWithTemperature(model, temperature=4.85).to(device)
model.set_temperature(temp_scal_loader)

model.eval()

# The number of times the experiment is going to be repeated
num_runs = 10

# error rate
alpha = 0.1
lambda_reg = 0.01
k_reg = 1

# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("RAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # split dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = raps_scores(model, calib_loader, alpha, lambda_reg, k_reg, device)
    t_cal = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"t_cal = {t_cal}")

    # construct APS
    aps, aps_labels, true_labels = raps_classification(model, test_loader, t_cal, lambda_reg, k_reg, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

Files already downloaded and verified
Before temperature - NLL: 1.484, ECE: 0.157
Optimal temperature: 4.690
After temperature - NLL: 2.903, ECE: 0.613
RAPS Classification, Start!

Running experiment 1/10...
t_cal = 0.21467490792274482
Total set size: 24504
Total coverage sets: 4502
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 4.9008
Average Coverage Ratein runs 1: 0.9004

Running experiment 2/10...
t_cal = 0.21834406405687387
Total set size: 25155
Total coverage sets: 4535
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 5.031
Average Coverage Ratein runs 2: 0.907

Running experiment 3/10...
t_cal = 0.22038835436105736
Total set size: 25606
Total coverage sets: 4543
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 5.1212
Average Coverage Ratein runs 3: 0.9086

Running experiment 4/10...
t_cal = 0.21140019744634633
Total set size: 23677
Total coverage sets: 4504
Total samples amount: 5000
Average Predi

# $\alpha$=0.2

In [16]:
# error rate
alpha = 0.2
lambda_reg = 0.1
k_reg = 2

# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("RAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # split dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = raps_scores(model, calib_loader, alpha, lambda_reg, k_reg, device)
    t_cal = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"t_cal = {t_cal}")

    # construct APS
    aps, aps_labels, true_labels = raps_classification(model, test_loader, t_cal, lambda_reg, k_reg, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

RAPS Classification, Start!

Running experiment 1/10...
t_cal = 0.1471326678991319
Total set size: 9233
Total coverage sets: 4020
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 1.8466
Average Coverage Ratein runs 1: 0.804

Running experiment 2/10...
t_cal = 0.14634882509708408
Total set size: 9162
Total coverage sets: 4009
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 1.8324
Average Coverage Ratein runs 2: 0.8018

Running experiment 3/10...
t_cal = 0.15335183143615724
Total set size: 9475
Total coverage sets: 4059
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 1.895
Average Coverage Ratein runs 3: 0.8118

Running experiment 4/10...
t_cal = 0.14686185717582728
Total set size: 9164
Total coverage sets: 3981
Total samples amount: 5000
Average Prediction Set Size After APS in runs 4: 1.8328
Average Coverage Ratein runs 4: 0.7962

Running experiment 5/10...
t_cal = 0.15219205319881443
Total set size: 941

# $\alpha$=0.05

In [29]:
# error rate
alpha = 0.05
lambda_reg = 0.001
k_reg = 1

# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("RAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # split dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = raps_scores(model, calib_loader, alpha, lambda_reg, k_reg, device)
    t_cal = np.quantile(calib_scores, 1 - alpha)  # calculate 1-alpha quantile
    print(f"t_cal = {t_cal}")

    # construct APS
    aps, aps_labels, true_labels = raps_classification(model, test_loader, t_cal, lambda_reg, k_reg, device)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

RAPS Classification, Start!

Running experiment 1/10...
t_cal = 0.303019055724144
Total set size: 68212
Total coverage sets: 4725
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 13.6424
Average Coverage Ratein runs 1: 0.945

Running experiment 2/10...
t_cal = 0.3243697315454484
Total set size: 76158
Total coverage sets: 4763
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 15.2316
Average Coverage Ratein runs 2: 0.9526

Running experiment 3/10...
t_cal = 0.31974911093711866
Total set size: 74455
Total coverage sets: 4761
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 14.891
Average Coverage Ratein runs 3: 0.9522

Running experiment 4/10...
t_cal = 0.3114962831139567
Total set size: 71015
Total coverage sets: 4748
Total samples amount: 5000
Average Prediction Set Size After APS in runs 4: 14.203
Average Coverage Ratein runs 4: 0.9496

Running experiment 5/10...
t_cal = 0.3065984219312669
Total set size: 

# Result
  
$\alpha$=0.1
- Final Average **Prediction Set Size： 4.72**
- Final Average **Coverage: 89.82%**  

$\alpha$=0.2
- Final Average **Prediction Set Size： 1.83**
- Final Average **Coverage: 79.78%**  

$\alpha$=0.05
- Final Average **Prediction Set Size： 14.36**
- Final Average **Coverage: 94.89%**