# $\alpha$=0.1

In [7]:
import torch
import torchvision.transforms as transforms              
from torchvision.datasets import CIFAR100      
from torch.utils.data import DataLoader
from src.inception_cifar100 import inceptionv3
from src.temperature_scaling import ModelWithTemperature
import numpy as np
from src.raps import split_data_set, raps_scores, raps_classification, eval_aps


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): absolute, machine-specific checkpoint path; adjust it before
# running this notebook on another machine.
dict_path = "C:\\Users\\jiayang\\ipynb\\trainedModel\\Inception_CIFAR100.pth"
model = inceptionv3()
model.load_state_dict(torch.load(dict_path, map_location=device, weights_only=True))
model.to(device)

# preprocess the images from CIFAR100
data_transform = transforms.Compose([
    transforms.ToTensor(),         
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762))  
])

# load the CIFAR100 test split (train=False)
dataset = CIFAR100(root="../../data", train=False, download=True,transform=data_transform)

# Temperature Scaling
# NOTE(review): the temperature is set on the full `dataset`, i.e. the same
# samples that are later split into calibration and test subsets below.
temp_scal_loader = DataLoader(dataset, batch_size=32, shuffle=True)
model = ModelWithTemperature(model, temperature=0.5).to(device)
model.set_temperature(temp_scal_loader)
model.eval()

# The number of times the experiment is going to be repeated
num_runs = 10

# error rate
alpha = 0.1
# Hyperparameters passed to the RAPS helpers
lambda_ = 0.02
k_reg = 7

# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("RAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # split dataset; the run index is used as the random seed
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat as the (1 - alpha) quantile of the calibration scores.
    # The level is derived from `alpha` (previously hard-coded as 1 - 0.1) so
    # that changing `alpha` above cannot silently leave the threshold behind.
    # NOTE(review): split conformal prediction usually uses the finite-sample
    # level ceil((n + 1) * (1 - alpha)) / n; confirm the plain quantile is intended.
    calib_scores, _ = raps_scores(model, calib_loader, alpha, lambda_, k_reg, device)
    q_hat = np.quantile(calib_scores, 1 - alpha)
    print(f"q_hat = {q_hat}")

    # construct the RAPS prediction sets on the test subset
    aps, aps_labels, true_labels = raps_classification(model, test_loader, q_hat, lambda_, k_reg, device)

    # evaluate the prediction sets (average set size and coverage)
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result (population std, ddof=0, across runs)
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)
final_set_size_std = np.std(all_avg_set_sizes, ddof=0)
final_coverage_std = np.std(all_avg_coverages, ddof=0)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

Files already downloaded and verified
Before temperature - NLL: 1.387, ECE: 0.123
Optimal temperature: 0.633
After temperature - NLL: 1.877, ECE: 0.206
RAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.9980654358863831
Total set size: 15655
Total coverage sets: 4540
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 3.131
Average Coverage Rate in runs 1: 0.908

Running experiment 2/10...
q_hat = 0.9977879881858825
Total set size: 15599
Total coverage sets: 4501
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 3.1198
Average Coverage Rate in runs 2: 0.9002

Running experiment 3/10...
q_hat = 0.9975217282772064
Total set size: 15368
Total coverage sets: 4489
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 3.0736
Average Coverage Rate in runs 3: 0.8978

Running experiment 4/10...
q_hat = 0.99721839427948
Total set size: 14792
Total coverage sets: 4473
Total samples amount: 5000
Average Predicti

# $\alpha$=0.3

In [7]:
# Target miscoverage level and the two hyperparameters handed to the RAPS helpers.
alpha = 0.3
lambda_, k_reg = 0.02, 4

# Per-run metrics, accumulated over all repetitions.
all_avg_set_sizes, all_avg_coverages = [], []

print("RAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # The run index doubles as the random seed of the split.
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # One unshuffled loader per subset (calibration first, then test).
    calib_loader, test_loader = (
        DataLoader(subset, batch_size=32, shuffle=False)
        for subset in (calib_dataset, test_dataset)
    )

    # Threshold q_hat: the (1 - alpha) quantile of the calibration scores.
    calib_scores, _ = raps_scores(
        model, calib_loader, alpha, lambda_, k_reg, device
    )
    q_hat = np.quantile(calib_scores, q=1 - alpha)
    print(f"q_hat = {q_hat}")

    # Build the prediction sets on the test subset with that threshold.
    aps, aps_labels, true_labels = raps_classification(
        model, test_loader, q_hat, lambda_, k_reg, device
    )

    # Score the sets and report this run's numbers.
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Aggregate across runs: mean and population standard deviation (ddof=0).
final_avg_set_size, final_set_size_std = (
    np.mean(all_avg_set_sizes),
    np.std(all_avg_set_sizes, ddof=0),
)
final_avg_coverage, final_coverage_std = (
    np.mean(all_avg_coverages),
    np.std(all_avg_coverages, ddof=0),
)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

RAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.8714932680130004
Total set size: 6397
Total coverage sets: 3484
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 1.2794
Average Coverage Rate in runs 1: 0.6968

Running experiment 2/10...
q_hat = 0.8679610311985014
Total set size: 6436
Total coverage sets: 3494
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 1.2872
Average Coverage Rate in runs 2: 0.6988

Running experiment 3/10...
q_hat = 0.8744889080524444
Total set size: 6417
Total coverage sets: 3484
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 1.2834
Average Coverage Rate in runs 3: 0.6968

Running experiment 4/10...
q_hat = 0.8685566127300262
Total set size: 6363
Total coverage sets: 3461
Total samples amount: 5000
Average Prediction Set Size After APS in runs 4: 1.2726
Average Coverage Rate in runs 4: 0.6922

Running experiment 5/10...
q_hat = 0.8814487814903259
Total set size: 6

# $\alpha$=0.2

In [10]:
# Target miscoverage level and the two hyperparameters handed to the RAPS helpers.
alpha = 0.2
lambda_, k_reg = 0.02, 4

# Per-run metrics, accumulated over all repetitions.
all_avg_set_sizes, all_avg_coverages = [], []

print("RAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # The run index doubles as the random seed of the split.
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # One unshuffled loader per subset (calibration first, then test).
    calib_loader, test_loader = (
        DataLoader(subset, batch_size=32, shuffle=False)
        for subset in (calib_dataset, test_dataset)
    )

    # Threshold q_hat: the (1 - alpha) quantile of the calibration scores.
    calib_scores, _ = raps_scores(
        model, calib_loader, alpha, lambda_, k_reg, device
    )
    q_hat = np.quantile(calib_scores, q=1 - alpha)
    print(f"q_hat = {q_hat}")

    # Build the prediction sets on the test subset with that threshold.
    aps, aps_labels, true_labels = raps_classification(
        model, test_loader, q_hat, lambda_, k_reg, device
    )

    # Score the sets and report this run's numbers.
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Aggregate across runs: mean and population standard deviation (ddof=0).
final_avg_set_size, final_set_size_std = (
    np.mean(all_avg_set_sizes),
    np.std(all_avg_set_sizes, ddof=0),
)
final_avg_coverage, final_coverage_std = (
    np.mean(all_avg_coverages),
    np.std(all_avg_coverages, ddof=0),
)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

RAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.9650512099266053
Total set size: 9084
Total coverage sets: 4034
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 1.8168
Average Coverage Rate in runs 1: 0.8068

Running experiment 2/10...
q_hat = 0.9589962959289551
Total set size: 8896
Total coverage sets: 3970
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 1.7792
Average Coverage Rate in runs 2: 0.794

Running experiment 3/10...
q_hat = 0.9598138928413392
Total set size: 8910
Total coverage sets: 3986
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 1.782
Average Coverage Rate in runs 3: 0.7972

Running experiment 4/10...
q_hat = 0.9585777997970581
Total set size: 8770
Total coverage sets: 3964
Total samples amount: 5000
Average Prediction Set Size After APS in runs 4: 1.754
Average Coverage Rate in runs 4: 0.7928

Running experiment 5/10...
q_hat = 0.964921748638153
Total set size: 9127


# $\alpha$=0.05

In [1]:
import torch
import torchvision.transforms as transforms              
from torchvision.datasets import CIFAR100      
from torch.utils.data import DataLoader
from src.inception_cifar100 import inceptionv3
from src.temperature_scaling import ModelWithTemperature
import numpy as np
from src.raps import split_data_set, raps_scores, raps_classification, eval_aps


# Prefer CUDA when it is available; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the trained weights from the checkpoint and move the network to `device`.
dict_path = "C:\\Users\\jiayang\\ipynb\\trainedModel\\Inception_CIFAR100.pth"
model = inceptionv3()
model.load_state_dict(
    torch.load(dict_path, map_location=device, weights_only=True)
)
model.to(device)

# Image preprocessing: tensor conversion followed by per-channel normalisation
# (presumably the CIFAR-100 channel mean/std; confirm against training).
data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.5071, 0.4865, 0.4409),
        std=(0.2673, 0.2564, 0.2762),
    ),
])

# CIFAR-100 test split (train=False), downloaded on demand.
dataset = CIFAR100(
    root="../../data",
    train=False,
    download=True,
    transform=data_transform,
)

# Temperature scaling: wrap the network (initial temperature 0.5), let
# set_temperature() tune it on a shuffled loader over `dataset`, then switch
# the wrapped model to evaluation mode.
temp_scal_loader = DataLoader(dataset, batch_size=32, shuffle=True)
model = ModelWithTemperature(model, temperature=0.5).to(device)
model.set_temperature(temp_scal_loader)
model.eval()

# How many times each experiment below is repeated.
num_runs = 10

Files already downloaded and verified
Before temperature - NLL: 1.378, ECE: 0.120
Optimal temperature: 0.633
After temperature - NLL: 1.865, ECE: 0.203


In [16]:
# Target miscoverage level and the two hyperparameters handed to the RAPS helpers.
alpha = 0.05
lambda_, k_reg = 0.02, 9

# Per-run metrics, accumulated over all repetitions.
all_avg_set_sizes, all_avg_coverages = [], []

print("RAPS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # The run index doubles as the random seed of the split.
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # One unshuffled loader per subset (calibration first, then test).
    calib_loader, test_loader = (
        DataLoader(subset, batch_size=32, shuffle=False)
        for subset in (calib_dataset, test_dataset)
    )

    # Threshold q_hat: the (1 - alpha) quantile of the calibration scores.
    calib_scores, _ = raps_scores(
        model, calib_loader, alpha, lambda_, k_reg, device
    )
    q_hat = np.quantile(calib_scores, q=1 - alpha)
    print(f"q_hat = {q_hat}")

    # Build the prediction sets on the test subset with that threshold.
    aps, aps_labels, true_labels = raps_classification(
        model, test_loader, q_hat, lambda_, k_reg, device
    )

    # Score the sets and report this run's numbers.
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {avg_coverage}\n")

    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# Aggregate across runs: mean and population standard deviation (ddof=0).
final_avg_set_size, final_set_size_std = (
    np.mean(all_avg_set_sizes),
    np.std(all_avg_set_sizes, ddof=0),
)
final_avg_coverage, final_coverage_std = (
    np.mean(all_avg_coverages),
    np.std(all_avg_coverages, ddof=0),
)

print(f"Final Average Prediction Set Size: {final_avg_set_size:.2f} ± {final_set_size_std:.2f}")
print(f"Final Average Coverage: {final_avg_coverage:.4f} ± {final_coverage_std:.4f}")

RAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.9999985277652741
Total set size: 28544
Total coverage sets: 4775
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 5.7088
Average Coverage Rate in runs 1: 0.955

Running experiment 2/10...
q_hat = 0.9999951183795929
Total set size: 26830
Total coverage sets: 4756
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 5.366
Average Coverage Rate in runs 2: 0.9512

Running experiment 3/10...
q_hat = 0.9999740183353424
Total set size: 24009
Total coverage sets: 4709
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 4.8018
Average Coverage Rate in runs 3: 0.9418

Running experiment 4/10...
q_hat = 0.9999893337488175
Total set size: 25376
Total coverage sets: 4738
Total samples amount: 5000
Average Prediction Set Size After APS in runs 4: 5.0752
Average Coverage Rate in runs 4: 0.9476

Running experiment 5/10...
q_hat = 0.9999927788972854
Total set size:

# Results
  
$\alpha$=0.1  
The experiment above gives the following results:
- Final Average Prediction Set Size: 3.08
- Final Average Coverage: 90.04%  

$\alpha$=0.3  
The experiment above gives the following results:
- Final Average Prediction Set Size: 1.28
- Final Average Coverage: 69.65%  

$\alpha$=0.2  
The experiment above gives the following results:
- Final Average Prediction Set Size: 1.78
- Final Average Coverage: 79.71% 

$\alpha$=0.05  
The experiment above gives the following results:
- Final Average Prediction Set Size: 5.47
- Final Average Coverage: 94.90%