# $\alpha$=0.1

In [1]:
import os

import numpy as np
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from torchvision.datasets import ImageFolder
from torchvision.models import VGG16_BN_Weights

from src.temperature_scaling import ModelWithTemperature
from src.saps import saps_test

# Seed the NumPy and PyTorch global RNGs so the calibration subset drawn below
# is the same on every "Restart Kernel -> Run All".
# NOTE(review): whether saps_test's internal splits also become repeatable
# depends on which RNG it uses -- confirm in src/saps.py.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the ImageNet-pretrained VGG16-BN classifier onto the available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.vgg16_bn(weights=VGG16_BN_Weights.IMAGENET1K_V1).to(device)
# A freshly built nn.Module is in training mode. Switch to inference mode
# *before* calibration so dropout is off and BatchNorm uses its stored
# statistics, in case ModelWithTemperature.set_temperature does not do so itself.
model.eval()

# Preprocess: standard ImageNet evaluation pipeline (resize, centre crop,
# tensor conversion, per-channel normalisation).
data_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Root of the class-sorted validation images. Set IMAGENET_VAL_DIR to point
# somewhere else; the original local path is kept as the fallback.
sorted_val_path = os.environ.get(
    "IMAGENET_VAL_DIR",
    "D:\\Download\\ImageNet-1K\\Validation_Set\\sorted_ImageNet_val",
)
dataset = ImageFolder(root=sorted_val_path, transform=data_transform)

# Temperature scaling: fit the temperature on a random 10% of the images.
# NOTE(review): this subset is drawn from the same `dataset` that is passed to
# saps_test below, so calibration and evaluation images overlap.
subset_size = len(dataset) // 10
indices = np.random.choice(len(dataset), subset_size, replace=False)
subset_dataset = Subset(dataset, indices)
train_loader = DataLoader(subset_dataset, batch_size=32, shuffle=False, num_workers=4)

model = ModelWithTemperature(model, temperature = 1.0).to(device)
model.set_temperature(train_loader)
# Put the wrapper (and the wrapped network) in inference mode for evaluation.
model.eval()

# SAPS evaluation at alpha = 0.1: ten runs, each reporting q_hat, the average
# prediction set size and the coverage rate.
saps_test(model, dataset, device, num_runs=10, alpha=0.1, lambda_=0.15)

Before temperature - NLL: 1.187, ECE: 0.021
Optimal temperature: 1.020
After temperature - NLL: 1.186, ECE: 0.021


SAPS Classification, Start!

Running experiment 1/10...
q_hat = 1.028223049640656
Average Prediction Set Size After APS in runs 1: 2.87788
Average Coverage Rate in runs 1: 89.77%

Running experiment 2/10...
q_hat = 1.0320658087730414
Average Prediction Set Size After APS in runs 2: 2.88432
Average Coverage Rate in runs 2: 89.87%

Running experiment 3/10...
q_hat = 1.036527860164644
Average Prediction Set Size After APS in runs 3: 2.93344
Average Coverage Rate in runs 3: 90.14%

Running experiment 4/10...
q_hat = 1.0261310458183295
Average Prediction Set Size After APS in runs 4: 2.872
Average Coverage Rate in runs 4: 89.78%

Running experiment 5/10...
q_hat = 1.0209815979003907
Average Prediction Set Size After APS in runs 5: 2.82352
Average Coverage Rate in runs 5: 89.62%

Running experiment 6/10...
q_hat = 1.0354617595672608
Average Prediction Set Size After APS in runs

# $\alpha$=0.2

In [2]:
# SAPS evaluation at alpha = 0.2, reusing the temperature-scaled model and the
# dataset built in the first cell; ten runs are executed.
saps_test(
    model,
    dataset,
    device,
    num_runs=10,
    alpha=0.2,
    lambda_=0.1,
)



SAPS Classification, Start!

Running experiment 1/10...
q_hat = 0.8143581986427307
Average Prediction Set Size After APS in runs 1: 2.41864
Average Coverage Rate in runs 1: 80.01%

Running experiment 2/10...
q_hat = 0.8116831660270691
Average Prediction Set Size After APS in runs 2: 2.378
Average Coverage Rate in runs 2: 79.52%

Running experiment 3/10...
q_hat = 0.8213530778884889
Average Prediction Set Size After APS in runs 3: 2.45072
Average Coverage Rate in runs 3: 80.90%

Running experiment 4/10...
q_hat = 0.8113918662071229
Average Prediction Set Size After APS in runs 4: 2.40624
Average Coverage Rate in runs 4: 79.76%

Running experiment 5/10...
q_hat = 0.8135486006736756
Average Prediction Set Size After APS in runs 5: 2.40684
Average Coverage Rate in runs 5: 79.79%

Running experiment 6/10...
q_hat = 0.8165716886520386
Average Prediction Set Size After APS in runs 6: 2.4184
Average Coverage Rate in runs 6: 80.08%

Running experiment 7/10...
q_hat = 0.8092804074287415
Averag

# $\alpha$=0.05

In [3]:
# SAPS evaluation at alpha = 0.05, reusing the temperature-scaled model and the
# dataset built in the first cell; ten runs are executed.
saps_test(
    model,
    dataset,
    device,
    num_runs=10,
    alpha=0.05,
    lambda_=0.05,
)



SAPS Classification, Start!

Running experiment 1/10...
q_hat = 1.0314024448394774
Average Prediction Set Size After APS in runs 1: 6.70004
Average Coverage Rate in runs 1: 94.83%

Running experiment 2/10...
q_hat = 1.0423207461833952
Average Prediction Set Size After APS in runs 2: 6.86544
Average Coverage Rate in runs 2: 95.10%

Running experiment 3/10...
q_hat = 1.0362446844577788
Average Prediction Set Size After APS in runs 3: 6.78708
Average Coverage Rate in runs 3: 95.04%

Running experiment 4/10...
q_hat = 1.0310059666633604
Average Prediction Set Size After APS in runs 4: 6.70648
Average Coverage Rate in runs 4: 94.88%

Running experiment 5/10...
q_hat = 1.0271119594573974
Average Prediction Set Size After APS in runs 5: 6.6042
Average Coverage Rate in runs 5: 94.76%

Running experiment 6/10...
q_hat = 1.0318053424358367
Average Prediction Set Size After APS in runs 6: 6.70572
Average Coverage Rate in runs 6: 94.87%

Running experiment 7/10...
q_hat = 1.0395117700099945
Aver

## Result  
$\alpha$=0.1  
From the test above, the following results can be collected:
- Final Average Prediction Set Size: **2.89**
- Final Average Coverage: **89.87%**  

$\alpha$=0.2  
From the test above, the following results can be collected:
- Final Average Prediction Set Size: **2.41**
- Final Average Coverage: **79.98%**  

$\alpha$=0.05  
From the test above, the following results can be collected:
- Final Average Prediction Set Size: **6.77**
- Final Average Coverage: **94.96%**