# 1. load and split image set

In [1]:
from torchvision.datasets import ImageFolder              # for loading images from ImageNet
from torch.utils.data import DataLoader, random_split

def split_data_set(dataset, random_seed):
    if random_seed is not None:
        torch.manual_seed(random_seed)  # set input as random seed

    # split image set ---> half for calibration data set, half for test data set
    dataset_length = len(dataset)
    print(f"Samples amount: {dataset_length}")
    calib_length = dataset_length // 2               
    test_length = dataset_length - calib_length      

    calib_dataset, test_dataset = random_split(dataset, [calib_length, test_length])
    return calib_dataset, test_dataset

# 2. Calculate Conformal Score

In [2]:
import numpy as np

# conformal function s(x,y)
def conformal_scores(model, dataloader, alpha=0.1):
    scores = []  # conformal scores of image sets
    labels = []  # true label sets 
    with torch.no_grad():
        for images, true_labels in dataloader:
            images, true_labels = images.to(device), true_labels.to(device)
            # logistic value
            outputs = model(images)
            # logistic value -> softmax
            # dim=1 : convert logistic values for all the classes of the example to the softmax 
            softmaxs = torch.softmax(outputs, dim=1)
            
            for softmax, true_label in zip(softmaxs, true_labels):
                # descending sort softmax
                sorted_softmax, sorted_index = torch.sort(softmax, descending=True)
                
                # get the position of the true label in the sorted softmax
                true_label_position = (sorted_index == true_label).nonzero(as_tuple=True)[0].item()
                # independent random variable u ~ Uniform(0, 1)
                u = np.random.uniform(0, 1)
                # cumulate sorted softmax
                cumulative_softmax = torch.cumsum(sorted_softmax, dim=0)  # dim=0 -> cumulate by raw direction

                if true_label_position == 0:
                    conformal_score = u * sorted_softmax[true_label_position].item()  # first softmax is true label
                else:
                    conformal_score = cumulative_softmax[true_label_position - 1].item() + u * sorted_softmax[true_label_position].item()
                    
                scores.append(conformal_score)
                labels.append(true_label.item())
    return np.array(scores), np.array(labels)

## 3. Construct APS

In [3]:
def aps_classification(model, dataloader, q_hat):
    aps = []         # probability set
    aps_labels = []  # label set indicated to the probability set
    labels = []      # true label
    with torch.no_grad():
        for images, true_labels in dataloader:
            images, true_labels = images.to(device), true_labels.to(device)
            outputs = model(images)
            softmaxs = torch.softmax(outputs, dim=1)
            for softmax, true_label in zip(softmaxs, true_labels):
                sorted_softmax, sorted_index = torch.sort(softmax, descending=True)
                cumulative_softmax = torch.cumsum(sorted_softmax, dim=0)

                # cumulate until meet q_hat and then cut off
                cutoff_index = torch.searchsorted(cumulative_softmax, q_hat, right=True)
                cutoff_index = max(cutoff_index.item(), 1) # make sure cutoff_index >= 1

                # Select all the probabilities and corresponding labels until cut-off index
                prediction_set_prob = sorted_softmax[:cutoff_index].tolist()
                prediction_set_labels = sorted_index[:cutoff_index].tolist()

                aps.append(prediction_set_prob)
                aps_labels.append(prediction_set_labels)
                labels.append(true_label.item())
    return aps, aps_labels, labels

## 4. Evaluate Perdiction Set

In [4]:
def eval_aps(aps_labels,  true_labels):
    total_set_size = 0
    coveraged = 0
    for aps_label, true_label in zip(aps_labels, true_labels):
        # cumulate total set size
        total_set_size += len(aps_label)
        # cumulate the predictions sets if it contains true label
        if true_label in aps_label:
            coveraged += 1

    # calculate average values
    samples_amount = len(true_labels)
    average_set_size = total_set_size / samples_amount
    average_coverage = coveraged / samples_amount
    print(f"Total set size: {total_set_size}")
    print(f"Total coverage sets: {coveraged}")
    print(f"Total samples amount: {samples_amount}")
    return average_set_size, average_coverage

## 5. Construct APS repeatedly

In [7]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms               # include image preprocess tools
from torchvision.datasets import CIFAR100        # for loading images from Pytorch CIFAR
from torch.utils.data import DataLoader
import timm
import detectors

# check GPU status
print("Is CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load fine-tuned model
model = timm.create_model("resnet50_cifar100", pretrained=True)
model.to(device)

# The number of times the experiment is going to be repeated
num_runs = 10

# error rate
alpha = 0.1

# reprocess the images from CIFAR
data_transform = transforms.Compose([
    transforms.ToTensor(),          # transfer to tensor
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762))  # normalize
])
# load images from CIFAR10
dataset = CIFAR100(root="./data", train=False, download=True, transform=data_transform)

# contruct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("APS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # splite dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # calculate q_hat
    calib_scores, _ = conformal_scores(model, calib_loader, alpha)
    q_hat = np.quantile(calib_scores, 1 - 0.1)  # calculate 1-alpha quantile
    print(f"q_hat = {q_hat}")

    # construct APS
    aps, aps_labels, true_labels = aps_classification(model, test_loader, q_hat)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Ratein runs {i+1}: {avg_coverage}\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)

print(f"Final Average Prediction Set Size: {final_avg_set_size}")
print(f"Final Average Coverage: {final_avg_coverage}")

Is CUDA available: True
Device count: 1
Device name: NVIDIA GeForce RTX 3060 Ti
Files already downloaded and verified
APS Classification, Start!

Running experiment 1/10...
Samples amount: 10000
q_hat = 0.9444153879825173
Total set size: 46392
Total coverage sets: 4595
Total samples amount: 5000
Average Prediction Set Size After APS in runs 1: 9.2784
Average Coverage Ratein runs 1: 0.919

Running experiment 2/10...
Samples amount: 10000
q_hat = 0.9396578859714947
Total set size: 44113
Total coverage sets: 4545
Total samples amount: 5000
Average Prediction Set Size After APS in runs 2: 8.8226
Average Coverage Ratein runs 2: 0.909

Running experiment 3/10...
Samples amount: 10000
q_hat = 0.937950385826204
Total set size: 41634
Total coverage sets: 4547
Total samples amount: 5000
Average Prediction Set Size After APS in runs 3: 8.3268
Average Coverage Ratein runs 3: 0.9094

Running experiment 4/10...
Samples amount: 10000
q_hat = 0.9347145246086751
Total set size: 39262
Total coverage set

# Result

- Final Average **Prediction Set Size： 8.76 / 100**
- Final Average **Coverage: 90.97% ($\alpha$=0.1)**