# 1. Split Dataset

In [1]:
import torch
# Release cached, currently unused GPU memory held by PyTorch's CUDA allocator;
# presumably meant to clear leftovers when the notebook is re-run in the same kernel.
torch.cuda.empty_cache()

In [2]:
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader,  random_split

def split_data_set(dataset, random_seed):
    """Randomly split ``dataset`` into a calibration half and a test half.

    Args:
        dataset: Any sized dataset accepted by ``torch.utils.data.random_split``.
        random_seed: Integer seed that makes the split reproducible, or
            ``None`` to draw the split from PyTorch's global RNG.

    Returns:
        tuple: ``(calib_dataset, test_dataset)``. The calibration subset holds
        ``len(dataset) // 2`` samples and the test subset holds the remainder,
        so an odd-sized dataset gives the extra sample to the test subset.
    """
    # split image set ---> half for calibration data set, half for test data set
    dataset_length = len(dataset)
    print(f"Samples amount: {dataset_length}")
    calib_length = dataset_length // 2
    test_length = dataset_length - calib_length
    lengths = [calib_length, test_length]

    if random_seed is None:
        calib_dataset, test_dataset = random_split(dataset, lengths)
    else:
        # A private generator keeps the split reproducible without reseeding
        # the process-wide torch RNG as a side effect of calling this helper.
        split_generator = torch.Generator().manual_seed(random_seed)
        calib_dataset, test_dataset = random_split(
            dataset, lengths, generator=split_generator
        )
    return calib_dataset, test_dataset

# 2. Conformal Score Function

Add an independent random variable $u \sim \mathrm{Uniform}(0, 1)$ to the score so that the true label's own probability is only partially counted; this randomization lowers $\hat{q}$ (`q_hat`):

- Nonconformity function without $u$:
  
  $S_{\text{APS}}(x, y; \hat{\pi}) = \sum_{i=1}^{o(y, \hat{\pi}(x))} \hat{\pi}_i(x)$

- Nonconformity function with $u$:

  $S_{\text{APS}}(x, y, u; \hat{\pi}) = \sum_{i=1}^{o(y, \hat{\pi}(x)) - 1} \hat{\pi}_i(x) + u \cdot \hat{\pi}_{o(y, \hat{\pi}(x))}(x)$ ,

  where $o(y, \hat{\pi}(x))$ denotes the position (rank) of the true label $y$ among the softmax probabilities sorted in descending order, and $\hat{\pi}_i(x)$ is the $i$-th largest probability.

In [7]:
import numpy as np

# conformal function s(x,y)
def conformal_scores(model, dataloader, alpha=0.1, rng=None):
    """Compute randomized APS non-conformity scores for every sample.

    For each sample the softmax probabilities are sorted in descending order;
    the score is the probability mass ranked strictly above the true label
    plus ``u`` times the true label's own probability, with ``u`` drawn
    uniformly from [0, 1).

    Args:
        model: Callable mapping a batch of inputs to logits of shape
            ``(batch, num_classes)``. If it exposes ``parameters()``, inputs
            are moved to the device of its first parameter.
        dataloader: Iterable yielding ``(images, true_labels)`` batches.
        alpha: Unused; kept so existing call sites keep working.
        rng: Optional object with a NumPy-style ``uniform(low, high, size)``
            method (e.g. ``np.random.default_rng(seed)``). Defaults to NumPy's
            global RNG.

    Returns:
        tuple: ``(scores, labels)`` as 1-D NumPy arrays with one entry per
        sample, in iteration order. Both are empty if ``dataloader`` is empty.

    Raises:
        ValueError: If a true label is not a valid class index for the model
            output.
    """
    if rng is None:
        rng = np.random  # module-level global RNG

    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    target_device = None if first_param is None else first_param.device

    score_chunks = []  # per-batch conformal scores
    label_chunks = []  # per-batch true labels
    with torch.no_grad():
        for images, true_labels in dataloader:
            if target_device is not None:
                images = images.to(target_device)
            # logits -> softmax over the class dimension
            outputs = model(images)
            true_labels = true_labels.to(outputs.device)
            softmaxs = torch.softmax(outputs, dim=1)

            # sort every row in descending order and accumulate along the classes
            sorted_softmax, sorted_index = torch.sort(softmaxs, dim=1, descending=True)
            cumulative_softmax = torch.cumsum(sorted_softmax, dim=1)

            # rank of the true label inside each sorted row
            matches = sorted_index == true_labels.unsqueeze(1)
            if not bool(matches.any(dim=1).all()):
                raise ValueError("true label is not a valid class index for the model output")
            positions = matches.int().argmax(dim=1)
            position_column = positions.unsqueeze(1)

            true_prob = sorted_softmax.gather(1, position_column).squeeze(1)
            # mass ranked strictly above the true label (zero when it is ranked first)
            previous_mass = cumulative_softmax.gather(1, (position_column - 1).clamp(min=0)).squeeze(1)
            previous_mass = previous_mass.masked_fill(positions == 0, 0.0)

            # one independent u ~ Uniform(0, 1) per sample
            u = rng.uniform(0, 1, size=true_prob.shape[0])
            # combine in float64, mirroring the original Python-float arithmetic
            batch_scores = previous_mass.double().cpu().numpy() + u * true_prob.double().cpu().numpy()

            score_chunks.append(batch_scores)
            label_chunks.append(true_labels.cpu().numpy())

    if not score_chunks:
        return np.array([]), np.array([])
    return np.concatenate(score_chunks), np.concatenate(label_chunks)

# 3. Construct APS

In [4]:
def aps_classification(model, dataloader, q_hat):
    """Build an APS prediction set for every sample in ``dataloader``.

    For each sample the softmax probabilities are sorted in descending order
    and the set keeps the leading classes whose cumulative probability is
    ``<= q_hat``; at least the top-1 class is always kept.

    NOTE(review): ``conformal_scores`` in this notebook randomizes the
    calibration scores with ``u``, whereas this set construction is
    deterministic - confirm the asymmetry is intended.

    Args:
        model: Callable mapping a batch of inputs to logits of shape
            ``(batch, num_classes)``. If it exposes ``parameters()``, inputs
            are moved to the device of its first parameter.
        dataloader: Iterable yielding ``(images, true_labels)`` batches.
        q_hat: Threshold on the cumulative sorted softmax mass (anything
            convertible with ``float()``).

    Returns:
        tuple: ``(aps, aps_labels, labels)`` where ``aps[i]`` is the list of
        probabilities in sample ``i``'s set, ``aps_labels[i]`` the matching
        class indices, and ``labels[i]`` the true label.
    """
    threshold = float(q_hat)

    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    target_device = None if first_param is None else first_param.device

    aps = []         # probability set
    aps_labels = []  # label set matching the probability set
    labels = []      # true label
    with torch.no_grad():
        for images, true_labels in dataloader:
            if target_device is not None:
                images = images.to(target_device)
            outputs = model(images)
            softmaxs = torch.softmax(outputs, dim=1)

            # sort and accumulate the whole batch at once
            sorted_softmax, sorted_index = torch.sort(softmaxs, dim=1, descending=True)
            cumulative_softmax = torch.cumsum(sorted_softmax, dim=1)

            # per row: number of leading classes whose cumulative mass is <= q_hat
            thresholds = cumulative_softmax.new_full((cumulative_softmax.shape[0], 1), threshold)
            cutoffs = torch.searchsorted(cumulative_softmax, thresholds, right=True).squeeze(1)
            cutoffs = cutoffs.clamp(min=1).tolist()  # never return an empty set

            sorted_softmax = sorted_softmax.cpu()
            sorted_index = sorted_index.cpu()
            for row, cutoff_index in enumerate(cutoffs):
                # keep probabilities and labels up to the cut-off index
                aps.append(sorted_softmax[row, :cutoff_index].tolist())
                aps_labels.append(sorted_index[row, :cutoff_index].tolist())
            labels.extend(true_labels.tolist())
    return aps, aps_labels, labels

# 4. Evaluate Prediction Set

In [5]:
def eval_aps(aps_labels, true_labels):
    """Summarise prediction sets by average size and empirical coverage.

    Args:
        aps_labels: Sequence of prediction sets, each a collection of labels.
        true_labels: Sequence of true labels aligned with ``aps_labels``.

    Returns:
        tuple: ``(average_set_size, average_coverage)``, both divided by
        ``len(true_labels)``.
    """
    paired = list(zip(aps_labels, true_labels))

    # total number of labels across all prediction sets
    total_set_size = sum(len(candidates) for candidates, _ in paired)
    # number of prediction sets that contain their true label
    coveraged = sum(1 for candidates, target in paired if target in candidates)

    # averages over the number of samples
    samples_amount = len(true_labels)
    average_set_size = total_set_size / samples_amount
    average_coverage = coveraged / samples_amount

    print(f"Total set size: {total_set_size}")
    print(f"Total coverage sets: {coveraged}")
    print(f"Total samples amount: {samples_amount}")
    return average_set_size, average_coverage

# 5. Execute Prediction Repeatedly

In [6]:
import os

import torch
import torch.nn as nn
import torchvision.models as models

# check GPU status
print("Is CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

# load pre-trained model InceptionV3 and set mode
# NOTE(review): `pretrained=True` is deprecated in newer torchvision releases in
# favour of the `weights=` argument; kept as-is for the installed version.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.inception_v3(pretrained=True).to(device)
model.eval()

# The number of times the experiment is going to be repeated
num_runs = 10

# error rate
alpha = 0.1


#  Reprocess: Center Crop and then resize to 299*299
# NOTE(review): CenterCrop(299) already yields a 299x299 image, so the Resize
# that follows leaves the size unchanged. torchvision's reference Inception-v3
# preprocessing resizes first and center-crops afterwards - confirm this order
# is intended.
data_transform = transforms.Compose([
    transforms.CenterCrop(299),
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Dataset location: the IMAGENET_VAL_DIR environment variable overrides the
# original machine-specific default.
sorted_val_path = os.environ.get(
    "IMAGENET_VAL_DIR",
    "D:\\Download\\ImageNet-1K\\Validation_Set\\sorted_ImageNet_val",
)
dataset = ImageFolder(root=sorted_val_path, transform=data_transform)


# construct and evaluate repeatedly
all_avg_set_sizes = []
all_avg_coverages = []
print("APS Classification, Start!\n")
for i in range(num_runs):
    print(f"Running experiment {i+1}/{num_runs}...")

    # The calibration scores draw u from NumPy's global RNG; seed it per run so
    # q_hat is reproducible, matching the per-run seed used for the split.
    np.random.seed(i)

    # split dataset
    calib_dataset, test_dataset = split_data_set(dataset, random_seed=i)

    # load data set respectively
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False, num_workers=1)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=1)

    # calculate q_hat: finite-sample corrected (1 - alpha) quantile of the
    # calibration scores; clamp the level to 1.0 because np.quantile rejects
    # values above 1 (which happens for very small calibration sets).
    calib_scores, _ = conformal_scores(model, calib_loader, alpha)
    n_calib = len(calib_scores)
    quantile_level = min(1.0, (n_calib + 1) / n_calib * (1 - alpha))
    q_hat = np.quantile(calib_scores, quantile_level, method='higher')
    print(f"q_hat = {q_hat}")

    # construct APS
    _, aps_labels, true_labels = aps_classification(model, test_loader, q_hat)

    # evaluate APS
    avg_set_size, avg_coverage = eval_aps(aps_labels, true_labels)
    print(f"Average Prediction Set Size After APS in runs {i+1}: {avg_set_size}")
    print(f"Average Coverage Rate in runs {i+1}: {100 * avg_coverage:.2f}%\n")

    # record current result
    all_avg_set_sizes.append(avg_set_size)
    all_avg_coverages.append(avg_coverage)

# calculate the final average result
final_avg_set_size = np.mean(all_avg_set_sizes)
final_avg_coverage = np.mean(all_avg_coverages)

print(f"Final Average Prediction Set Size: {final_avg_set_size}")
print(f"Final Average Coverage: {final_avg_coverage}")


Is CUDA available: True
Device count: 1
Device name: NVIDIA GeForce RTX 3060 Ti
APS Classification, Start!

Running experiment 1/10...
Samples amount: 50000
q_hat = 0.8704919669572897
Total set size: 1322087
Total coverage sets: 22945
Total samples amount: 25000
Average Prediction Set Size After APS in runs 1: 52.88348
Average Coverage Rate in runs 1: 91.78%

Running experiment 2/10...
Samples amount: 50000
q_hat = 0.8719408847124867
Total set size: 1315308
Total coverage sets: 22913
Total samples amount: 25000
Average Prediction Set Size After APS in runs 2: 52.61232
Average Coverage Rate in runs 2: 91.65%

Running experiment 3/10...
Samples amount: 50000
q_hat = 0.8729933277383137
Total set size: 1336983
Total coverage sets: 23014
Total samples amount: 25000
Average Prediction Set Size After APS in runs 3: 53.47932
Average Coverage Rate in runs 3: 92.06%

Running experiment 4/10...
Samples amount: 50000
q_hat = 0.8738217781444307
Total set size: 1395387
Total coverage sets: 22990
Tot

# 6. Result

From the experiments above, the following results were obtained:
- Final Average Prediction Set Size: 53.82 / 1000
- Final Average Coverage: 91.87% ($\alpha$=0.1)