In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torchvision
import torchvision.transforms as transforms

# preprocessing for test images: tensor conversion followed by per-channel normalization
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            (0.4914, 0.4822, 0.4465),
            (0.2023, 0.1994, 0.2010),
        ),
    ]
)

# CIFAR-10 test split, stored under ./data (download=True fetches it when missing)
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)

# CIFAR-10 class names, indexed by integer label
class_names = "airplane automobile bird cat deer dog frog horse ship truck".split()

# define a function to unnormalize and convert the image back to a NumPy array
def unnormalize_image(image_tensor, mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)):
    """Undo per-channel normalization and return an (H, W, C) NumPy image in [0, 1].

    Args:
        image_tensor: normalized image tensor laid out as (C, H, W). It may live on
            any device and may require grad; it is detached and copied to the CPU
            before conversion.
        mean: per-channel means that were subtracted during normalization.
        std: per-channel standard deviations that were divided out.

    Returns:
        A float NumPy array of shape (H, W, C), clipped to [0, 1].
    """
    mean = np.asarray(mean, dtype=np.float64)
    std = np.asarray(std, dtype=np.float64)
    # Tensor.numpy() only works for CPU tensors that do not require grad,
    # so detach and move to the CPU first.
    image = image_tensor.detach().cpu().numpy().transpose((1, 2, 0))  # (C, H, W) -> (H, W, C)
    image = std * image + mean  # invert (x - mean) / std channel-wise
    return np.clip(image, 0, 1)  # rounding can push values slightly outside [0, 1]

# show a single image titled with its class name
def plot_image(image_tensor, label):
    """Display one normalized image tensor with its class name as the title.

    Args:
        image_tensor: normalized image tensor, passed to `unnormalize_image`.
        label: integer class id used to index `class_names`.
    """
    fig, ax = plt.subplots(figsize=(6, 3))

    # convert back to a displayable (H, W, C) array before drawing
    ax.imshow(unnormalize_image(image_tensor))
    ax.axis('off')
    ax.set_title(class_names[label])

    plt.show()

# display the first two test images with their labels
for i in (0, 1):
    plot_image(*testset[i])

In [None]:
import torch

# use the GPU when torch reports one as available, otherwise run on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
from models import *
import torch.backends.cudnn as cudnn

# model
net = EfficientNetB0()

net = net.to(device)
if device == 'cuda':
    # wrap for multi-GPU execution and enable cuDNN benchmark mode
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

weights_path = "weights/EfficientNetB0_0.1_100_512_SGD_1" 

# load weights into the model
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved on a GPU can still be loaded on a CPU-only machine.
state_dict = torch.load(weights_path, map_location=device)

# DataParallel stores the wrapped network under `.module`, which prefixes every
# state_dict key with "module.". Reconcile the checkpoint keys with how `net` is
# wrapped in this session; when they already agree this is a no-op.
# NOTE(review): assumes the checkpoint is a plain state_dict (as the original
# load_state_dict call implies) — confirm how it was saved.
net_is_wrapped = isinstance(net, torch.nn.DataParallel)
keys_are_prefixed = len(state_dict) > 0 and all(
    key.startswith("module.") for key in state_dict
)
if keys_are_prefixed and not net_is_wrapped:
    state_dict = {key[len("module."):]: value for key, value in state_dict.items()}
elif net_is_wrapped and not keys_are_prefixed:
    state_dict = {f"module.{key}": value for key, value in state_dict.items()}

net.load_state_dict(state_dict)
print("Model weights loaded successfully!")

In [None]:
# Loss used as the per-sample score throughout the notebook.
# Referenced as `torch.nn` because the `nn` alias is only bound by an import in a
# later cell; on a fresh kernel run top-to-bottom, a bare `nn` here would only
# resolve if the star import from `models` happened to expose it.
criterion = torch.nn.CrossEntropyLoss()
# put the network in evaluation mode before computing any scores
net.eval()

In [None]:
from torch.utils.data import Subset

# sizes of the two disjoint parts of the test set
calibration_size = 100
total_test_size = len(testset)  # 10000
eval_size = total_test_size - calibration_size

# seeded permutation of all test indices, so the split is reproducible
seed = 42
generator = torch.Generator()
generator.manual_seed(seed)
indices = torch.randperm(total_test_size, generator=generator)

# first `calibration_size` shuffled indices calibrate, the remainder evaluate
calibration_indices, eval_indices = indices[:calibration_size], indices[calibration_size:]

calibration_subset = Subset(testset, calibration_indices)
eval_subset = Subset(testset, eval_indices)

print(len(calibration_subset))
print(len(eval_subset))


In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# configuration
# weight on the predicted alpha in the training loss (loss = size + lambda_reg * alpha)
lambda_reg = 5
# sharpness of the sigmoid that replaces the hard set-membership indicator in smooth_size
k = 100 # sigmoid approx parameter
# learning rate handed to the Adam optimizer
lr = 1e-3
# number of candidate labels scored per sample
num_classes = 10
# mini-batch size of the training DataLoader
batch_size = 64
# number of passes over the training set in each run
num_epochs = 2000 

# neural network definition
class AlphaNet(nn.Module):
    """Two-layer MLP that maps a feature vector to a single value in (0, 1).

    Architecture: Linear(input_dim, 32) -> ReLU -> Linear(32, 1) -> Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_dim = 32
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        ]
        # the attribute name and layer order fix the state_dict keys
        # (model.0.*, model.2.*), which saved checkpoints rely on
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output with the trailing size-1 dimension removed."""
        out = self.model(x)
        return out.squeeze(-1)

# differentiable surrogate for the prediction-set size
def smooth_size(scores, alpha, T, n, k):
    """Soft count of the labels whose ratio stays below 1 / alpha.

    For each entry of `scores` the ratio (n + 1) * s / (T + s) is compared with
    1 / alpha; the hard comparison is replaced by a sigmoid of steepness `k`, and
    the soft indicators are summed.

    Args:
        scores: tensor of per-label scores.
        alpha: level (tensor or float); its reciprocal is the threshold.
        T: sum of calibration scores.
        n: number of calibration points entering the ratio.
        k: sigmoid steepness; larger values approach the hard indicator.

    Returns:
        A scalar tensor holding the smoothed set size.
    """
    ratio = (n + 1) * scores / (T + scores)
    inv_alpha = 1.0 / alpha
    # sigmoid is close to 1 where ratio < 1/alpha and close to 0 where ratio > 1/alpha
    return torch.sigmoid(-k * (ratio - inv_alpha)).sum()

# AlphaNet input width: one score per class followed by the calibration-score sum
input_dim = num_classes + 1 # +1 corresponds to the sum of calibration scores

In [None]:
### run this cell only if you want to rebuild a training leave-one-out dataset
### otherwise just run the next cell to load one

print("Building training dataset (leave-one-out calibration)...")

num_calib = len(calibration_subset)

# One forward pass per calibration sample is enough: the per-class scores of a
# sample do not depend on which index is held out, and its true-label score is
# just the entry of that score vector at its label. Rerunning the network on all
# other samples for every held-out index would cost n * (n - 1) forward passes.
per_class_scores = []   # per sample: list of num_classes floats
true_label_scores = []  # per sample: score of its ground-truth label
with torch.no_grad():
    for idx in range(num_calib):
        x, y = calibration_subset[idx]
        logits = net(x.unsqueeze(0).to(device))  # batch of one
        class_scores = []
        for cls in range(num_classes):
            label_tensor = torch.tensor([cls], dtype=torch.long).to(device)
            class_scores.append(criterion(logits, label_tensor).item())
        per_class_scores.append(class_scores)
        true_label_scores.append(class_scores[int(y)])

train_inputs = []
train_sizes = []

# sample i plays the test point; every other sample forms its calibration set
for i in range(num_calib):
    # sum of the true-label scores of all samples except i, accumulated in the
    # original order so the value matches a from-scratch leave-one-out sum
    T = torch.tensor(
        sum(true_label_scores[:i] + true_label_scores[i + 1:]), dtype=torch.float32
    ).to(device)

    # build feature vector
    scores_tensor = torch.tensor(per_class_scores[i], dtype=torch.float32).to(device)
    input_feat = torch.cat([scores_tensor, T.view(1)])  # (num_classes + 1,)

    train_inputs.append(input_feat)
    train_sizes.append(scores_tensor)

print("Finished dataset generation.")

# convert to TensorDataset 
X_train = torch.stack(train_inputs)          # (N, num_classes + 1)
S_train = torch.stack(train_sizes)           # (N, num_classes)

train_dataset = TensorDataset(X_train, S_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# save training dataset
# stored as CPU tensors so the file can be loaded without a GPU; the training
# loop moves every batch to `device` itself
torch.save({
    'X_train': X_train.cpu(),
    'S_train': S_train.cpu()
}, 'loo_data.pt')

In [None]:
### uncomment below to load a previously saved leave-one-out training dataset (loo_data.pt)

# data = torch.load('loo_data.pt')

# X_train = data['X_train']
# S_train = data['S_train']

# train_dataset = TensorDataset(X_train, S_train)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# training neural network

# per-run histories; each entry is a list with one value per epoch
all_losses, all_sizes, all_alphas = [], [], []

for run in range(5): # average over 5 runs
    print(f"Run {run+1}/5")
    # reseed so every run starts from its own reproducible state
    torch.manual_seed(run)
    np.random.seed(run)

    history = {"loss": [], "size": [], "alpha": []}

    alpha_net = AlphaNet(input_dim).to(device)
    optimizer = torch.optim.Adam(alpha_net.parameters(), lr=lr)

    for epoch in range(num_epochs):
        alpha_net.train()
        loss_sum = 0
        size_sum = 0
        alpha_sum = 0

        for x_batch, s_batch in train_loader:
            x_batch, s_batch = x_batch.to(device), s_batch.to(device)
            # the last feature column holds the calibration-score sum
            T_batch = x_batch[:, -1]
            alpha_batch = alpha_net(x_batch)

            # smoothed set size of every sample under its own predicted alpha
            sizes = torch.stack([
                smooth_size(
                    scores=s_batch[row],
                    alpha=alpha_batch[row],
                    T=T_batch[row],
                    n=calibration_size-1,# /!\ n-1 here
                    k=k
                )
                for row in range(x_batch.size(0))
            ])

            # trade set size against the predicted miscoverage level
            loss = (sizes + lambda_reg * alpha_batch).mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            size_sum += sizes.mean().item()
            alpha_sum += alpha_batch.mean().item()

        # epoch statistics are averages of the per-batch means
        num_batches = len(train_loader)
        avg_loss = loss_sum / num_batches
        avg_size = size_sum / num_batches
        avg_alpha = alpha_sum / num_batches

        history["loss"].append(avg_loss)
        history["size"].append(avg_size)
        history["alpha"].append(avg_alpha)

        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | "
              f"Mean Size: {avg_size:.2f} | Mean Alpha: {avg_alpha:.4f}")

    all_losses.append(history["loss"])
    all_sizes.append(history["size"])
    all_alphas.append(history["alpha"])

# torch.save(alpha_net.state_dict(), f"output/alpha_net_{lambda_reg}.pth")

# np.save(f"output/all_losses_{lambda_reg}.npy", np.array(all_losses))
# np.save(f"output/all_sizes_{lambda_reg}.npy", np.array(all_sizes))
# np.save(f"output/all_alphas_{lambda_reg}.npy", np.array(all_alphas))

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from tueplots import bundles

def _load_metric(path):
    """Read one saved .npy metric file and return its contents as an ndarray."""
    return np.array(np.load(path))

# lambda = 5
all_losses_5 = _load_metric("output/all_losses_5.npy")
all_sizes_5 = _load_metric("output/all_sizes_5.npy")
all_alphas_5 = _load_metric("output/all_alphas_5.npy")

# lambda = 10
all_losses_10 = _load_metric("output/all_losses_10.npy")
all_sizes_10 = _load_metric("output/all_sizes_10.npy")
all_alphas_10 = _load_metric("output/all_alphas_10.npy")

# lambda = 50
all_losses_50 = _load_metric("output/all_losses_50.npy")
all_sizes_50 = _load_metric("output/all_sizes_50.npy")
all_alphas_50 = _load_metric("output/all_alphas_50.npy")

# moving-average helpers used to smooth the training curves
def moving_average(arr, window_size):
    """Return the running mean of `arr` over windows of `window_size` samples.

    Only fully covered windows are kept, so the result has
    len(arr) - window_size + 1 entries.
    """
    weights = np.ones(window_size)
    weights /= window_size
    return np.convolve(arr, weights, mode='valid')

def smooth_all(arr_2d, window):
    """Smooth every row of `arr_2d` and aggregate across rows.

    Args:
        arr_2d: iterable of 1-D sequences, one per run.
        window: moving-average window length.

    Returns:
        (mean, std): NaN-ignoring mean and standard deviation over the runs,
        taken position-wise on the smoothed curves.
    """
    smoothed_runs = np.array([moving_average(run, window) for run in arr_2d])
    return np.nanmean(smoothed_runs, axis=0), np.nanstd(smoothed_runs, axis=0)

window_size = 50

# smoothed across-run mean and std of each metric, one (mean, std) pair per lambda
(mean_loss_5, std_loss_5), (mean_loss_10, std_loss_10), (mean_loss_50, std_loss_50) = [
    smooth_all(runs, window_size)
    for runs in (all_losses_5, all_losses_10, all_losses_50)
]

(mean_size_5, std_size_5), (mean_size_10, std_size_10), (mean_size_50, std_size_50) = [
    smooth_all(runs, window_size)
    for runs in (all_sizes_5, all_sizes_10, all_sizes_50)
]

(mean_alpha_5, std_alpha_5), (mean_alpha_10, std_alpha_10), (mean_alpha_50, std_alpha_50) = [
    smooth_all(runs, window_size)
    for runs in (all_alphas_5, all_alphas_10, all_alphas_50)
]

# x-axis for the smoothed curves: smoothing drops window_size - 1 points, so keep
# only the leading epochs (each smoothed value sits at the first epoch of its window)
epochs = np.arange(1, all_losses_5.shape[1] + 1)
epochs_smoothed = epochs[:mean_loss_5.shape[0]]

# style setup: ICML 2024 bundle from tueplots, then larger fonts and thicker lines
font_and_line_overrides = {
    "axes.labelsize": 18,
    "axes.titlesize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 16,
    "lines.linewidth": 2,
    "axes.linewidth": 2,
}
plt.rcParams.update(bundles.icml2024())
plt.rcParams.update(font_and_line_overrides)

fig, axs = plt.subplots(1, 3, figsize=(14, 4))

# colors for lambda = 5, 10, 50
color_5 = "#0077bb"
color_10 = "#cc3311"
color_50 = "#44aa99" 

# legend label and color of each lambda, in drawing order
curve_styles = [
    (r'$\lambda=5$', color_5),
    (r'$\lambda=10$', color_10),
    (r'$\lambda=50$', color_50),
]

# one entry per panel: ((mean, std) per lambda, y-limits, y-label, title)
panels = [
    # plot 1: loss
    (
        [(mean_loss_5, std_loss_5), (mean_loss_10, std_loss_10), (mean_loss_50, std_loss_50)],
        (0, 8), "Loss", "Training Loss",
    ),
    # plot 2: mean size
    (
        [(mean_size_5, std_size_5), (mean_size_10, std_size_10), (mean_size_50, std_size_50)],
        (0, 5), "Mean Size", "Mean Size per Epoch",
    ),
    # plot 3: mean miscoverage
    (
        [(mean_alpha_5, std_alpha_5), (mean_alpha_10, std_alpha_10), (mean_alpha_50, std_alpha_50)],
        (0, 0.25), r"Mean $\tilde\alpha$", r"Mean $\tilde\alpha$ per Epoch",
    ),
]

for ax, (curves, y_limits, y_label, panel_title) in zip(axs, panels):
    for (curve_mean, curve_std), (curve_label, curve_color) in zip(curves, curve_styles):
        # mean curve with a +/- one standard deviation band across runs
        ax.plot(epochs_smoothed, curve_mean, label=curve_label, color=curve_color)
        ax.fill_between(epochs_smoothed,
                        curve_mean - curve_std,
                        curve_mean + curve_std,
                        color=curve_color, alpha=0.3)

    ax.set_xlim(-100, 2100)
    ax.set_ylim(*y_limits)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True)

# only the last panel carries the legend
axs[2].legend(frameon=True)

# minor ticks and formatting
for ax in axs:
    ax.xaxis.set_minor_locator(AutoMinorLocator(5))
    ax.yaxis.set_minor_locator(AutoMinorLocator(2))

    ax.tick_params(which='both', length=4)
    ax.tick_params(which='minor', length=2, width=1.5)
    ax.tick_params(which='major', width=2)

plt.tight_layout()
plt.savefig("plots/training.pdf", format="pdf", bbox_inches="tight")
plt.show()


In [None]:
# restore the trained AlphaNet from the alpha_net_50 checkpoint and freeze it for evaluation
alpha_net = AlphaNet(input_dim).to(device)
alpha_state = torch.load("output/alpha_net_50.pth", map_location=device)
alpha_net.load_state_dict(alpha_state)
alpha_net.eval()

# fixed seeds so the sampled test points are reproducible
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

num_trials = 100

# per-trial results of the three methods
coverages_adaptive, sizes_adaptive, alphas_adaptive = [], [], []
coverages_fixed, sizes_fixed = [], []
coverages_standard, sizes_standard = [], []

# compute calibration sum: true-label score of every calibration point, then their total
with torch.no_grad():
    cal_scores = [
        criterion(
            net(x_c.unsqueeze(0).to(device)),
            torch.tensor([y_c], dtype=torch.long).to(device),
        ).item()
        for x_c, y_c in calibration_subset
    ]
T = torch.tensor(sum(cal_scores)).to(device)

# helper: construct conformal set
def conformal_set(scores, T, alpha):
    """Return the labels kept in the prediction set at level `alpha`.

    A label y is kept when (calibration_size + 1) * s_y / (T + s_y) <= 1 / alpha,
    where s_y is its score and T the sum of calibration scores.

    Args:
        scores: indexable per-label scores (at least `num_classes` entries).
        T: sum of the calibration scores.
        alpha: level; must be non-zero.

    Returns:
        List of integer labels, in increasing order.
    """
    ratio = (calibration_size + 1) * scores / (T + scores)
    selected = []
    for y in range(num_classes):
        if ratio[y] <= 1/alpha:
            selected.append(y)
    return selected

# per-trial scores and labels, kept so the fixed-alpha baselines see the same test points
all_scores = []
all_y_test = []

# e-adaptive evaluation
for trial in range(num_trials):
    # draw one evaluation sample uniformly at random
    idx = np.random.choice(len(eval_subset))
    x_test, y_test = eval_subset[idx]
    x_test = x_test.unsqueeze(0).to(device)

    with torch.no_grad():
        logits = net(x_test).squeeze(0)

    # score of every candidate label for this sample
    batched_logits = logits.unsqueeze(0)
    scores = [
        criterion(batched_logits, torch.tensor([y], dtype=torch.long).to(device)).item()
        for y in range(num_classes)
    ]
    scores_tensor = torch.tensor(scores).to(device)

    # AlphaNet sees the per-class scores followed by the calibration sum
    input_feat = torch.cat([scores_tensor, T.view(1)])
    alpha_adapt = alpha_net(input_feat.unsqueeze(0)).item()

    # prediction set at the predicted level
    C_adapt = conformal_set(scores_tensor, T, alpha_adapt)

    coverages_adaptive.append(int(y_test in C_adapt))
    sizes_adaptive.append(len(C_adapt))
    alphas_adaptive.append(alpha_adapt)

    all_scores.append(scores_tensor)
    all_y_test.append(y_test)

# e-fixed: same construction, but one shared alpha equal to the mean adaptive alpha
fixed_alpha = np.mean(alphas_adaptive)

for scores_tensor, y_test in zip(all_scores[:num_trials], all_y_test[:num_trials]):
    C_fixed = conformal_set(scores_tensor, T, fixed_alpha)
    sizes_fixed.append(len(C_fixed))
    coverages_fixed.append(int(y_test in C_fixed))

# p-fixed: threshold the score at an empirical quantile of the calibration scores
alpha = fixed_alpha  # use same alpha
sorted_cal_scores = sorted(cal_scores)

# 1-based rank of the calibration score used as threshold
quantile_rank = int(np.ceil((1 - alpha) * (calibration_size + 1)))
quantile_index = quantile_rank - 1
if quantile_rank > len(sorted_cal_scores):
    # alpha is too small for this many calibration points: no finite score
    # qualifies, so every label is kept (indexing here would raise IndexError)
    threshold = float("inf")
elif quantile_rank < 1:
    # alpha >= 1: index -1 would silently select the largest score, so keep
    # no label instead
    threshold = float("-inf")
else:
    threshold = sorted_cal_scores[quantile_index]

for i in range(num_trials):
    scores_tensor = all_scores[i]
    y_test = all_y_test[i]

    # keep every label whose score does not exceed the threshold
    C_standard = [y for y in range(num_classes) if scores_tensor[y].item() <= threshold]
    coverages_standard.append(int(y_test in C_standard))
    sizes_standard.append(len(C_standard))

# results
def _print_summary(header, alpha_label, alpha_value, coverages, set_sizes):
    """Print alpha, empirical coverage, nominal guarantee and mean set size of one method."""
    print(header)
    print(f"{alpha_label}: {alpha_value:.4f}")
    print(f"Empirical Coverage: {np.mean(coverages):.4f}")
    print(f"Expected Guarantee: {1 - alpha_value:.4f}")
    print(f"Average Set Size: {np.mean(set_sizes):.2f}")

_print_summary("=== e-adaptive ===", "Mean Alpha", np.mean(alphas_adaptive),
               coverages_adaptive, sizes_adaptive)
_print_summary("=== e-fixed ===", "Fixed Alpha (mean adaptive)", fixed_alpha,
               coverages_fixed, sizes_fixed)
_print_summary("=== p-fixed ===", "Fixed Alpha (mean adaptive)", fixed_alpha,
               coverages_standard, sizes_standard)


In [None]:
import matplotlib.pyplot as plt
from tueplots import bundles
from matplotlib.ticker import AutoMinorLocator


# style setup: ICML 2024 bundle from tueplots, then larger fonts and thicker lines
hist_style_overrides = {
    "axes.labelsize": 18,
    "axes.titlesize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 16,
    "lines.linewidth": 2,
    "axes.linewidth": 2,
}
plt.rcParams.update(bundles.icml2024())
plt.rcParams.update(hist_style_overrides)

# histogram of the alpha values predicted in the e-adaptive evaluation
fig, ax = plt.subplots(figsize=(6, 3))
counts, bins, patches = ax.hist(
    np.array(alphas_adaptive),
    bins=15,
    color="#1f77b4",
    edgecolor='black',
    linewidth=1.5,
    alpha=0.8,
)

ax.set_xlabel(r"$\tilde\alpha$")
ax.set_ylabel("Frequency")
ax.grid(True, linestyle="--", alpha=0.5)

# minor ticks: 4 subdivisions on x, 2 on y
ax.xaxis.set_minor_locator(AutoMinorLocator(4))
ax.yaxis.set_minor_locator(AutoMinorLocator(2))
for tick_kwargs in (
    dict(which='both', length=4),
    dict(which='minor', length=2, width=1.5),
    dict(which='major', width=2),
):
    ax.tick_params(**tick_kwargs)

# fixed view window; values outside these limits are not shown
ax.set_xlim(0.03,0.082)
ax.set_ylim(0,23)

plt.tight_layout()
plt.savefig("plots/hist_alpha.pdf", format="pdf", bbox_inches="tight")
plt.show()


In [None]:
import matplotlib.pyplot as plt
from tueplots import bundles
from matplotlib.ticker import AutoMinorLocator

# style setup: ICML 2024 bundle from tueplots, then larger fonts and thicker lines
# (legend font is 14 here)
set_size_style_overrides = {
    "axes.labelsize": 18,
    "axes.titlesize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 14,
    "lines.linewidth": 2,
    "axes.linewidth": 2,
}
plt.rcParams.update(bundles.icml2024())
plt.rcParams.update(set_size_style_overrides)

# helper to count occurrences for integer sizes 1,...,10
def counts_for_sizes(sizes, min_size=1, max_size=10):
    """Count how often each integer size in [min_size, max_size] occurs.

    Values are cast to int; values outside the range (for example empty sets of
    size 0 with the default range) are ignored.

    Returns:
        Array of length max_size - min_size + 1 whose first entry is the count
        of `min_size`.
    """
    values = np.asarray(sizes).astype(int)
    in_range = np.logical_and(values >= min_size, values <= max_size)
    histogram = np.bincount(values[in_range], minlength=max_size + 1)
    return histogram[min_size:max_size + 1]

# discrete x ticks (set sizes)
x = np.arange(1, 11)        # 1,...,10
width = 0.28                # bar width; middle bar sits on the tick

# colors
c_adapt    = "#1f77b4"  # blue
c_fixed    = "#ff7f0e"  # orange
c_standard = "#2ca02c"  # green

# counts per size
counts_adaptive = counts_for_sizes(sizes_adaptive)
counts_fixed    = counts_for_sizes(sizes_fixed)
counts_standard = counts_for_sizes(sizes_standard)

# mean set size of each method
mean_adaptive = np.mean(sizes_adaptive)
mean_fixed    = np.mean(sizes_fixed)
mean_standard = np.mean(sizes_standard)

# (label, counts, mean, color, horizontal offset of the bar around the tick)
methods = [
    ("e-adaptive", counts_adaptive, mean_adaptive, c_adapt,    -width),
    ("e-fixed",    counts_fixed,    mean_fixed,    c_fixed,    0),
    ("p-fixed",    counts_standard, mean_standard, c_standard, width),
]

fig, ax = plt.subplots(figsize=(6, 3))

# 3 bars per tick: left (e-adaptive), middle (e-fixed), right (p-fixed)
for method_name, method_counts, _, method_color, offset in methods:
    ax.bar(x + offset, method_counts, width=width, color=method_color,
           edgecolor="black", alpha=0.85, label=method_name)

# vertical mean lines (matching colors)
for method_name, _, method_mean, method_color, _ in methods:
    ax.axvline(method_mean, color=method_color, linestyle="--", linewidth=2,
               label=fr"Mean {method_name} = {method_mean:.2f}")

# labels & ticks
ax.set_xlabel("Set Size")
ax.set_ylabel("Frequency")
ax.set_xticks(x)
ax.grid(True, linestyle="--", alpha=0.5)

ax.yaxis.set_minor_locator(AutoMinorLocator(2))
ax.tick_params(which='both', length=4)
ax.tick_params(which='minor', length=2, width=1.5)
ax.tick_params(which='major', width=2)

# reorder legend entries: handles 3-5 first, then 0-2
handles, labels = ax.get_legend_handles_labels()
order = [3, 4, 5, 0, 1, 2] 
ax.legend([handles[i] for i in order], [labels[i] for i in order], frameon=True)

plt.tight_layout()
plt.savefig("plots/hist_set_sizes.pdf", format="pdf", bbox_inches="tight")
plt.show()