In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torchvision
import torchvision.transforms as transforms

# Preprocessing for test images: tensor conversion, then per-channel
# standardisation (the same constants are inverted in unnormalize_image).
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465),
            std=(0.2023, 0.1994, 0.2010),
        ),
    ]
)

# CIFAR-10 test split, stored under ./data (download requested if needed)
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)

# Class names; looked up by integer label when titling plots
class_names = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]

def unnormalize_image(image_tensor):
    """Invert the per-channel normalisation and return an (H, W, C) NumPy image.

    The tensor is converted to NumPy, its channel axis is moved from the
    front to the back, the mean/std scaling is undone, and the result is
    clipped to the range [0, 1].
    """
    channel_mean = np.array([0.4914, 0.4822, 0.4465])
    channel_std = np.array([0.2023, 0.1994, 0.2010])
    hwc = np.moveaxis(image_tensor.numpy(), 0, -1)  # (C, H, W) -> (H, W, C)
    return np.clip(channel_std * hwc + channel_mean, 0, 1)

def plot_image(image_tensor, label):
    """Show one normalised image, titled with the class name for `label`."""
    # bring the tensor back to a displayable (H, W, C) array in [0, 1]
    picture = unnormalize_image(image_tensor)

    plt.figure(figsize=(6, 3))
    plt.imshow(picture)
    plt.title(class_names[label])
    plt.axis('off')
    plt.show()

# display the first two test images together with their labels
example_indices = range(2)
for i in example_indices:
    image_tensor, label = testset[i]
    plot_image(image_tensor, label)

In [None]:
import torch

# prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
from models import *
import torch.backends.cudnn as cudnn

# model
net = EfficientNetB0()

net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

weights_path = "weights/EfficientNetB0_0.1_100_512_SGD_1"

# map_location lets a checkpoint written on a GPU be read on a CPU-only machine
state_dict = torch.load(weights_path, map_location=device)

# DataParallel stores every parameter under a "module." prefix. The model is
# only wrapped when CUDA is available, so reconcile the checkpoint's key names
# with the current wrapping before loading to avoid a key-mismatch error.
_prefix = "module."
_net_is_wrapped = isinstance(net, torch.nn.DataParallel)
_ckpt_is_wrapped = bool(state_dict) and all(key.startswith(_prefix) for key in state_dict)
if _ckpt_is_wrapped and not _net_is_wrapped:
    state_dict = {key[len(_prefix):]: value for key, value in state_dict.items()}
elif _net_is_wrapped and not _ckpt_is_wrapped:
    state_dict = {_prefix + key: value for key, value in state_dict.items()}

# load weights into the model
net.load_state_dict(state_dict)
print("Model weights loaded successfully!")

In [None]:
# Per-sample cross-entropy is used as the score function in the cells below.
# Use the fully-qualified `torch.nn` so this cell does not depend on an `nn`
# alias being in scope (the explicit `import torch.nn as nn` only appears in
# a later cell).
criterion = torch.nn.CrossEntropyLoss()
net.eval()

In [None]:
from torch.utils.data import Subset

# sizes of the two disjoint parts of the test set
total_test_size = len(testset)  # 10000
calibration_size = 100
eval_size = total_test_size - calibration_size

# seeded permutation of all test indices so the split is reproducible
seed = 42
generator = torch.Generator().manual_seed(seed)
indices = torch.randperm(total_test_size, generator=generator)

# first `calibration_size` shuffled indices -> calibration, the rest -> evaluation
calibration_indices, eval_indices = (
    indices[:calibration_size],
    indices[calibration_size:],
)

calibration_subset = Subset(testset, calibration_indices)
eval_subset = Subset(testset, eval_indices)

print(len(calibration_subset))
print(len(eval_subset))


In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# hyper-parameters shared by the cells below
num_classes = 10    # number of candidate labels scored per input
batch_size = 64     # mini-batch size of the training DataLoader
num_epochs = 2000   # default number of training epochs
lr = 1e-3           # optimizer learning rate
k = 100             # sharpness of the sigmoid used in smooth_size

class AlphaNet(nn.Module):
    """Small MLP that maps a feature vector to one sigmoid-squashed value.

    Architecture: Linear(input_dim, 32) -> ReLU -> Linear(32, 1) -> Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_width = 32
        layers = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
            nn.Sigmoid(),
        ]
        # kept under the attribute name `model` so state_dict keys stay stable
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP and drop the trailing singleton output dimension."""
        out = self.model(x)
        return out.squeeze(-1)

def smooth_size(scores, alpha, T, n, k):
    """Differentiable surrogate for the number of labels kept in the set.

    A label with score s counts as kept when (n + 1) * s / (T + s) <= 1 / alpha.
    The hard indicator is replaced by a sigmoid with sharpness ``k`` and the
    soft indicators are summed over all entries of ``scores``.
    """
    threshold = 1.0 / alpha
    frac = (n + 1) * scores / (T + scores)
    # sigmoid(k * (threshold - frac)) tends to 1 where frac lies below the threshold
    return torch.sigmoid(k * (threshold - frac)).sum()

# feature vector = one score per class, plus the sum of calibration scores
input_dim = 1 + num_classes

In [None]:
### run this cell only if you want to rebuild a training leave-one-out dataset
### otherwise just run the next cell to load one

print("Building training dataset (leave-one-out calibration)...")

# The classifier is frozen, so each calibration image is passed through it
# exactly once. Both its true-label score and its score under every candidate
# label are cached here, instead of re-running the network on all other
# calibration points inside every leave-one-out round.
loo_true_scores = []    # loo_true_scores[j]: score of point j under its true label
loo_class_scores = []   # loo_class_scores[j][c]: score of point j under label c
with torch.no_grad():
    for j in range(len(calibration_subset)):
        x_calib, y_calib = calibration_subset[j]
        logits = net(x_calib.unsqueeze(0).to(device))  # (1, num_classes)
        y_calib = torch.tensor([y_calib], dtype=torch.long).to(device)
        loo_true_scores.append(criterion(logits, y_calib).item())
        loo_class_scores.append([
            criterion(logits, torch.tensor([cls], dtype=torch.long).to(device)).item()
            for cls in range(num_classes)
        ])

train_inputs = []
train_sizes = []

# loop over calibration subset: point i acts as the test point, all others calibrate
for i in range(len(calibration_subset)):
    # sum of calibration scores with point i left out (same summation order
    # as adding the scores one by one while skipping i)
    T = torch.tensor(
        sum(score for j, score in enumerate(loo_true_scores) if j != i),
        dtype=torch.float32,
    ).to(device)

    # build feature vector: per-class scores of the test point followed by T
    scores_tensor = torch.tensor(loo_class_scores[i], dtype=torch.float32).to(device)
    input_feat = torch.cat([scores_tensor, T.view(1)])  # (num_classes + 1,)

    train_inputs.append(input_feat)
    train_sizes.append(scores_tensor)

print("Finished dataset generation.")

# convert to TensorDataset
X_train = torch.stack(train_inputs)          # (N, num_classes + 1)
S_train = torch.stack(train_sizes)           # (N, num_classes)

train_dataset = TensorDataset(X_train, S_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# save training dataset
torch.save({
    'X_train': X_train,
    'S_train': S_train
}, 'loo_data.pt')

In [None]:
### uncomment below to load training leave-one-out dataset

# data = torch.load('loo_data.pt')

# X_train = data['X_train']
# S_train = data['S_train']

# train_dataset = TensorDataset(X_train, S_train)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
import torch
import numpy as np

# settings of the lambda search
M = 2           # target mean size
epsilon = 0.1   # bisection stops once |mean size - M| <= epsilon
max_iters = 20  # cap on bracketing / bisection steps (avoids infinite loops)

# seed both RNGs
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

# one entry is appended to each history per lambda that gets trained;
# batch_sizes_history holds, per lambda, the list of per-batch average sizes
lambda_history, mean_sizes_history, batch_sizes_history = [], [], []

def _smooth_sizes_for_batch(alpha_net, x_batch, s_batch):
    """Return ``(sizes, alpha_batch)`` for one mini-batch.

    ``x_batch`` holds the network inputs, whose last column is read as the
    calibration-score sum T; ``s_batch`` holds the per-class scores. ``sizes``
    contains one smoothed set size per sample, computed with ``smooth_size``.
    """
    x_batch = x_batch.to(device)
    s_batch = s_batch.to(device)
    T_batch = x_batch[:, -1]
    alpha_batch = alpha_net(x_batch)

    sizes = torch.stack([
        smooth_size(
            scores=s_batch[i],
            alpha=alpha_batch[i],
            T=T_batch[i],
            n=calibration_size - 1,
            k=k,
        )
        for i in range(x_batch.size(0))
    ])
    return sizes, alpha_batch


# function to train alpha_net for one lambda and return batch averages & mean size
def train_and_compute_mean_size(lambda_reg, num_epochs=num_epochs):
    """Train a fresh AlphaNet for one value of lambda and measure its mean size.

    The training loss is the batch mean of ``smoothed size + lambda_reg * alpha``.

    Args:
        lambda_reg: weight of the alpha term in the loss.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        A tuple ``(alpha_net, mean_size, avg_sizes_per_batch)``: the trained
        network, the mean smoothed size over all samples of ``train_loader``
        after training, and the average smoothed size of every training batch
        in the order the batches were processed.
    """
    alpha_net = AlphaNet(input_dim).to(device)
    optimizer = torch.optim.Adam(alpha_net.parameters(), lr=lr)

    # to store average sizes per batch during training
    avg_sizes_per_batch = []

    for epoch in range(num_epochs):
        if epoch % 200 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}")
        alpha_net.train()
        for x_batch, s_batch in train_loader:
            sizes, alpha_batch = _smooth_sizes_for_batch(alpha_net, x_batch, s_batch)
            avg_sizes_per_batch.append(sizes.mean().item())  # track per batch

            # loss and backprop
            loss = (sizes + lambda_reg * alpha_batch).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # compute mean size over calibration set
    alpha_net.eval()
    all_sizes = []
    with torch.no_grad():
        for x_batch, s_batch in train_loader:
            sizes, _ = _smooth_sizes_for_batch(alpha_net, x_batch, s_batch)
            # Keep per-sample values: averaging the per-batch means would give
            # the samples of a smaller final batch more weight than the others.
            all_sizes.extend(sizes.tolist())

    mean_size = np.mean(all_sizes)

    return alpha_net, mean_size, avg_sizes_per_batch


# step 1: Expansion phase to bracket M
# lambda is doubled while the mean size is below M and halved otherwise, until
# one lambda on each side of the target has been seen.
lambda_reg = 40  # initial guess
iter_count = 0
lambda_low, lambda_high = None, None

while iter_count < max_iters:
    iter_count += 1
    alpha_net, mean_size, avg_sizes_per_batch = train_and_compute_mean_size(lambda_reg)

    # track values
    lambda_history.append(lambda_reg)
    mean_sizes_history.append(mean_size)
    batch_sizes_history.append(avg_sizes_per_batch)

    print(f"Iteration {iter_count}: λ={lambda_reg:.5f}, mean_size={mean_size:.5f}")

    if mean_size < M:
        lambda_low = lambda_reg  # latest lambda whose mean size is below M
        if lambda_high is not None:  # if we already have an upper bound, we are done
            break
        lambda_reg *= 2
    else:
        lambda_high = lambda_reg  # latest lambda whose mean size is at or above M
        if lambda_low is not None:  # if we already have a lower bound, we are done
            break
        lambda_reg /= 2

# The bisection needs both bounds. If the loop ran out of iterations first,
# stop here with a clear message instead of failing later on `None + float`.
if lambda_low is None or lambda_high is None:
    raise RuntimeError(
        f"Could not bracket the target mean size M={M} within {max_iters} expansion "
        f"steps (λ_low={lambda_low}, λ_high={lambda_high}); "
        "change the initial λ or increase max_iters."
    )

print(f"Bracket found: λ_low={lambda_low}, λ_high={lambda_high}, mean_size={mean_size:.5f}")


# step 2: Bisection refinement
# Train at the midpoint of the bracket and shrink the bracket towards the
# target until the mean size is within epsilon of M or max_iters is reached.
iter_count = 0
for iter_count in range(1, max_iters + 1):
    lambda_reg = 0.5 * (lambda_low + lambda_high)
    alpha_net, mean_size, avg_sizes_per_batch = train_and_compute_mean_size(lambda_reg)

    # record this evaluation
    lambda_history.append(lambda_reg)
    mean_sizes_history.append(mean_size)
    batch_sizes_history.append(avg_sizes_per_batch)

    print(f"Iteration {iter_count}: λ={lambda_reg:.5f}, mean_size={mean_size:.5f}")

    # close enough to the target: keep the current network
    if abs(mean_size - M) <= epsilon:
        break

    # otherwise move the bound that lies on the same side as the midpoint
    if mean_size < M:
        lambda_low = lambda_reg
    else:
        lambda_high = lambda_reg

print(f"Selected λ: {lambda_reg:.5f}, mean_size={mean_size:.5f}")

In [None]:
import os

# persist the weights of the alpha_net left over from the lambda search
final_filename = "output/alpha_net_bracket_bisect.pth"
# torch.save does not create missing parent directories
os.makedirs(os.path.dirname(final_filename), exist_ok=True)
torch.save(alpha_net.state_dict(), final_filename)
print(f"Saved final model weights -> {final_filename}")

In [None]:
# collect everything recorded during the lambda search
history = dict(
    lambda_history=np.array(lambda_history),
    mean_sizes_history=np.array(mean_sizes_history),
    batch_sizes_history=batch_sizes_history,  # list of lists
    M=M,
    epsilon=epsilon,
)

# write all entries into a single binary .npz archive
np.savez("output/lambda_selection_history.npz", **history)
print("History saved: lambda_selection_history.npz")

In [None]:
# read the saved search history back from disk
# NOTE: allow_pickle=True can execute arbitrary code; only use it on trusted files
data = np.load("output/lambda_selection_history.npz", allow_pickle=True)

lambda_history, mean_sizes_history, batch_sizes_history = (
    data[key]
    for key in ("lambda_history", "mean_sizes_history", "batch_sizes_history")
)
# scalars come back as 0-d arrays; .item() turns them into Python values
M, epsilon = data["M"].item(), data["epsilon"].item()

print("λ history:", lambda_history)
print("Mean sizes:", mean_sizes_history)
print("Batch sizes first run:", batch_sizes_history[0])
print("Target M:", M, "Tolerance ε:", epsilon)

In [None]:
num_trials = 100

# per-trial result containers, one group per method
coverages_adaptive = []
sizes_adaptive = []
alphas_adaptive = []
coverages_fixed = []
sizes_fixed = []
coverages_standard = []
sizes_standard = []

# seed both RNGs
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

# true-label score of every calibration point, computed once and reused
cal_scores = []
with torch.no_grad():
    for x_c, y_c in calibration_subset:
        logit = net(x_c.unsqueeze(0).to(device))
        target = torch.tensor([y_c], dtype=torch.long).to(device)
        cal_scores.append(criterion(logit, target).item())

# sum of all calibration scores
T = torch.tensor(sum(cal_scores)).to(device)

# helper: construct conformal set
def conformal_set(scores, T, alpha, n=None):
    """Return the labels y with (n + 1) * scores[y] / (T + scores[y]) <= 1 / alpha.

    Args:
        scores: 1-D tensor holding one score per candidate label.
        T: sum of the calibration scores.
        alpha: miscoverage level. A value <= 0 is treated as an infinite
            threshold, so every finite ratio is accepted instead of raising
            ZeroDivisionError.
        n: number of calibration points; defaults to the global
            ``calibration_size`` when omitted.

    Returns:
        List of the integer labels that satisfy the inequality.
    """
    if n is None:
        n = calibration_size
    threshold = float("inf") if alpha <= 0 else 1.0 / alpha
    frac = (n + 1) * scores / (T + scores)
    # one candidate label per entry of the score vector
    return [y for y in range(len(frac)) if frac[y] <= threshold]

# storage for reuse
all_scores, all_y_test = [], []

# e-adaptive evaluation: one randomly drawn evaluation point per trial
for trial in range(num_trials):
    idx = np.random.choice(len(eval_subset))
    x_test, y_test = eval_subset[idx]
    x_test = x_test.unsqueeze(0).to(device)

    with torch.no_grad():
        logits = net(x_test).squeeze(0)

    # score of the test point under every candidate label
    batched_logits = logits.unsqueeze(0)
    scores = [
        criterion(batched_logits, torch.tensor([y], dtype=torch.long).to(device)).item()
        for y in range(num_classes)
    ]
    scores_tensor = torch.tensor(scores).to(device)

    # miscoverage level predicted from the scores and the calibration sum
    input_feat = torch.cat([scores_tensor, T.view(1)])
    alpha_adapt = alpha_net(input_feat.unsqueeze(0)).item()

    # conformal set at the predicted level
    C_adapt = conformal_set(scores_tensor, T, alpha_adapt)

    covered = int(y_test in C_adapt)
    coverages_adaptive.append(covered)
    sizes_adaptive.append(len(C_adapt))
    alphas_adaptive.append(alpha_adapt)

# summary over all trials
mean_alpha = np.mean(alphas_adaptive)
print(f"Mean Alpha: {mean_alpha:.4f}")
print(f"Empirical Coverage: {np.mean(coverages_adaptive):.4f}")
print(f"Expected Guarantee: {1 - mean_alpha:.4f}")
print(f"Average Set Size: {np.mean(sizes_adaptive):.2f}")


In [None]:
import matplotlib.pyplot as plt
from tueplots import bundles
import numpy as np

import os

# style setup
plt.rcParams.update(bundles.icml2024())
plt.rcParams.update({
    "axes.labelsize": 18,
    "axes.titlesize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 14,
    "lines.linewidth": 2,
    "axes.linewidth": 2,
})

# one x position per evaluated lambda, in evaluation order
iterations = np.arange(1, len(lambda_history)+1)

plt.figure(figsize=(6,4))
plt.plot(iterations, mean_sizes_history, marker="o", label="Mean Size",linewidth=4,markersize=10)

# Hand-tuned label offsets for the first points (visualization only). Points
# beyond the end of these lists get no offset, so a longer search still plots.
xshift = [0.1,0.15,0.1,0.05,-0.4]
yshift = [-0.1,-0.7,-0.6,0.3,-0.9]
for i, (it, size_at_it, lam) in enumerate(zip(iterations, mean_sizes_history, lambda_history)):
    dx = xshift[i] if i < len(xshift) else 0.0
    dy = yshift[i] if i < len(yshift) else 0.0
    # raw f-string: "\l" is not a valid escape sequence in a normal string literal
    plt.text(it + dx, size_at_it + dy, rf"$\lambda$={lam:.0f}", fontsize=16, color="black")

# target line
plt.axhline(y=M, color="red", linestyle="--", label=f"Target M={M}",linewidth=3)

# tolerance band (transparent red), drawn slightly wider than the x-range shown
plt.fill_between(
    np.array([iterations[0] - 1, iterations[-1] + 1]), # iterations,
    M - epsilon,
    M + epsilon,
    color="red",
    alpha=0.2,
    label=f"Tolerance ±{epsilon}"
)

plt.ylim(0, 10.5)
plt.xlim(0.8,iterations[-1]+0.2)
plt.xlabel("Iteration")
plt.ylabel("Mean Size")
plt.xticks(iterations)
plt.legend(frameon=True)
plt.grid(True)
plt.tight_layout()
# savefig does not create missing directories
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/algo_lambda.pdf", format="pdf", bbox_inches="tight")
plt.show()
