In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from zmq import device


torch.set_default_device("cuda")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # CLIP expects 224x224 images
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2686, 0.2613, 0.2757])  # CLIP normalization
])


In [4]:
from torch.utils.data import Subset
cifar_data_train = datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
cifar_data_test = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
calib_data = Subset(cifar_data_train, range(1000))  # for demonstration purposes
# train_data = Subset(cifar_data, range(250,1000))  # for demonstration purposes
test_data = Subset(cifar_data_test, range(2000))
calib_loader = DataLoader(calib_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [5]:
class_names = cifar_data_train.classes  # CIFAR-10 class names
text_inputs = processor(text=class_names, return_tensors="pt", padding=True)


In [6]:
class_names

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [7]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

def denormalize(img: torch.Tensor, mean: torch.Tensor, std: torch.Tensor):
    """Denormalizes the image given the mean and standard deviation."""
    return img * torch.tensor(std, device="cpu").view(3, 1, 1) + torch.tensor(mean, device="cpu").view(3, 1, 1)

# Forward pass through CLIP

all_labels = []
all_predictions = []
scores = []

# Compute nonconformity scores

for images, labels in calib_loader:

    pil_images = [transforms.ToPILImage()(denormalize(img, processor.image_processor.image_mean, processor.image_processor.image_std)) for img in images]
    
    # Process images using CLIP's processor (automatically normalizes them)
    inputs = processor(images=pil_images, return_tensors="pt").to("cuda")
    input_image_processed = inputs['pixel_values'].squeeze(0)

    outputs = model(**inputs, **text_inputs)
    logits_per_image = outputs.logits_per_image  # Image-to-text similarity scores
    probs = logits_per_image.softmax(dim=1)  # Convert to probabilities
    predictions = probs.argmax(dim=1)
    all_labels.extend(labels.tolist())
    all_predictions.extend(predictions.tolist())
    scores += logits_per_image.take_along_dim(torch.tensor(labels).unsqueeze(-1),dim=1).squeeze().tolist()
    


print(scores)



  return func(*args, **kwargs)


[25.365983963012695, 26.455509185791016, 25.24193572998047, 25.602033615112305, 25.934118270874023, 27.183717727661133, 26.49964714050293, 29.578838348388672, 28.320213317871094, 28.197988510131836, 24.66593360900879, 25.59276580810547, 27.471696853637695, 28.724632263183594, 26.64838981628418, 28.15900230407715, 25.213777542114258, 27.232376098632812, 28.481191635131836, 29.22636604309082, 29.37438201904297, 26.899517059326172, 23.374515533447266, 25.622743606567383, 24.326801300048828, 28.506595611572266, 26.390483856201172, 25.771820068359375, 29.074594497680664, 21.491209030151367, 24.411792755126953, 27.261470794677734, 28.22585105895996, 25.439626693725586, 29.10232925415039, 25.411399841308594, 27.695823669433594, 27.114492416381836, 26.93214988708496, 25.90826988220215, 27.51641273498535, 27.503686904907227, 27.579116821289062, 29.838287353515625, 27.468711853027344, 28.365766525268555, 27.009803771972656, 24.789106369018555, 26.873308181762695, 24.865337371826172, 25.732494354

In [8]:
import numpy as np
alphas = [0.02, 0.05, 0.1, 0.2]
for alpha in alphas:
    print("\n\n")
    print(f"alpha =\t\t\t {alpha}")
    # Compute the quantile for the nonconformity scores
    n = len(scores)
    threshold = np.quantile(scores, np.ceil((n+1)*(alpha))/n, method="inverted_cdf")
    prediction_sets = []
    all_labels = []
    all_predictions = []

    for images, labels in test_loader:

        pil_images = [transforms.ToPILImage()(denormalize(img, processor.image_processor.image_mean, processor.image_processor.image_std)) for img in images]
        
        # Process images using CLIP's processor (automatically normalizes them)
        inputs = processor(images=pil_images, return_tensors="pt").to("cuda")
        input_image_processed = inputs['pixel_values'].squeeze(0)

        outputs = model(**inputs, **text_inputs)
        logits_per_image = outputs.logits_per_image  # Image-to-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # Convert to probabilities
        predictions = probs.argmax(dim=1)
        all_labels.extend(labels.tolist())
        all_predictions.extend(predictions.tolist())
        indices = (logits_per_image > threshold).nonzero(as_tuple=True)
        row_indices = [indices[1][indices[0] == i] for i in range(logits_per_image.size(0))]
        prediction_sets.extend(row_indices)

    pred_sets = [x.tolist() for x in prediction_sets]
    coverage = np.mean([all_labels[i] in pred_sets[i] for i in range(len(all_labels))])
    avg_set_size = np.mean([len(s) for s in pred_sets])
    median_set_size = np.median([len(s) for s in pred_sets])
    print(f"coverage =\t\t {coverage}")
    print(f"mean set size =\t\t {avg_set_size}")
    print(f"median set size =\t {median_set_size}")




alpha =			 0.02
coverage =		 0.9795
mean set size =		 3.5855
median set size =	 3.0



alpha =			 0.05
coverage =		 0.9585
mean set size =		 2.4295
median set size =	 2.0



alpha =			 0.1
coverage =		 0.8975
mean set size =		 1.585
median set size =	 1.0



alpha =			 0.2
coverage =		 0.812
mean set size =		 1.102
median set size =	 1.0
