# 🔍 Compare Trained Model vs Zero-Shot CLIP

In [1]:
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
from tqdm import tqdm


In [2]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained CLIP weights and the matching text/image preprocessor
# from the same Hugging Face checkpoint, moving the model onto `device`.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [3]:
# Text prompts handed to CLIP, one per animal class.  The order matters:
# the evaluation code compares a prompt's position in this list against the
# dataset's integer label.
class_names = [
    f"a photo of {article} {animal}"
    for article, animal in (
        ("a", "butterfly"),
        ("a", "cat"),
        ("a", "chicken"),
        ("a", "cow"),
        ("a", "dog"),
        ("an", "elephant"),
        ("a", "horse"),
        ("a", "sheep"),
        ("a", "spider"),
        ("a", "squirrel"),
    )
]


In [4]:
def clip_predict(image):
    """Classify `image` zero-shot with CLIP against the prompts in `class_names`.

    Args:
        image: The image to classify; passed to `clip_processor` as `images`.

    Returns:
        A tuple `(prompt, probs)` where `prompt` is the entry of
        `class_names` with the highest probability and `probs` is the softmax
        (over dim 1) of CLIP's image-to-text logits.
    """
    # Tokenize all prompts and preprocess the image in one call, then move
    # the resulting tensors to the same device as the model.
    batch = clip_processor(
        text=class_names,
        images=image,
        return_tensors="pt",
        padding=True,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        probs = clip_model(**batch).logits_per_image.softmax(dim=1)

    # NOTE(review): argmax runs over the whole tensor, so this presumably
    # expects a single image per call -- confirm before passing a batch.
    best = probs.argmax().item()
    return class_names[best], probs


In [6]:
# Compare the fine-tuned classifier against zero-shot CLIP on the same test
# examples and report top-1 accuracy for both.
#
# Prerequisites from earlier cells: `dataset` (with a 'test' split whose rows
# have 'image' and 'label'), the trained `model`, and its `processor`.
# Fail fast with an actionable message instead of a bare NameError halfway
# through the cell when the notebook is run on a fresh kernel.
_missing = [name for name in ("dataset", "model", "processor") if name not in globals()]
if _missing:
    raise NameError(
        f"{', '.join(_missing)} not defined - run the cells that load the "
        "dataset and the trained model/processor before this comparison."
    )

N_EVAL_SAMPLES = 100  # upper bound on the number of test examples scored

# Sample up to N_EVAL_SAMPLES test examples (the first rows of the split);
# clamping keeps `select` from failing on a split smaller than the bound.
_test_split = dataset['test']
sample_dataset = _test_split.select(range(min(N_EVAL_SAMPLES, len(_test_split))))
if len(sample_dataset) == 0:
    raise ValueError("The test split is empty; there is nothing to evaluate.")

# Put the trained model in inference mode so layers such as dropout do not
# perturb the predictions being scored.
model.eval()

correct_trained = 0
correct_clip = 0
total = 0

for example in tqdm(sample_dataset, desc="Evaluating"):
    # Convert once so both models are fed exactly the same RGB image.
    image = example['image'].convert("RGB")
    true_label = example['label']

    # Your model prediction
    model_inputs = processor(image, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**model_inputs).logits.argmax(dim=1).item()

    # CLIP prediction
    # NOTE(review): this assumes the order of `class_names` matches the
    # dataset's integer label ids -- TODO confirm against the label names.
    pred_clip_label, _ = clip_predict(image)
    clip_label_index = class_names.index(pred_clip_label)

    if output == true_label:
        correct_trained += 1
    if clip_label_index == true_label:
        correct_clip += 1
    total += 1

print(f"Trained model accuracy: {correct_trained / total:.2%}")
print(f"CLIP zero-shot accuracy: {correct_clip / total:.2%}")


NameError: name 'dataset' is not defined