In [1]:
from pathlib import Path

import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the CLIP model and its matching processor.
# Both are loaded from the same Hugging Face checkpoint, so the identifier is
# named once and reused.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Load your image (can be any format).
# The path is assembled from its components so the notebook runs on any OS;
# a raw string with "\" separators only resolves on Windows.
image_path = Path("..") / "data" / "Chest X-Rays" / "val" / "COVID19" / "COVID19(568).jpg"

# Open inside a context manager so the file handle is released as soon as the
# pixels are read; convert("RGB") returns a separate, fully loaded image that
# remains usable after the block exits.
with Image.open(image_path) as raw_image:
    image = raw_image.convert("RGB")

# Define candidate labels (text descriptions) that the image is scored against.
text_descriptions = [
    "a chest X-ray",
    "a CT scan",
    "a natural photograph",
    "a painting",
    "a selfie",
    "a dog",
    "a screenshot",
]

# Prepare inputs: padding=True pads the tokenized labels to a common length so
# they can be returned together as PyTorch tensors.
inputs = processor(text=text_descriptions, images=image, return_tensors="pt", padding=True)

In [10]:
# Run model.
# This is inference only, so autograd is disabled: no computation graph is
# built and no activations are kept for a backward pass.
with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image  # shape: [1, N_texts]
probs = logits_per_image.softmax(dim=1)      # softmax over text options

# Get top match for the first (and here only) image.
# Indexing row 0 before argmax yields a label index rather than an index into
# the flattened [images x texts] tensor; int() gives a plain Python index.
top_index = int(probs[0].argmax())
top_label = text_descriptions[top_index]

print(f"Top label: {top_label}")
print(f"Confidence: {probs[0, top_index].item():.4f}")

Top label: a chest X-ray
Confidence: 0.9951
