In [1]:
%pip install transformers datasets torchvision

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load pretrained CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

2025-05-22 19:27:36.239377: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [None]:
# Constants

DOWNLOAD_IMAGES = True
IMAGE_DIR = "smiling_images"

In [2]:
import fsspec
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

fs = fsspec.filesystem("hf")
with fs.open("datasets/OpenFace-CQUPT/FaceCaption-15M/FaceCaption-v2.parquet") as f:
    pq_file = pq.ParquetFile(f)
    table = pq_file.read_row_group(0, columns=["caption", "url"])
    df = table.to_pandas()

# Filter nulls
df = df[df["caption"].notnull() & df["url"].notnull()]

# Keep only valid caption arrays
df = df[df["caption"].apply(lambda x: isinstance(x, (list, np.ndarray)) and len(x) > 0)]

# Extract and lowercase the first caption
df["caption_text"] = df["caption"].apply(lambda x: str(x[0]).strip().lower())

# Keep only the first URL
df["url"] = df["url"].apply(lambda x: x.split(",")[0].strip())

# Download and and filter out the data

In [None]:
from PIL import Image
from io import BytesIO
import requests
from IPython.display import display

def load_image_from_url(url):
    response = requests.get(url)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")

# Random sample
sample_row = df.sample(1).iloc[0]
img = load_image_from_url(sample_row["url"])
caption = sample_row["caption_text"]

# Display
display(img)
print("Caption:", caption)

KeyboardInterrupt: 

# Run CLIP zero-shot classification

In [None]:
prompts = ["man", "woman"]

inputs = processor(
    text=prompts,
    images=img,
    return_tensors="pt",
    padding=True
).to(device)

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits_per_image
    probs = logits.softmax(dim=1)

# Show prediction
pred = prompts[probs.argmax()]
confidence = probs.max().item()

print(f"Prediction: {pred}")
print(f"Confidence: {confidence:.2f}")

Prediction: woman
Confidence: 0.66
