In [4]:
import torch
from torchvision import datasets, transforms
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os, json
from tqdm import tqdm  # Optional progress bar

# --- Configuration -----------------------------------------------------------
# How many CIFAR-10 training images to caption. Kept small for a first trial
# run; raise it once the pipeline is confirmed to work end to end.
NUM_SAMPLES = 100
# Hugging Face checkpoint used for both the processor and the model.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
# Where the {file name -> label/caption} index is written.
CAPTIONS_PATH = "cifar10_captions.json"

# Device: use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Load the CIFAR-10 training split. No transform is attached, so
# indexing the dataset yields (PIL image, integer class index) pairs.
cifar10 = datasets.CIFAR10(root="data/", train=True, download=True)

# Step 2: Load the BLIP captioning processor and model, and move the model's
# weights to the selected device.
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT, use_fast=False)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(device)

# Step 3: Generate one caption per image.
# `output` maps a synthetic file name to {"label": ..., "caption": ...}.
# Note that no PNG files are written by this cell; the key is only an id.
output = {}

# Never index past the end of the dataset if NUM_SAMPLES is set too high.
num_samples = min(NUM_SAMPLES, len(cifar10))

for idx in tqdm(range(num_samples)):
    image, label = cifar10[idx]

    # Upsample the 32x32 CIFAR image and make sure it is 3-channel RGB.
    # NOTE(review): the BLIP processor applies its own resize/normalisation,
    # so this step may be redundant -- it is kept so that captions stay
    # comparable with earlier runs. Confirm before removing.
    image = image.resize((224, 224)).convert("RGB")

    # The processor returns a BatchFeature (not a plain dict); its .to()
    # moves every contained tensor to the target device in one call.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Generation uses the checkpoint's default decoding settings.
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    output[f"cifar10_{idx}.png"] = {
        "label": cifar10.classes[label],  # human-readable class name
        "caption": caption,
    }

# Step 4: Save captions as pretty-printed JSON. The encoding is stated
# explicitly so the file does not depend on the platform default.
with open(CAPTIONS_PATH, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print(f"✅ Done! Captions saved to {CAPTIONS_PATH}")


100%|██████████| 170M/170M [00:05<00:00, 33.8MB/s]
100%|██████████| 100/100 [00:38<00:00,  2.59it/s]

✅ Done! Captions saved to cifar10_captions.json





In [5]:
# Reload the caption index from disk and preview its first five entries,
# showing each image id next to its CIFAR-10 label and generated caption.
with open("cifar10_captions.json") as caption_file:
    captions = json.load(caption_file)

preview_entries = list(captions.items())[:5]

for image_name, record in preview_entries:
    label_text = record["label"]
    caption_text = record["caption"]
    print(f"{image_name} → label: {label_text} | caption: {caption_text}")


cifar10_0.png → label: frog | caption: a plate of food with a lot of food on it
cifar10_1.png → label: truck | caption: a large truck driving down a road
cifar10_2.png → label: truck | caption: a tractor is parked on the road
cifar10_3.png → label: deer | caption: a man in a suit and tie standing in the dark
cifar10_4.png → label: automobile | caption: a car is parked on the side of the road
