In [1]:
# Colab cell 1: install the Python packages this notebook relies on.
# The first line installs the Hugging Face stack plus Pillow in one quiet (-q) call.
!pip install -q transformers datasets accelerate ftfy sentencepiece huggingface_hub pillow
# faiss-cpu is optional: `|| true` forces a successful exit status even when the
# install fails. NOTE(review): no cell visible here imports faiss (the search
# cells use plain NumPy dot products) -- confirm it is still wanted.
!pip install -q faiss-cpu || true
# NOTE(review): kaggle is not used by any cell visible here -- confirm it is needed.
!pip install -q kaggle


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Mount Google Drive at /content/drive so the dataset paths below resolve.
from google.colab import drive
drive.mount('/content/drive')
# Folder holding the Flickr8k image files; later cells list and open files from it.
image_folder = "/content/drive/MyDrive/Datasets/Flickr8k/Flickr8k_images"
# NOTE(review): captions_file is not read by any cell visible in this notebook.
captions_file = "/content/drive/MyDrive/Datasets/Flickr8k/captions.txt"


Mounted at /content/drive


In [3]:
# Colab cell 3
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch, os
import pandas as pd

# Run on the GPU when the runtime exposes one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# The BLIP processor and captioning model come from the same Hugging Face checkpoint.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
model = model.to(device)

# helper to generate one caption
def generate_caption(img_path, max_length=30, num_beams=3):
    """Generate one caption for an image file with the module-level BLIP model.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        img_path: Path of the image file to caption.
        max_length: Passed to ``model.generate`` as ``max_length``.
        num_beams: Passed to ``model.generate`` as ``num_beams``. The default
            of 3 is the value that used to be hard-coded.

    Returns:
        The decoded caption string, with special tokens skipped.
    """
    # Close the file handle deterministically, even if decoding the image
    # raises; convert() hands back a separate converted copy to work with.
    with Image.open(img_path) as img:
        rgb = img.convert("RGB")
    inputs = processor(images=rgb, return_tensors="pt").to(device)
    out = model.generate(**inputs, max_length=max_length, num_beams=num_beams)
    return processor.decode(out[0], skip_special_tokens=True)

# Smoke-test the captioner on a single image; change the filename to one that
# exists in image_folder. sample_img is also the query passed to image_search
# in the image-to-image search cell.
sample_img = os.path.join(image_folder, "667626_18933d713e.jpg")  # change if needed
print("Caption:", generate_caption(sample_img))

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Caption: a woman laying on a surf board in the ocean


In [4]:
# Colab cell - Generate BLIP captions for the first MAX_IMAGES Flickr8k images

import os, pandas as pd

# Path to your Google Drive dataset
image_folder = "/content/drive/MyDrive/Datasets/Flickr8k/Flickr8k_images"

# Number of images (in sorted filename order) to caption.
# Set to None to caption every file in the folder (the full set is ~8000 JPGs).
MAX_IMAGES = 1000

# Collect image filenames; slicing with None keeps the whole list
filenames = sorted(os.listdir(image_folder))[:MAX_IMAGES]
total = len(filenames)

results = []
for done, fname in enumerate(filenames, start=1):
    path = os.path.join(image_folder, fname)
    try:
        cap = generate_caption(path)  # BLIP captioning function from earlier
    except Exception as e:
        # Best effort: keep going and record the failure in the caption column
        cap = f"<error: {e}>"
    results.append({"image": fname, "caption": cap})

    # progress update after every 100 completed images, plus one for the last image
    if done % 100 == 0 or done == total:
        print(f"Processed {done}/{total} images...")

# Save all captions to CSV
df = pd.DataFrame(results)
df.to_csv("/content/drive/MyDrive/Datasets/Flickr8k/generated_captions.csv", index=False)

print("✅ Saved captions to Google Drive: generated_captions.csv")
df.head(6)


Processed 0/1000 images...
Processed 100/1000 images...
Processed 200/1000 images...
Processed 300/1000 images...
Processed 400/1000 images...
Processed 500/1000 images...
Processed 600/1000 images...
Processed 700/1000 images...
Processed 800/1000 images...
Processed 900/1000 images...
✅ Saved captions to Google Drive: generated_captions.csv


Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a little girl in a pink dress
1,1001773457_577c3a7d70.jpg,two dogs playing on the side of the road
2,1002674143_1b742ab4b8.jpg,a child sitting in the grass
3,1003163366_44323f5815.jpg,a woman laying on a bench next to a river
4,1007129816_e794419615.jpg,a man wearing a hat
5,1007320043_627395c3d8.jpg,a little girl climbing on a red rope


In [5]:
# Colab cell 5 - make CLIP embeddings (images)
from transformers import CLIPProcessor, CLIPModel
import numpy as np

# Load the CLIP model (moved to `device`) and then its processor from one checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)

def get_image_emb(img_path):
    """Embed one image file with CLIP and L2-normalise the result.

    Uses the module-level ``clip_processor``, ``clip_model`` and ``device``.

    Args:
        img_path: Path of the image file to embed.

    Returns:
        NumPy array of shape (1, d) holding the unit-length image features.
    """
    # Close the file handle deterministically, even if decoding the image
    # raises; convert() hands back a separate converted copy to work with.
    with Image.open(img_path) as img:
        rgb = img.convert("RGB")
    inputs = clip_processor(images=rgb, return_tensors="pt").to(device)
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs)    # (1, d)
    # Divide by the L2 norm so a dot product between embeddings is a cosine similarity.
    emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.cpu().numpy()

# Embed the first 1000 files (sorted by name) and write the matrix to a .npy file.
# Row i of img_embs belongs to files_small[i]; the search cells index both by position.
files_small = sorted(os.listdir(image_folder))[:1000]

# get_image_emb returns a (1, d) array, so keep row 0 of each result before stacking.
embedding_rows = [
    get_image_emb(os.path.join(image_folder, name))[0]
    for name in files_small
]
img_embs = np.vstack(embedding_rows)   # shape (N, d)
np.save("img_embs.npy", img_embs)
print("Saved img_embs.npy (shape {})".format(img_embs.shape))


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Saved img_embs.npy (shape (1000, 512))


In [6]:
# Colab cell 6 - search by text
def get_text_emb(text):
    """Embed one text query with CLIP and L2-normalise the result.

    Uses the module-level ``clip_processor``, ``clip_model`` and ``device``.

    Args:
        text: Query string.

    Returns:
        NumPy array of shape (1, d) holding the unit-length text features.
    """
    # truncation=True clips an over-long query to the tokenizer's maximum
    # length instead of passing an oversized sequence to the text encoder.
    inputs = clip_processor(
        text=[text], return_tensors="pt", padding=True, truncation=True
    ).to(device)
    with torch.no_grad():
        emb = clip_model.get_text_features(**inputs)
    # Divide by the L2 norm so a dot product between embeddings is a cosine similarity.
    emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.cpu().numpy()

# NumPy has to be in scope before np.load runs.
import numpy as np

# Reload the embedding matrix written by the embedding cell. Only the matrix is
# stored on disk: the search functions below pair it with the in-memory
# files_small list, which must come from the same run so rows and names line up.
img_embs = np.load("img_embs.npy")

def text_search(query, k=5):
    """Rank the indexed images against a text query.

    Reads the module-level ``img_embs`` matrix (one row per image) and the
    ``files_small`` filename list, which is indexed by the same row position.

    Args:
        query: Text to search for; embedded with ``get_text_emb``.
        k: Maximum number of hits to return.

    Returns:
        List of ``(filename, score)`` tuples ordered from highest to lowest
        score, where score is the dot product of the image and query embeddings.
    """
    qemb = get_text_emb(query)   # (1, d)
    # Select column 0 instead of squeeze(): with a single indexed image,
    # squeeze() yields a 0-d array that cannot be indexed by position below.
    scores = (img_embs @ qemb.T)[:, 0]   # (N,)
    topk = scores.argsort()[::-1][:k]
    return [(files_small[i], float(scores[i])) for i in topk]

# Sanity check: the query is the caption BLIP produced for
# 1001773457_577c3a7d70.jpg in the captioning cell's recorded output.
# NOTE(review): the recorded top hit below is a different file -- worth confirming.
print(text_search("two dogs playing on the side of the road", k=1))


[('1405221276_21634dcd58.jpg', 0.3139735460281372)]


In [7]:
# Colab cell 7 - image-to-image search
def image_search(query_image_path, k=5):
    """Rank the indexed images against a query image.

    Reads the module-level ``img_embs`` matrix (one row per image) and the
    ``files_small`` filename list, which is indexed by the same row position.

    Args:
        query_image_path: Path of the query image; embedded with ``get_image_emb``.
        k: Maximum number of hits to return.

    Returns:
        List of ``(filename, score)`` tuples ordered from highest to lowest
        score, where score is the dot product of the indexed and query embeddings.
    """
    qemb = get_image_emb(query_image_path)  # (1,d)
    # Select column 0 instead of squeeze(): with a single indexed image,
    # squeeze() yields a 0-d array that cannot be indexed by position below.
    scores = (img_embs @ qemb.T)[:, 0]   # (N,)
    topk = scores.argsort()[::-1][:k]
    return [(files_small[i], float(scores[i])) for i in topk]

# Query the index with sample_img, the image captioned in the BLIP cell.
# NOTE(review): the recorded top score is ~0.79 rather than ~1.0, so sample_img
# is presumably not among the indexed files_small -- confirm if that is intended.
print(image_search(sample_img, k=5))

[('1118557877_736f339752.jpg', 0.7934293746948242), ('1287475186_2dee85f1a5.jpg', 0.7770792245864868), ('1299459550_1fd5594fa2.jpg', 0.7751289010047913), ('1095980313_3c94799968.jpg', 0.7690777778625488), ('1096165011_cc5eb16aa6.jpg', 0.7551733255386353)]
