In [None]:
# Imports
# === Gemini-only Flickr8k evaluation pipeline (NUM_IMAGES sampled images, CSV export)
# Make sure to set your Gemini API key before running.

import os
import csv
import random
import time
import torch
from typing import List, Dict

# Install required packages
!pip install datasets google-generativeai rouge-score bert-score nltk tqdm

import google.generativeai as genai
from datasets import load_dataset
from PIL import Image
from tqdm.auto import tqdm

# Metrics
import nltk
from rouge_score import rouge_scorer
from bert_score import score as bertscore_score

# Ensure required NLTK data (for METEOR)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

from nltk.translate.meteor_score import meteor_score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=34821bd51ac70a989093f18b0532684dd00798e66bf37df339743c5b68171e84
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [None]:
# ==============================================================
# Gemini pipeline Configuration
# ==============================================================
# -------------------------
# Configuration
# -------------------------
# Gemini model used for captioning.
GEMINI_MODEL_NAME = "gemini-2.5-flash-lite"
# Number of images sampled from the selected dataset split.
NUM_IMAGES = 20
# Seed for the image sampling, so repeated runs pick the same images.
RANDOM_SEED = 42
# Path of the CSV file the results are written to.
OUTPUT_CSV = "gemini_flickr8k_results.csv"
# Pause, in seconds, between consecutive Gemini requests.
API_DELAY = 90

# SECURITY: never store an API key in the notebook. The key is read from the
# environment instead; any key that was previously hard-coded in this cell
# must be treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Gemini API key not found: set the GEMINI_API_KEY environment variable "
        "before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

gemini_model = genai.GenerativeModel(GEMINI_MODEL_NAME)

# # Simple test run (working)
# response = gemini_model.generate_content("Describe the purpose of image captioning in one sentence.")
# print(" Gemini 2.5 test response:\n", response.text)

def caption_with_gemini(image_pil, max_retries=2, retry_delay=None):
    """
    Ask Gemini for a one-sentence description of an image.

    The call is best-effort: it never raises. A request that fails with an
    exception or comes back with empty text is retried, so a single transient
    error does not immediately produce an empty caption.

    Args:
        image_pil: Image to caption; passed to Gemini unchanged, next to the prompt.
        max_retries: Number of additional attempts after the first one fails.
            Values below 0 are treated as 0 (a single attempt, no retry).
        retry_delay: Seconds to wait before each retry. Defaults to API_DELAY.

    Returns:
        The stripped caption text, or "" when every attempt failed.
    """
    if retry_delay is None:
        retry_delay = API_DELAY

    attempts = max(0, max_retries) + 1
    for attempt in range(1, attempts + 1):
        try:
            response = gemini_model.generate_content(
                ["Describe this image in one sentence:", image_pil]
            )
            caption = (response.text or "").strip()
            if caption:
                return caption
            print(f"Gemini returned an empty caption (attempt {attempt}/{attempts})")
        except Exception as e:
            # Deliberately broad: any API failure is reported and retried
            # instead of aborting the whole evaluation run.
            print(f"Gemini error: {e}")

        if attempt < attempts:
            print("Retrying...")
            time.sleep(retry_delay)

    return ""

In [None]:
# ============================================================
#  METRICS | COMPUTATION FUNCTIONS
# ============================================================

def compute_meteor(candidate, references):
    """
    METEOR score of one candidate caption against its reference captions.

    Recent NLTK releases require `meteor_score` to receive pre-tokenized input
    (lists of tokens) and raise a TypeError for raw strings. Both the candidate
    and the references are therefore whitespace-tokenized before scoring;
    passing the raw strings made every call fail and silently score 0.0.

    Args:
        candidate: Generated caption (string).
        references: Iterable of reference caption strings.

    Returns:
        The METEOR score as returned by `meteor_score`, or 0.0 when the
        candidate or all references are empty, or when scoring fails.
    """
    hypothesis_tokens = candidate.split()
    reference_tokens = [tokens for tokens in (ref.split() for ref in references) if tokens]

    # Nothing to compare: report 0.0 explicitly instead of relying on an exception.
    if not hypothesis_tokens or not reference_tokens:
        return 0.0

    try:
        return meteor_score(reference_tokens, hypothesis_tokens)
    except Exception as e:
        # Best-effort metric: report the problem instead of hiding it, and
        # let KeyboardInterrupt/SystemExit propagate.
        print(f"METEOR error: {e}")
        return 0.0

def compute_rouge_l(candidate, references):
    """
    Best ROUGE-L F-measure of a candidate caption over its reference captions.

    Args:
        candidate: Generated caption (string).
        references: Iterable of reference caption strings.

    Returns:
        The highest ROUGE-L F-measure among the references, or 0.0 when the
        candidate is empty, there are no references, or scoring fails.
    """
    # Explicit empty-input handling instead of relying on max() raising.
    if not candidate or not references:
        return 0.0

    try:
        scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
        return max(
            (scorer.score(ref, candidate)["rougeL"].fmeasure for ref in references),
            default=0.0,
        )
    except Exception as e:
        # Best-effort metric: report the failure, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
        print(f"ROUGE-L error: {e}")
        return 0.0

def compute_bertscore_batch(cands, refs_list):
    """
    Compute BERTScore for all candidates in a single batched call.

    Every candidate is paired with each of its references; for each candidate
    the pair with the highest F1 is kept, together with that pair's precision
    and recall.

    Because scores are rescaled with the baseline they can be negative, so the
    first score seen for a candidate is always accepted rather than compared
    against an initial 0 (which would wrongly report 0/0/0 for candidates whose
    scores are all negative).

    Args:
        cands: List of candidate captions.
        refs_list: List, parallel to `cands`, of lists of reference captions.

    Returns:
        Tuple `(best_f, best_p, best_r)` of lists with one entry per candidate.
        Candidates without any reference keep the fallback value 0.0.
    """
    pair_cands = []
    pair_refs = []
    pair_owner = []  # index of the candidate each flattened pair belongs to

    for i, (cand, refs) in enumerate(zip(cands, refs_list)):
        for ref in refs:
            pair_cands.append(cand)
            pair_refs.append(ref)
            pair_owner.append(i)

    best_p = [0.0] * len(cands)
    best_r = [0.0] * len(cands)
    best_f = [0.0] * len(cands)

    # No (candidate, reference) pairs: nothing to score, skip the model call.
    if not pair_cands:
        return best_f, best_p, best_r

    P, R, F = bertscore_score(pair_cands, pair_refs, lang="en", rescale_with_baseline=True)

    scored = [False] * len(cands)
    for p, r, f, i in zip(P.tolist(), R.tolist(), F.tolist(), pair_owner):
        if not scored[i] or f > best_f[i]:
            scored[i] = True
            best_f[i] = f
            best_p[i] = p
            best_r[i] = r

    return best_f, best_p, best_r

In [None]:
# -------------------------
# Load Flickr8k dataset (From Hugging Face)
# -------------------------

print("Loading Flickr8k dataset from Hugging Face (may take a moment)...")
dataset = load_dataset("clip-benchmark/wds_flickr8k")

# Report which splits the download provides.
available_splits = list(dataset.keys())
print("Available splits:", available_splits)

# Split preference: 'test' first, then 'validation', otherwise 'train'.
split = next(
    (name for name in ("test", "validation") if name in dataset),
    "train",
)

# Sanity check: show the chosen split, the dataset layout and one sample item.
print(f"Using split: {split}")
print(dataset)
print(dataset[split][0])
print("Sample item keys:", dataset[split][0].keys())

# From here on `dataset` refers to the selected split only.
dataset = dataset[split]


Loading Flickr8k dataset from Hugging Face (may take a moment)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train/0.tar:   0%|          | 0.00/212M [00:00<?, ?B/s]

train/1.tar:   0%|          | 0.00/213M [00:00<?, ?B/s]

train/2.tar:   0%|          | 0.00/213M [00:00<?, ?B/s]

train/3.tar:   0%|          | 0.00/213M [00:00<?, ?B/s]

test/0.tar:   0%|          | 0.00/27.5M [00:00<?, ?B/s]

test/1.tar:   0%|          | 0.00/28.6M [00:00<?, ?B/s]

test/2.tar:   0%|          | 0.00/28.6M [00:00<?, ?B/s]

test/3.tar:   0%|          | 0.00/27.8M [00:00<?, ?B/s]

test/4.tar:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Available splits: ['train', 'test']
Using split: test
DatasetDict({
    train: Dataset({
        features: ['__key__', '__url__', 'jpg', 'txt'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['__key__', '__url__', 'jpg', 'txt'],
        num_rows: 1000
    })
})
{'__key__': 's0000000', '__url__': '/root/.cache/huggingface/hub/datasets--clip-benchmark--wds_flickr8k/snapshots/652f8f3b6030420c97cb1a0a0a11da3191f6ced2/test/0.tar', 'jpg': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x335 at 0x79E00B317EC0>, 'txt': 'The dogs are in the snow in front of a fence .\nThe dogs play on the snow .\nTwo brown dogs playfully fight in the snow .\nTwo brown dogs wrestle in the snow .\nTwo dogs playing in the snow .'}
Sample item keys: dict_keys(['__key__', '__url__', 'jpg', 'txt'])


In [None]:
# ============================================================
#  EXTRACTION FUNCTIONS (for this dataset format)
# ============================================================

def extract_image(example):
    """Return the image stored under the example's 'jpg' key."""
    image = example["jpg"]
    return image

def extract_captions(example):
    """
    Return the ground-truth captions stored in the example's 'txt' field.

    The field holds several captions separated by newline characters; each
    line is stripped of surrounding whitespace and blank lines are dropped.
    """
    stripped_lines = (line.strip() for line in example["txt"].split("\n"))
    return [caption for caption in stripped_lines if caption]

def extract_image_id(example, idx):
    """
    Return an identifier for the example.

    The value under '__key__' is used when that key is present; otherwise the
    identifier is built from the numeric index as 'image_<idx>'.
    """
    if "__key__" not in example:
        return f"image_{idx}"
    return example["__key__"]

In [None]:
# ============================================================
#  SAMPLE IMAGES
# ============================================================

# Seed the global RNG so the same images are drawn on every run.
random.seed(RANDOM_SEED)

# Cap the sample size at the split size: random.sample raises ValueError when
# asked for more items than the population holds.
indices = random.sample(range(len(dataset)), min(NUM_IMAGES, len(dataset)))

# Accumulators filled by the main loop: CSV rows, generated captions and the
# reference captions of each processed image.
rows = []
candidates = []
refs_collection = []


In [None]:
# ============================================================
#  MAIN LOOP — GENERATE CAPTIONS AND METRICS
# ============================================================

# Start from empty accumulators so that re-running this cell never appends
# duplicate rows to the results of a previous run.
rows = []
candidates = []
refs_collection = []

for idx in tqdm(indices, desc="Processing images"):
    example = dataset[idx]

    img_id = extract_image_id(example, idx)
    pil_img = extract_image(example)
    refs = extract_captions(example)

    # Exactly 5 reference columns in the CSV: pad with "" or truncate.
    refs_5 = (refs + [""] * 5)[:5]

    # Wait API_DELAY seconds before each request to pace the API calls.
    time.sleep(API_DELAY)
    caption = caption_with_gemini(pil_img)
    print("Caption:", caption)

    # Per-image metrics (METEOR and ROUGE-L) against all available references.
    meteor_val = compute_meteor(caption, refs)
    rouge_val = compute_rouge_l(caption, refs)

    # Keep caption and references for the batched BERTScore computation.
    candidates.append(caption)
    refs_collection.append(refs)

    rows.append({
        "image_id": img_id,
        "gemini_caption": caption,
        "gt_1": refs_5[0],
        "gt_2": refs_5[1],
        "gt_3": refs_5[2],
        "gt_4": refs_5[3],
        "gt_5": refs_5[4],
        "meteor": meteor_val,
        "rouge_l": rouge_val,
        # Filled in after the loop, once BERTScore has been computed.
        "bert_f1": None,
        "bert_precision": None,
        "bert_recall": None,
    })


Processing images:   0%|          | 0/20 [00:00<?, ?it/s]

Caption: A person wearing a beanie and loose clothing kicks up a cloud of dust while running on a dirt path in front of a tree.
Caption: A young girl laughs as she plays in the spray of a water park, surrounded by colorful hoops.
Caption: A pedestrian street is lined with shops adorned with red lanterns and signs, with people walking along the paved walkway.
Caption: A toddler in a red plaid shirt joyfully holds a slinky while another child stands in pajamas behind him.
Caption: A young boy with blond hair slides down a sand dune on a board, kicking up a large cloud of sand, while another boy watches from further up the dune under a clear blue sky.
Caption: A fluffy, dark-colored dog is captured mid-air as it joyfully leaps along a dusty path.
Caption: A Basset Hound lies on the ground next to a yellow car, with its leash attached to the car's side mirror.
Caption: A child in a helmet rides a purple bicycle on a sunny day, casting a long shadow on the bright pavement.
Caption: A motocr

ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash-lite:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1399.08ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash-lite:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1501.86ms


Caption: Several football players in red jerseys celebrate with officials on a sunny day in front of a crowd of spectators in red.




Gemini error: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash-lite
Please retry in 5.890014564s.
Caption: 


In [None]:
# ============================================================
#  BERTScore appending
# ============================================================

print("Computing BERTScore for all captions...")
bert_f, bert_p, bert_r = compute_bertscore_batch(candidates, refs_collection)

# Copy each image's best-match scores into its result row.
for i, row in enumerate(rows):
    row.update(
        bert_f1=bert_f[i],
        bert_precision=bert_p[i],
        bert_recall=bert_r[i],
    )

Computing BERTScore for all captions...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print("Saving CSV:", OUTPUT_CSV)

# Column order of the exported file.
fieldnames = [
    "image_id",
    "gemini_caption",
    "gt_1",
    "gt_2",
    "gt_3",
    "gt_4",
    "gt_5",
    "meteor",
    "rouge_l",
    "bert_f1",
    "bert_precision",
    "bert_recall",
]

# Write the header followed by one line per processed image.
with open(OUTPUT_CSV, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)


print("Done! Results saved to:", OUTPUT_CSV)

Saving CSV: gemini_flickr8k_results.csv
Done! Results saved to: gemini_flickr8k_results.csv
