In [None]:
# Qwen2-VL 2B Abliterated captioning pipeline
# - Model: prithivMLmods/Qwen2-VL-2B-Abliterated-Caption-it (the checkpoint the load cell passes to from_pretrained)
# - Dataset: Flickr8k (clip-benchmark/wds_flickr8k variant with fields: jpg, txt)
# - Metrics: METEOR, ROUGE-L, BERTScore
# - Output CSV schema (Option A): image_id, abliterated_caption, gt_1..gt_5, meteor, rouge_l, bert_f1, bert_precision, bert_recall

# Install dependencies (uncomment in Colab)
# !pip install git+https://github.com/QwenLM/qwen-vl.git
!pip install transformers datasets accelerate torchvision sentencepiece rouge-score bert-score nltk tqdm pillow

import csv
import random
import time

from datasets import load_dataset
from PIL import Image
from tqdm.auto import tqdm

import torch
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score as bertscore_score

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from qwen_vl_utils import process_vision_info # --- having trouble with the git

import nltk
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

# imported code directly for qwen_vl_utils import process_vision_info
def process_vision_info(messages):
    """
    Collect the image payloads from a list of Qwen-style chat messages.

    Local stand-in for ``qwen_vl_utils.process_vision_info``. Expected layout:
    [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": PIL.Image or np.array},
                {"type": "text", "text": "..."},
            ]
        }
    ]

    Messages without a "content" key, messages whose content is not a
    list/tuple (e.g. a plain-string system prompt), and content items that
    are not dicts are skipped instead of raising.

    Args:
        messages: iterable of message mappings.

    Returns:
        (image_list, video_list): ``image_list`` holds the value of the
        "image" key of every ``{"type": "image", ...}`` item, in encounter
        order. ``video_list`` is always empty; videos are ignored here.

    Raises:
        KeyError: if an item is tagged ``"type": "image"`` but carries no
        "image" payload (a malformed message should not pass silently).
    """
    image_list = []
    video_list = []  # never populated by this pipeline

    for msg in messages:
        if "content" not in msg:
            continue
        content = msg["content"]
        # Text-only messages may store a bare string; iterating over it would
        # yield characters and break the dict lookups below.
        if not isinstance(content, (list, tuple)):
            continue
        for item in content:
            if isinstance(item, dict) and item.get("type") == "image":
                image_list.append(item["image"])

    return image_list, video_list

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=0113c9bcb28a57e231802c9ac0fb7bd3a481d70bd7b79e5128e8ee8f93f12b3b
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2




In [None]:

# ================================================================
# QWEN MODEL CONFIG
# ================================================================
# Single source of truth for the checkpoint. Previously this constant named a
# Qwen3-VL-8B repo that was only printed, while the loader below hard-coded
# this Qwen2-VL-2B repo id twice. The id now matches what is loaded with
# Qwen2VLForConditionalGeneration.
MODEL_NAME = "prithivMLmods/Qwen2-VL-2B-Abliterated-Caption-it"
NUM_IMAGES = 20         # number of images to sample; started at 20 for speed/safety
RANDOM_SEED = 42        # seed for the image sampling
OUTPUT_CSV = "qwen_flickr8k.csv"
API_DELAY = 3.0         # seconds between model calls (not referenced by the cells shown here)
MAX_NEW_TOKENS = 128    # generation length budget for captions
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# ================================================================
# QWEN MODEL LOAD
# ================================================================
print(f"Loading model: {MODEL_NAME}")

# NOTE: newer transformers releases warn that `torch_dtype` is deprecated in
# favour of `dtype`; it is kept here so older releases keep working.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME, torch_dtype="auto", device_map="auto"
)

# Model and processor must come from the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading Qwen3-VL model…
Loading model: prithivMLmods/Qwen3-VL-8B-Abliterated-Caption-it
Loading model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

In [None]:

# ================================================================
# LOAD DATASET
# ================================================================

print("Loading Flickr8k dataset from Hugging Face (may take a moment)...")
dataset_dict = load_dataset("clip-benchmark/wds_flickr8k")

# Report which splits the hub dataset exposes
available_splits = list(dataset_dict.keys())
print("Available splits:", available_splits)

# Preference order: 'test', then 'validation'; fall back to 'train'
split = next(
    (name for name in ("test", "validation") if name in dataset_dict),
    "train",
)

# Show the chosen split, the overall structure and one example record
print(f"Using split: {split}")
print(dataset_dict)
print(dataset_dict[split][0])
print("Sample item keys:", dataset_dict[split][0].keys())

# From here on `dataset` refers to the selected split only
dataset = dataset_dict[split]

# ================================================================
# SAMPLE IMAGES
# ================================================================

# Seed the global RNG so the same images are drawn on every run
random.seed(RANDOM_SEED)

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the size of the selected split.
num_to_sample = min(NUM_IMAGES, len(dataset))
if num_to_sample < NUM_IMAGES:
    print(f"Requested {NUM_IMAGES} images but the split has only {len(dataset)}; using {num_to_sample}.")
indices = random.sample(range(len(dataset)), num_to_sample)

# Accumulators filled by the caption loop:
#   rows           - one dict per image, later written to the CSV
#   candidate_list - generated captions, in row order
#   refs_list      - reference-caption lists, in row order
rows = []
candidate_list = []
refs_list = []


Loading Flickr8k dataset from Hugging Face (may take a moment)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train/0.tar:   0%|          | 0.00/212M [00:00<?, ?B/s]

train/1.tar:   0%|          | 0.00/213M [00:00<?, ?B/s]

train/2.tar:   0%|          | 0.00/213M [00:00<?, ?B/s]

train/3.tar:   0%|          | 0.00/213M [00:00<?, ?B/s]

test/0.tar:   0%|          | 0.00/27.5M [00:00<?, ?B/s]

test/1.tar:   0%|          | 0.00/28.6M [00:00<?, ?B/s]

test/2.tar:   0%|          | 0.00/28.6M [00:00<?, ?B/s]

test/3.tar:   0%|          | 0.00/27.8M [00:00<?, ?B/s]

test/4.tar:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Available splits: ['train', 'test']
Using split: test
DatasetDict({
    train: Dataset({
        features: ['__key__', '__url__', 'jpg', 'txt'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['__key__', '__url__', 'jpg', 'txt'],
        num_rows: 1000
    })
})
{'__key__': 's0000000', '__url__': '/root/.cache/huggingface/hub/datasets--clip-benchmark--wds_flickr8k/snapshots/652f8f3b6030420c97cb1a0a0a11da3191f6ced2/test/0.tar', 'jpg': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x335 at 0x7F96909310D0>, 'txt': 'The dogs are in the snow in front of a fence .\nThe dogs play on the snow .\nTwo brown dogs playfully fight in the snow .\nTwo brown dogs wrestle in the snow .\nTwo dogs playing in the snow .'}
Sample item keys: dict_keys(['__key__', '__url__', 'jpg', 'txt'])


NameError: name 'RANDOM_SEED' is not defined

In [None]:
# ================================================================
# DATASET EXTRACTION FUNCTIONS
# ================================================================

def extract_image(example):
    """Return the value stored under the example's 'jpg' key (the image)."""
    image = example["jpg"]
    return image

def extract_captions(example):
    """Return the non-blank lines of the 'txt' field, each stripped of surrounding whitespace.

    The field holds newline-separated captions; however many non-empty
    lines it contains are returned, in order.
    """
    captions = []
    for line in example["txt"].split("\n"):
        cleaned = line.strip()
        if cleaned:
            captions.append(cleaned)
    return captions

def extract_image_id(example, idx):
    """Return the example's '__key__' value, or 'image_<idx>' when that key is absent."""
    if "__key__" in example:
        return example["__key__"]
    return f"image_{idx}"


# ================================================================
# QWEN CAPTION GENERATION
# ================================================================

def caption_with_qwen(pil_img, max_new_tokens=None):
    """Generate a caption for one image with the loaded Qwen-VL model.

    Uses the module-level ``processor``, ``model`` and ``DEVICE``.

    Args:
        pil_img: image placed in the chat message as the "image" entry and
            handed to the processor (the pipeline passes the dataset's
            'jpg' field).
        max_new_tokens: upper bound on generated tokens. ``None`` (default)
            uses the configured ``MAX_NEW_TOKENS``; previously a literal 64
            was hard-coded here and the configured value was ignored.

    Returns:
        The decoded caption with surrounding whitespace stripped, or the
        literal string "error" when the decoded text is empty.
    """
    if max_new_tokens is None:
        max_new_tokens = MAX_NEW_TOKENS

    # Build Qwen-VL chat-style messages: one user turn with image + instruction
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_img},
                {"type": "text",  "text": "Describe this image in one sentence."},
            ],
        }
    ]

    # Render the chat template to a prompt string (tokenization happens below)
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Pull the image payloads back out of the messages; the video list is unused
    image_inputs, _ = process_vision_info(messages)

    # Videos are deliberately not passed so the processor does not attempt
    # video preprocessing.
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Move the prepared tensors to the configured device
    inputs = inputs.to(DEVICE)

    # Generate without tracking gradients
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens
        )

    # generate() returns prompt + continuation; drop the prompt tokens
    generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]

    # Decode the single sequence in the batch
    caption = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    # Fallback marker for empty model output
    if not caption:
        return "error"

    return caption


In [None]:
# Optional smoke test: caption the first example of the selected split
first_example = dataset[0]
img = first_example["jpg"]
print(img)
smoke_caption = caption_with_qwen(img)
print(smoke_caption)


<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x335 at 0x7CDB4B3B39E0>




Two Doberman dogs are playfully fighting in a snowy field, with one dog on the left appearing to be in control and the other on the right appearing to be defending itself.


In [None]:
# ================================================================
# METRICS FUNCTIONS
# ================================================================

def _meteor_tokens(text):
    """Lower-case ``text`` and split it into word and punctuation tokens."""
    import re  # local import: the notebook's import cell does not bring in `re`

    return re.findall(r"\w+|[^\w\s]", text.lower())


def compute_meteor(cand, refs):
    """METEOR score of one candidate caption against its reference captions.

    Current NLTK releases require pre-tokenized input for ``meteor_score``
    and raise TypeError for raw strings. The previous version passed raw
    strings and caught everything with a bare ``except``, so every sample
    silently scored 0.0. Both sides are now tokenized before scoring.

    Args:
        cand: candidate caption (string).
        refs: iterable of reference captions (strings).

    Returns:
        The METEOR score as a float. 0.0 when the candidate or all
        references are empty, or when scoring fails (a warning is emitted
        in that case so the failure is visible).
    """
    hypothesis = _meteor_tokens(cand)
    tokenized_refs = (_meteor_tokens(ref) for ref in refs)
    references = [tokens for tokens in tokenized_refs if tokens]
    if not hypothesis or not references:
        return 0.0

    try:
        return meteor_score(references, hypothesis)
    except Exception as exc:
        # Stay best-effort (e.g. missing WordNet data) but do not hide the cause
        import warnings

        warnings.warn(f"METEOR computation failed ({exc!r}); scoring 0.0")
        return 0.0

def compute_rouge_l(cand, refs):
    """Best ROUGE-L F-measure of the candidate over all references.

    Args:
        cand: candidate caption (string), scored as the prediction.
        refs: iterable of reference captions (strings), each scored as the target.

    Returns:
        The maximum ``rougeL`` F-measure across the references, or 0.0 when
        there are no references (previously ``max()`` raised ValueError on
        an empty sequence).
    """
    refs = list(refs)
    if not refs:
        return 0.0

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    return max(scorer.score(ref, cand)["rougeL"].fmeasure for ref in refs)

def compute_bertscore_batch(cands, refs):
    """BERTScore for each candidate against its best-matching reference.

    All (candidate, reference) pairs are flattened into one scoring call.
    For each candidate, the pair with the highest F1 is kept, and that
    pair's precision and recall are reported with it.

    Args:
        cands: list of candidate captions.
        refs: list, parallel to ``cands``, of lists of reference captions.

    Returns:
        (best_f, best_p, best_r): three lists of length ``len(cands)``.
        A candidate with no references keeps 0.0 in all three.

    Notes:
        Scoring uses ``rescale_with_baseline=True``, under which scores can
        be negative. The best pair is therefore tracked with a "seen" flag
        rather than by comparing against an initial 0, which previously
        reported 0/0/0 for candidates whose scores were all negative.
    """
    # Flatten to parallel pair lists, remembering which candidate each pair belongs to
    flat_cand, flat_refs, idx_map = [], [], []
    for i, (c, rs) in enumerate(zip(cands, refs)):
        for r in rs:
            flat_cand.append(c)
            flat_refs.append(r)
            idx_map.append(i)

    count = len(cands)
    best_p = [0.0] * count
    best_r = [0.0] * count
    best_f = [0.0] * count

    # Nothing to score: skip the model call entirely
    if not flat_cand:
        return best_f, best_p, best_r

    P, R, F = bertscore_score(flat_cand, flat_refs, lang="en", rescale_with_baseline=True)
    P, R, F = P.tolist(), R.tolist(), F.tolist()

    seen = [False] * count
    for p, r, f, i in zip(P, R, F, idx_map):
        # The first pair for a candidate always wins; later ones only on higher F1
        if not seen[i] or f > best_f[i]:
            seen[i] = True
            best_f[i] = f
            best_p[i] = p
            best_r[i] = r

    return best_f, best_p, best_r

In [None]:

# ================================================================
# MAIN LOOP
# ================================================================

print("Generating captions...")

# Start from empty accumulators so re-running this cell does not append a
# second copy of every row (the lists are otherwise created in another cell).
rows = []
candidate_list = []
refs_list = []

for idx in tqdm(indices):
    ex = dataset[idx]
    img = extract_image(ex)
    refs = extract_captions(ex)
    # Exactly five gt_* columns: truncate extras, pad missing ones with ""
    refs5 = refs[:5] + [""] * (5 - len(refs))

    caption = caption_with_qwen(img)
    print(caption)

    # Per-sample metrics; BERTScore is batched after the loop
    meteor = compute_meteor(caption, refs)
    rouge = compute_rouge_l(caption, refs)

    # Kept in row order so the batched BERTScore results can be merged by index
    candidate_list.append(caption)
    refs_list.append(refs)

    rows.append({
        "image_id": extract_image_id(ex, idx),
        "abliterated_caption": caption,
        "gt_1": refs5[0],
        "gt_2": refs5[1],
        "gt_3": refs5[2],
        "gt_4": refs5[3],
        "gt_5": refs5[4],
        "meteor": meteor,
        "rouge_l": rouge,
        # Placeholders filled in by the BERTScore cell
        "bert_f1": None,
        "bert_precision": None,
        "bert_recall": None,
    })


Generating captions...


  0%|          | 0/20 [00:00<?, ?it/s]



A person in a brown outfit is running on a dirt path, creating a cloud of dust behind them. The setting appears to be a backyard with grass, trees, and a white building in the background.
A young girl is playing in a water park, surrounded by colorful circular water features. She appears to be enjoying the water splashes.
The image depicts a bustling street scene in an Asian city, with people walking along a narrow alleyway lined with shops and restaurants. The architecture features traditional Chinese elements, such as red lanterns and signage, adding to the vibrant atmosphere. The ground is paved with stone tiles, and there are various items for sale, including
The image shows two young children playing in a living room. One child is holding a toy hose and appears to be pretending to use it, while the other child is standing and seems to be playing with a toy frog. The room has a brown couch and a framed picture on the wall.
A person is sandboarding down a sandy hill, leaving a trail

In [None]:
# ================================================================
# BERTScore Batch Processing
# ================================================================

print("Computing BERTScore...")
bert_f1_scores, bert_precisions, bert_recalls = compute_bertscore_batch(candidate_list, refs_list)

# rows and candidate_list were appended to together, so position i lines up
for position, row in enumerate(rows):
    row["bert_f1"] = bert_f1_scores[position]
    row["bert_precision"] = bert_precisions[position]
    row["bert_recall"] = bert_recalls[position]

Computing BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# ================================================================
# SAVE CSV
# ================================================================

print("Saving CSV:", OUTPUT_CSV)

# Column order of the output file: id, caption, five references, then metrics
fieldnames = (
    ["image_id", "abliterated_caption"]
    + ["gt_1", "gt_2", "gt_3", "gt_4", "gt_5"]
    + ["meteor", "rouge_l", "bert_f1", "bert_precision", "bert_recall"]
)

# newline="" lets the csv module control line endings itself
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print("DONE. Results saved to:", OUTPUT_CSV)

Saving CSV: qwen_flickr8k.csv
DONE. Results saved to: qwen_flickr8k.csv
