In [6]:
import glob
import json
import os
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import torch
import yaml
from PIL import Image
from nuscenes.nuscenes import NuScenes
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor
)

# Hugging Face checkpoint used to caption the object crops.
model_id = "Ertugrul/Qwen2.5-VL-7B-Captioner-Relaxed"

# nuScenes location and split. The defaults are the original hard-coded values;
# set NUSCENES_DATAROOT / NUSCENES_VERSION to run on another machine or split
# without editing the notebook.
DATA_ROOT = os.environ.get('NUSCENES_DATAROOT', '/users/bangya/projects/vlm/nuscenes-data')
DATA_VER = os.environ.get('NUSCENES_VERSION', 'v1.0-trainval')

# Loading the tables takes a while (see the timing printed by verbose=True).
nusc = NuScenes(version=DATA_VER, dataroot=DATA_ROOT, verbose=True)

# The model requires more than 16 GB of VRAM.
# If you have less, you can use bitsandbytes to quantize the model to 8-bit or 4-bit.


Loading NuScenes tables for version v1.0-trainval...
23 category,
8 attribute,
4 visibility,
64386 instance,
12 sensor,
10200 calibrated_sensor,
2631083 ego_pose,
68 log,
850 scene,
34149 sample,
2631083 sample_data,
1166187 sample_annotation,
4 map,
Done loading in 22.551 seconds.
Reverse indexing ...
Done reverse indexing in 7.1 seconds.


In [7]:

# Keyword arguments for loading the captioning checkpoint.
model_load_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # "flash_attention_2" is meant for Ampere or newer GPUs; switch to "eager" on older GPUs.
    attn_implementation="flash_attention_2",
)
model = AutoModelForImageTextToText.from_pretrained(model_id, **model_load_kwargs)

# Pixel budget passed to the processor. Lowering these reduces compute cost
# at the expense of caption quality.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28

processor = AutoProcessor.from_pretrained(
    model_id,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)

def generate_description(path, model, processor, prompt):
    """Caption the image at ``path`` with a chat-style vision-language model.

    Args:
        path: Location of the image file; it is opened and converted to RGB.
        model: Model object exposing ``generate`` and ``device``.
        processor: Processor exposing ``apply_chat_template``, ``batch_decode``
            and a call interface that accepts ``text`` and ``images``.
        prompt: User instruction placed before the image in the user turn.

    Returns:
        The decoded text of the first (and only) generated sequence, with the
        prompt tokens removed.
    """
    system_message = "You are an expert image labeller."
    rgb_image = Image.open(path).convert("RGB")

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": rgb_image},
        ],
    }
    chat_text = processor.apply_chat_template(
        [system_turn, user_turn], tokenize=False, add_generation_prompt=True
    )

    model_inputs = processor(
        text=[chat_text],
        images=rgb_image,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # min_p and temperature are experimental sampling parameters; tune them as needed.
    sampled_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        min_p=0.1,
        do_sample=True,
        temperature=1.5,
    )

    # Each output sequence starts with its prompt tokens; keep only what follows.
    completions = []
    for prompt_ids, full_ids in zip(model_inputs.input_ids, sampled_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
def validate(caption):
    """Reject captions that talk about the picture instead of the object.

    The check is a case-sensitive substring search for "photo", "image" and
    "picture", in that order. The first hit is printed and ends the search.

    Args:
        caption: Caption text to inspect.

    Returns:
        ``False`` if any of the words occurs in ``caption``, otherwise ``True``.
    """
    offending = next(
        (term for term in ("photo", "image", "picture") if term in caption),
        None,
    )
    if offending is None:
        return True
    print(f"bad word: {offending}")
    return False

def captioning(img_path, verbose=False):
    """Generate a short, normalised caption for one object crop.

    The file stem of ``img_path`` is used as a nuScenes instance token; the last
    component of that instance's category name is passed to the model as a hint.

    Args:
        img_path: Path to the crop, named ``<instance_token>.<ext>``.
        verbose: If true, print the hint and show the crop with matplotlib.

    Returns:
        A lower-cased caption starting with "the", cut at the first period, or
        ``None`` when the crop is smaller than 28 px on either side, the model
        produced no usable text, or the caption fails ``validate``.
    """
    instance_token = img_path.split("/")[-1].split(".")[0]
    nusc_instance = nusc.get('instance', instance_token)
    category_name = nusc.get('category', nusc_instance['category_token'])['name']
    hint = category_name.split(".")[-1]

    # The parentheses are required: without them only the first literal is
    # assigned and the remaining lines are no-op expression statements, so the
    # examples, the hint and the word limit would never reach the model.
    prompt = (
        "Provide a short noun phrase captioning the single object in a street scene, "
        "such as 'black sedan with red logo' or 'man in a blue t-shirt and jeans'. "
        "Must illustrate object colors. "
        "Simply describe the object, do not judge on the image "
        f"(stick to hint {hint}). TEN words max."
    )

    # Open the crop only to inspect it; generate_description opens it again itself.
    with Image.open(img_path) as img:
        width, height = img.size
        # NOTE(review): 28 presumably matches the 28x28 patch factor used for
        # min_pixels/max_pixels -- confirm.
        if width < 28 or height < 28:
            print(f"image too small: {img_path}")
            return None

        if verbose:
            # Drop the figure left over from the previous call, if any.
            plt.close()
            print(f"hint: {hint}")
            plt.axis('off')
            plt.imshow(img)

    description = generate_description(img_path, model, processor, prompt)

    description = description.lower().strip()
    # Replace the stand-alone article "a" only. A plain str.replace("a ", ...)
    # also rewrites every word that ends in "a" ("toyota sedan" -> "toyotthe sedan").
    cap = re.sub(r"(?<!\S)a(?=\s)", "the", description)
    cap = cap.split(".")[0].strip()
    if not cap:
        print(f"empty caption for: {img_path}")
        return None
    if not cap.startswith("the"):
        cap = "the " + cap

    if not validate(cap):
        print(f"invalid caption: {cap}")
        return None

    return cap



In [None]:
root = "./instance_crops"
dump_path = "./captions.yaml"
# Sorted so that the processing order is the same on every run.
scenes = sorted(glob.glob(f"{root}/*"))

# Resume support: scenes already present in the dump file are skipped.
# On the first run the file does not exist, and an empty file makes
# safe_load return None; both cases mean "nothing processed yet".
try:
    with open(dump_path, "r") as f:
        history = yaml.safe_load(f) or {}
except FileNotFoundError:
    history = {}

for sc in scenes:
    sc_name = sc.split("/")[-1]
    if sc_name in history:
        print(f"Already processed {sc_name}, skipping...")
        continue

    images = sorted(glob.glob(f"{sc}/*"))
    if len(images) == 0:
        print(f"No images found in the directory: {sc}")
        continue

    print(f"Processing {len(images)} images in {sc_name}...")
    sc_objs = {}
    for image_path in images:
        # The file stem is the key under which the caption is stored.
        obj = image_path.split("/")[-1].split(".")[0]
        cap = captioning(image_path)
        if cap is None:
            print(f"Failed to caption image: {image_path}")
            continue
        sc_objs[obj] = cap

    # Append one top-level mapping per scene so an interrupted run keeps
    # everything finished so far.
    with open(dump_path, "a") as f:
        yaml.dump({sc_name: sc_objs}, f, default_flow_style=False, width=float("inf"))



Already processed 8180a1dbbba3479bb0c7f4ff6e9a3f0e, skipping...
Already processed 9c83e438973e4824853e6a38928ca4ff, skipping...
Already processed c6da5920d2bc459bb3bc51ee6b9fdd7e, skipping...
Already processed f5b29a1e09d04355adcd60ab72de006b, skipping...
Already processed 16be583c31a2403caa6c158bb55ae616, skipping...
Already processed 01796494e25c448dadaba70cfcc3532d, skipping...
Already processed 73d9a36f20594e658020ebfc5b0ba74a, skipping...
Already processed a99120daccb24bcd941b33e6e03bf718, skipping...
Already processed 9c5dc664216e43a99d5da3f23d373e4d, skipping...
Already processed 8b60e7e47cde43e0a7fbd66d926de14f, skipping...
Already processed 63252988084b4943a50734990c47e039, skipping...
Already processed 3bc4553925494890a21ef7a15c40eaed, skipping...
Already processed 4431d97ea17044ad9c09c13d16684054, skipping...
Already processed 2c96ff6afc9e4cf7bc8b107fce955c1f, skipping...
Already processed 7bd098ac88cb4221addd19202a7ea5de, skipping...
Already processed bd850592cd2541288b177b



bad word: image
invalid caption: the blurry black-and-white street scene, with indistinct shapes and lines suggesting parked cars, buildings, or pedestrians, and the overall image is grainy, making specific details difficult to discern
Failed to caption image: ./instance_crops/fe4fdd7a28754baeac7074ad78f55a52/e455d1eee8fe4bad80dec39c49d2915a.jpg
Processing 33 images in 1d914f73a4a243c3acac50d24f083aac...
Processing 11 images in bc219c0fa63b43b4b9dddab47fce1fef...
Processing 6 images in 53e8446852bf488bb1b09ae032918bbd...
Processing 4 images in f12b4e7fb95646ec9dac305076024bb9...
Processing 19 images in b0b26c1e5a1140e69598422f12ae1dc0...
Processing 9 images in 91f797db8fb34ae5b32ba85eecae47c9...
Processing 14 images in cdeb0e85e43e481a87b2251351e10c2f...
bad word: image
invalid caption: the blurry image of the car at night, likely taken with motion blur due to the vehicle's movement, on the dimly lit street with indistinct background elements like buildings and streetlights
Failed to c

: 