In [6]:
from unsloth import FastVisionModel, FastLanguageModel
import os
import json
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from transformers import AutoProcessor, TextStreamer
import nltk
from datasets import load_from_disk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-7B-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

==((====))==  Unsloth 2025.4.7: Fast Qwen2_Vl patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A40. Num GPUs = 1. Max memory: 44.339 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [4]:
model_id = "unsloth/Qwen2-VL-7B-Instruct"

model = FastVisionModel.for_inference(model)
processor = AutoProcessor.from_pretrained(model_id)

preprocessor_config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [7]:
test_dataset = load_from_disk("/workspace/data/filtered_dataset")

In [8]:
print("Filtered test Dataset:")
print(test_dataset)

print(f"Loaded {len(test_dataset)} image-caption entries\n")

print("📌 First 5 entries:")
first_5 = test_dataset.select(range(5))
for row in first_5:
    print(row, "\n")

print("📋 Column types:")
print(test_dataset.features)

print("\n🔎 Size")
print((len(test_dataset), len(test_dataset.features)))

Filtered test Dataset:
Dataset({
    features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
    num_rows: 10
})
Loaded 10 image-caption entries

📌 First 5 entries:
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x375 at 0x7994E6DC26E0>, 'caption': ['Three Caucasian hikers in summer clothes walk a dirt path through a wooded meadow during the day.', 'Three people walking down a muddy dirt road on a cloudy day.', 'Three people are walking down a dirt road in the country.', 'Three people walking on a trail in the middle of the day.', 'Three people are walking down a dirt path.'], 'sentids': ['12500', '12501', '12502', '12503', '12504'], 'split': 'train', 'img_id': '2500', 'filename': '182169366.jpg'} 

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=356x500 at 0x7994E63DD7E0>, 'caption': ['A waitress in a black T-shirt and black shorts at Cafe Mambo serves a tray of beverages to a seated customer in a red tank top and black bandanna.'

In [9]:
from collections import Counter

filename_counts = Counter(test_dataset["filename"])

# Print filenames with more than 1 occurrence
for filename, count in filename_counts.items():
    if count > 1:
        print(f"{filename}: {count} occurrences")

num_duplicates = sum(1 for count in filename_counts.values() if count > 1)
print(f"\nTotal filenames with duplicates: {num_duplicates}")


Total filenames with duplicates: 0


In [14]:
import torch
import time
from transformers import TextStreamer
import pandas as pd

streamer = TextStreamer(tokenizer, skip_prompt=True)

def run_vlm_inference(prompt: str, filename: str, df: pd.DataFrame):
    """
    Perform inference on a given image (by filename) from the dataframe using a custom prompt.

    Args:
        prompt (str): The prompt text (can include mask tokens)
        filename (str): Filename of the image in the DataFrame
        df (pd.DataFrame): DataFrame with 'filename' and 'image' columns

    Returns:
        Tuple[str, float, float]: (Generated output, inference time in seconds, VRAM used in GB)
    """
    row = df[df["filename"] == filename]
    if row.empty:
        print(f"[ERROR] No image found with filename: {filename}")
        return None, 0.0, 0.0

    row = row.iloc[0]
    image = row["image"]
    if isinstance(image, dict) and "bytes" in image:
        from PIL import Image
        from io import BytesIO
        image = Image.open(BytesIO(image["bytes"])).convert("RGB")
    caption = row["caption"]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image}
            ]
        }
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=input_text, return_tensors="pt").to("cuda")

    torch.cuda.reset_peak_memory_stats()
    start_mem = torch.cuda.memory_allocated() / 1024 / 1024 / 1024  # in GB
    start_time = time.time()

    print(f"🔹 Image: {filename}")
    print(f"🧾 Prompt: {prompt}")
    print("📤 Output:")

    # Perform inference
    output_ids = model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=64,
        use_cache=True,
        temperature=1.5,
        min_p=0.1,
    )

    end_time = time.time()
    end_mem = torch.cuda.max_memory_allocated() / 1024**3

    time_taken = round(end_time - start_time, 3)
    vram_used = round(end_mem - start_mem, 3)
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"⏱️ Time taken: {time_taken} sec | 🧠 VRAM used: {vram_used} GB")
    print("-" * 80)

    return decoded_output, time_taken, vram_used

In [15]:
prompt_set_0 = ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a nature trail guide. Describe what’s happening in this scene to someone preparing for their first family hike.",
               "There is a <text_1> walking on a <text_2> in this image.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_1= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a food critic observing customer service at cafes. Describe the interaction happening in this scene.",
               "A <text_1> is standing beside a <text_2> who is sitting down at a table.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_2= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a weather reporter documenting children’s routines during rainy mornings. Describe what’s going on in this image.",
               "<text_1> are walking with <text_2> in the rain.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_3= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a journalist covering a small-town parade. Describe the role and setting of the clown in this festive scene.",
               "A <text_1> with bright makeup and colorful clothes stands near a <text_2> during an event.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_4= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a winter sports photographer recounting a joyful moment you captured. Describe the scene with vivid detail.",
               "Two <text_1> are sitting on a snowy slope with their arms <text_2>.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_5= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a market researcher documenting traditional grocery stores. Describe what the boy is doing and what the store looks like.",
               "A <text_1> is behind a counter filled with <text_2> in a small shop.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_6= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are an IT workplace culture analyst. Describe the work environment and team dynamic shown in this image.",
               "A <text_1> is using a laptop at a desk while two <text_2> are in the background.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_7= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a canine trainer assessing a working dog’s field behavior. Describe the dog’s posture and role based on the scene..",
               "A <text_1> is wearing a vest and holding a <text_2> in its mouth.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_8= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a cultural anthropologist documenting traditional street labor. Describe what the person is doing and how they are carrying the items.",
               "A <text_1> is walking while carrying <text_2> suspended from a pole.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
prompt_set_9= ["",
              "How many people are playing soccer in this scene?",
               "An image of...",
               "You are a museum docent giving a tour. Describe the activity of the woman in the context of the art gallery.",
               "A <text_1> is painting a replica of an artwork in front of <text_2>.",
               "Describe using format - Subject:  \_\_\_; Activity:  \_\_\_; Environment:  \_\_\_; Additional Notes: \_\_\_;"
               ]
PROMPT_LIST = [prompt_set_0, prompt_set_1, prompt_set_2, prompt_set_3, prompt_set_4, prompt_set_5, prompt_set_6, prompt_set_7, prompt_set_8, prompt_set_9]

In [16]:
test_img_ids = [2500, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509]
df = test_dataset.to_pandas()

all_outputs = {}
all_times = {}
all_vram = {}

# Loop through each image and its corresponding prompt set
for i, img_id in enumerate(test_img_ids):
    row = test_dataset.filter(lambda x: x["img_id"] == str(img_id))
    if len(row) == 0:
        print(f"[WARNING] No image found with img_id {img_id}")
        continue

    filename = row[0]["filename"]
    prompts = PROMPT_LIST[i]

    print(f"\n🔍 Running prompts for image: {filename} (img_id: {img_id})\n" + "-"*60)

    inference_outputs = []
    inference_times = []
    vram_usages = []

    for prompt in prompts:
        output, time_taken, vram_used = run_vlm_inference(prompt, filename, df=df)
        inference_outputs.append(output)
        inference_times.append(time_taken)
        vram_usages.append(vram_used)

    # Save to overall dicts
    all_outputs[img_id] = inference_outputs
    all_times[img_id] = inference_times
    all_vram[img_id] = vram_usages


🔍 Running prompts for image: 182169366.jpg (img_id: 2500)
------------------------------------------------------------
🔹 Image: 182169366.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|box_start|>(625,312),(735,691)<|box_end|><|im_end|>
⏱️ Time taken: 79.079 sec | 🧠 VRAM used: 0.053 GB
--------------------------------------------------------------------------------
🔹 Image: 182169366.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There are four people playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.584 sec | 🧠 VRAM used: 0.193 GB
--------------------------------------------------------------------------------
🔹 Image: 182169366.jpg
🧾 Prompt: An image of...
📤 Output:
<|object

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 182184279.jpg (img_id: 2501)
------------------------------------------------------------
🔹 Image: 182184279.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|>Cafe Mambo 1994 Ibiza<|object_ref_end|><|box_start|>(56,399),(286,501)<|box_end|><|im_end|>
⏱️ Time taken: 1.274 sec | 🧠 VRAM used: 0.191 GB
--------------------------------------------------------------------------------
🔹 Image: 182184279.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There are two people playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.505 sec | 🧠 VRAM used: 0.193 GB
--------------------------------------------------------------------------------
🔹 Image: 182184279.jpg
🧾 Prompt: An image of...
📤 Output:
<|object_ref_start|>An image of...<|object_ref_end|><|box_start|>(0,0),(996,996)<|box_end|><|im_end|>
⏱️ Time taken: 1.002 sec | 🧠 VRAM used: 0.192 GB
--------------------------------------------------------------------------------
🔹 Image: 182184279.j

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 182246705.jpg (img_id: 2502)
------------------------------------------------------------
🔹 Image: 182246705.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|> they are walking away from the house<|object_ref_end|><|box_start|>(1,0),(995,993)<|box_end|><|im_end|>
⏱️ Time taken: 1.016 sec | 🧠 VRAM used: 0.191 GB
--------------------------------------------------------------------------------
🔹 Image: 182246705.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There are two people playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.499 sec | 🧠 VRAM used: 0.193 GB
--------------------------------------------------------------------------------
🔹 Image: 182246705.jpg
🧾 Prompt: An image of...
📤 Output:
<|object_ref_start|>An image of...<|object_ref_end|><|box_start|>(1,0),(995,993)<|box_end|><|im_end|>
⏱️ Time taken: 0.914 sec | 🧠 VRAM used: 0.192 GB
--------------------------------------------------------------------------------
🔹 Image

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 182396080.jpg (img_id: 2503)
------------------------------------------------------------
🔹 Image: 182396080.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|> a clown <|object_ref_end|><|box_start|>(179,15),(603,993)<|box_end|><|im_end|>
⏱️ Time taken: 1.146 sec | 🧠 VRAM used: 0.187 GB
--------------------------------------------------------------------------------
🔹 Image: 182396080.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There is no existence of people playing soccer in the image.<|im_end|>
⏱️ Time taken: 0.647 sec | 🧠 VRAM used: 0.189 GB
--------------------------------------------------------------------------------
🔹 Image: 182396080.jpg
🧾 Prompt: An image of...
📤 Output:
<|object_ref_start|> An image of...<|object_ref_end|><|box_start|>(1,0),(995,993)<|box_end|><|im_end|>
⏱️ Time taken: 1.045 sec | 🧠 VRAM used: 0.187 GB
--------------------------------------------------------------------------------
🔹 Image: 182396080.jp

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 182493240.jpg (img_id: 2504)
------------------------------------------------------------
🔹 Image: 182493240.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|box_start|>(455,687),(525,852)<|box_end|><|box_start|>(359,679),(443,858)<|box_end|><|im_end|>
⏱️ Time taken: 1.772 sec | 🧠 VRAM used: 0.191 GB
--------------------------------------------------------------------------------
🔹 Image: 182493240.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There are two people playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.496 sec | 🧠 VRAM used: 0.193 GB
--------------------------------------------------------------------------------
🔹 Image: 182493240.jpg
🧾 Prompt: An image of...
📤 Output:
<|object_ref_start|> An image of...<|object_ref_end|><|box_start|>(1,2),(995,996)<|box_end|><|im_end|>
⏱️ Time taken: 0.9 sec |

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 182503906.jpg (img_id: 2505)
------------------------------------------------------------
🔹 Image: 182503906.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|> a man is standing in front of a counter<|object_ref_end|><|box_start|>(0,335),(125,996)<|box_end|><|im_end|>
⏱️ Time taken: 1.148 sec | 🧠 VRAM used: 0.191 GB
--------------------------------------------------------------------------------
🔹 Image: 182503906.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There is one person playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.495 sec | 🧠 VRAM used: 0.193 GB
--------------------------------------------------------------------------------
🔹 Image: 182503906.jpg
🧾 Prompt: An image of...
📤 Output:
<|object_ref_start|> An image of...<|object_ref_end|><|box_start|>(1,0),(995,993)<|box_end|><|im_end|>
⏱️ Time taken: 0.907 sec | 🧠 VRAM used: 0.192 GB
--------------------------------------------------------------------------------
🔹 

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 1827287777.jpg (img_id: 2506)
------------------------------------------------------------
🔹 Image: 1827287777.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|box_start|>(145,203),(623,917)<|box_end|><|im_end|>
⏱️ Time taken: 1.432 sec | 🧠 VRAM used: 0.191 GB
--------------------------------------------------------------------------------
🔹 Image: 1827287777.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There are three people playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.507 sec | 🧠 VRAM used: 0.193 GB
--------------------------------------------------------------------------------
🔹 Image: 1827287777.jpg
🧾 Prompt: An image 

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 1827560917.jpg (img_id: 2507)
------------------------------------------------------------
🔹 Image: 1827560917.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|box_start|>(0,133),(999,989)<|box_end|><|im_end|>
⏱️ Time taken: 1.743 sec | 🧠 VRAM used: 0.195 GB
--------------------------------------------------------------------------------
🔹 Image: 1827560917.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There is one person playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.576 sec | 🧠 VRAM used: 0.197 GB
--------------------------------------------------------------------------------
🔹 Image:

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 182825639.jpg (img_id: 2508)
------------------------------------------------------------
🔹 Image: 182825639.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|> this is a person<|object_ref_end|><|box_start|>(429,58),(713,951)<|box_end|><|im_end|>
⏱️ Time taken: 0.996 sec | 🧠 VRAM used: 0.191 GB
--------------------------------------------------------------------------------
🔹 Image: 182825639.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There is one person playing soccer in this scene.<|im_end|>
⏱️ Time taken: 0.494 sec | 🧠 VRAM used: 0.193 GB
--------------------------------------------------------------------------------
🔹 Image: 182825639.jpg
🧾 Prompt: An image of...
📤 Output:
<|object_ref_start|> An image of...<|object_ref_end|><|box_start|>(1,0),(995,993)<|box_end|><|im_end|>
⏱️ Time taken: 0.9 sec | 🧠 VRAM used: 0.192 GB
--------------------------------------------------------------------------------
🔹 Image: 182825639.jpg
🧾 P

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]


🔍 Running prompts for image: 18284727.jpg (img_id: 2509)
------------------------------------------------------------
🔹 Image: 18284727.jpg
🧾 Prompt: 
📤 Output:
<|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|object_ref_start|><|box_start|>(475,417),(741,987)<|box_end|><|im_end|>
⏱️ Time taken: 1.359 sec | 🧠 VRAM used: 0.187 GB
--------------------------------------------------------------------------------
🔹 Image: 18284727.jpg
🧾 Prompt: How many people are playing soccer in this scene?
📤 Output:
There is no existence of people playing soccer in the image.<|im_end|>
⏱️ Time taken: 0.555 sec | 🧠 VRAM used: 0.189 GB
--------------------------------------------------------------------------------
🔹 Image: 18284727.jpg
🧾 Prompt: An image of...
📤 Output:
<|ob

In [17]:
for img_id in all_outputs:
    print(f"\n🖼️ Image ID: {img_id}")
    print("-" * 60)
    for i, (output, time_taken, vram_used) in enumerate(zip(all_outputs[img_id], all_times[img_id], all_vram[img_id])):
        print(f"🔹 Prompt {i+1}")
        print(f"📝 Output: {output}")
        print(f"⏱️ Inference Time: {time_taken:.3f} sec")
        print(f"💾 VRAM Used: {vram_used:.3f} GB")
        print("-" * 60)


🖼️ Image ID: 2500
------------------------------------------------------------
🔹 Prompt 1
📝 Output: system
You are a helpful assistant.
user

assistant
(625,312),(735,691)
⏱️ Inference Time: 79.079 sec
💾 VRAM Used: 0.053 GB
------------------------------------------------------------
🔹 Prompt 2
📝 Output: system
You are a helpful assistant.
user
How many people are playing soccer in this scene?
assistant
There are four people playing soccer in this scene.
⏱️ Inference Time: 0.584 sec
💾 VRAM Used: 0.193 GB
------------------------------------------------------------
🔹 Prompt 3
📝 Output: system
You are a helpful assistant.
user
An image of...
assistant
 An image of...(1,2),(995,996)
⏱️ Inference Time: 1.066 sec
💾 VRAM Used: 0.192 GB
------------------------------------------------------------
🔹 Prompt 4
📝 Output: system
You are a helpful assistant.
user
You are a nature trail guide. Describe what’s happening in this scene to someone preparing for their first family hike.
assistant
The im