In [1]:
import os
import json
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from unsloth import FastLanguageModel
from transformers import AutoProcessor, TextStreamer
import nltk

nltk.download("punkt")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Standard import failed for UnslothDDPOTrainer: No module named 'UnslothDDPOTrainer'. Using tempfile instead!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def create_image_caption_dataset(
    image_folder: str,
    captions_json: str,
    caption_strategy: str = 'first'
) -> pd.DataFrame:
    with open(captions_json, 'r') as f:
        captions_data = json.load(f)

    data = []
    for filename, caption_list in captions_data.items():
        image_path = os.path.join(image_folder, filename)
        if not os.path.exists(image_path):
            continue
        try:
            image = Image.open(image_path).convert("RGB")
            caption = caption_list if caption_strategy == 'first' else random.choice(caption_list)
            data.append({"image": image, "caption": caption, "filename": filename})
        except Exception as e:
            print(f"[ERROR] Could not load {filename}: {e}")
    return pd.DataFrame(data)

In [6]:
model_id = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"  # change if needed
#unsloth/Pixtral-12B-2409-bnb-4bit

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    dtype = torch.float16,
    load_in_4bit = True,
    device_map = "auto"
)

model = FastLanguageModel.for_inference(model)
processor = AutoProcessor.from_pretrained(model_id)

==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A40. Num GPUs = 1. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [7]:
image_folder = "/workspace/data/filtered_images"
captions_json = "/workspace/data/merged_output.json"

df = create_image_caption_dataset(image_folder, captions_json)
print(f"Loaded {len(df)} image-caption pairs")

Loaded 451 image-caption pairs


In [8]:
print("📌 First 5 entries:")
print(df.head(), "\n")

print("📋 Column types:")
print(df.dtypes)

print("\n🔎 Size")
print(df.shape)

📌 First 5 entries:
                                               image  \
0  <PIL.Image.Image image mode=RGB size=1000x750 ...   
1  <PIL.Image.Image image mode=RGB size=1000x667 ...   
2  <PIL.Image.Image image mode=RGB size=1000x667 ...   
3  <PIL.Image.Image image mode=RGB size=1000x667 ...   
4  <PIL.Image.Image image mode=RGB size=1000x667 ...   

                                             caption    filename  
0  The vehicle exhibits significant damage, prima...  000001.jpg  
1  The vehicle exhibits significant damage. The f...  000002.jpg  
2  The vehicle exhibits significant damage includ...  000003.jpg  
3  The vehicle exhibits significant damage to one...  000004.jpg  
4  The vehicle exhibits significant damage with a...  000005.jpg   

📋 Column types:
image       object
caption     object
filename    object
dtype: object

🔎 Size
(451, 3)


In [31]:
def run_vlm_inference(prompt: str, image_index: int, df: pd.DataFrame) -> str:
    """
    Perform inference on a given image from the dataframe using a custom prompt.

    Args:
        prompt (str): The text prompt to use (can include mask tokens like <text_1>)
        image_index (int): The index of the image in the dataframe
        df (pd.DataFrame): DataFrame returned by create_image_caption_dataset

    Returns:
        str: The model's generated response
    """
    if image_index >= len(df):
        print("[ERROR] Image index out of bounds.")
        return None

    row = df.iloc[image_index]
    image = row["image"]
    filename = row["filename"]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image}
            ]
        }
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=input_text, return_tensors="pt").to("cuda")

    print(f"🔹 Image: {filename}")
    print(f"🧾 Prompt: {prompt}")

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.0,
        top_p=0.95
    )

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Only return the part generated after the prompt
    response = generated_text.replace(input_text, "").strip()
    print(f"📤 Output: {response}")
    print("-" * 80)

    return response

In [32]:
# Prompt 1: No Prompt (Image only, no textual instruction provided)
prompt_1 = ""
output_1 = run_vlm_inference(prompt_1, image_index=0, df=df)

# Prompt 2: Noisy Prompt
prompt_2 = "Describe &&damage 12 sedan drive’ this !!image."
output_2 = run_vlm_inference(prompt_2, image_index=0, df=df)

# Prompt 3: Hand-Crafted ("An image of...")
prompt_3 = "An image of a damaged car parked on the side of the road."
output_3 = run_vlm_inference(prompt_3, image_index=0, df=df)

# Prompt 4: Descriptive Prompt with Roleplay / Stylistic Instruction
prompt_4 = (
    "You are an insurance claims assessor. Provide a detailed description of the car’s condition."
)
output_4 = run_vlm_inference(prompt_4, image_index=0, df=df)

# Prompt 5: Masked Prompt
prompt_5 = (
    "This <part_1> of the car has <damage_type_1>. The severity appears to be <severity_1>. "
    "Additional notes: <text_1>."
)
output_5 = run_vlm_inference(prompt_5, image_index=0, df=df)

# Prompt 6: Format-Guided with Sample Answer Structure
prompt_6 = (
    "Describe using format - Damage Type: ___; Affected Part: ___; Severity: ___; Notes: ___"
)
output_6 = run_vlm_inference(prompt_6, image_index=0, df=df)


inference_outputs = [output_1, output_2, output_3, output_4, output_5, output_6]
ground_truths = df.iloc[0]['caption']

🔹 Image: 000001.jpg
🧾 Prompt: 
📤 Output: system
You are a helpful assistant.
user

assistant
The image shows a close-up of a red car with a flat tire. The tire appears to be completely deflated, and the rim is visible with some rust on the bolts. The car's fender is also damaged, with a noticeable dent or tear near the tire area. This could be due to an impact or a collision. It's important to replace the flat tire as soon as possible to avoid further damage to the vehicle and ensure safe driving conditions.
--------------------------------------------------------------------------------
🔹 Image: 000001.jpg
🧾 Prompt: Describe &&damage 12 sedan drive’ this !!image.
📤 Output: system
You are a helpful assistant.
user
Describe &&damage 12 sedan drive’ this !!image.
assistant
The image shows a red sedan with significant damage to its front left side, specifically the wheel well area. The tire appears to be in poor condition, with visible signs of wear and tear, including cracks and uneven t

In [11]:
from sentence_transformers import SentenceTransformer, util
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
import pandas as pd

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [12]:
def compute_cosine_similarity(reference_captions, generated_caption):
    try:
        total_score = 0.0
        for caption in reference_captions:
            ref_embed = sbert_model.encode(caption, convert_to_tensor=True)
            gen_embed = sbert_model.encode(generated_caption, convert_to_tensor=True)
            score = util.cos_sim(gen_embed, ref_embed).item()
            total_score += score
        avg_score = total_score / len(reference_captions) if reference_captions else 0.0
        return avg_score
    except Exception as e:
        print(f"Error computing cosine similarity: {e}")
        return 0.0

In [None]:
def compute_cider_spice_scores(reference_caption, generated_caption):
    refs = {0: [reference_caption if reference_caption else ""]}
    hypos = {0: [generated_caption if generated_caption else ""]}

    # print(f"Generated caption: {generated_caption}")
    # print(f"Generated hypos: {hypos}")

    ptb = PTBTokenizer()
    refs_tok = ptb.tokenize({i: [{"caption": c} for c in caps] for i, caps in refs.items()})
    hypos_tok = ptb.tokenize({i: [{"caption": hypos[i][0]}] for i in hypos})

    all_scores = {}

    for scorer, name in [(Cider(), "CIDEr"), (Spice(), "SPICE")]:
        try:
            avg_score, _ = scorer.compute_score(refs_tok, hypos_tok)
            if name == "SPICE":
                # SPICE returns dicts per image
                all_scores[name] = avg_score.get("All", {}).get("f", 0.0) if isinstance(avg_score, dict) else avg_score
            else:
                all_scores[name] = avg_score
        except Exception as e:
            print(f"[ERROR] {name} scoring failed: {e}")
            all_scores[name] = 0.0

    return all_scores


In [25]:
def evaluate_all_metrics(reference_caption, generated_caption):
    cider_spice_scores = compute_cider_spice_scores(reference_caption, generated_caption)
    cosine_sim = compute_cosine_similarity([reference_caption], generated_caption)

    return {
        "cosine_similarity": round(cosine_sim, 4),
        "CIDEr": round(cider_spice_scores.get("CIDEr", 0.0), 4),
        "SPICE": round(cider_spice_scores.get("SPICE", 0.0), 4)
    }

In [33]:
print(ground_truths)
print(inference_outputs)

The vehicle exhibits significant damage, primarily focusing on the wheel and surrounding bodywork. The tire is heavily deteriorated with extensive cracking across the sidewall and appears to be flat. The wheel itself shows signs of rust and corrosion. The car's body panel adjacent to the wheel has substantial rust and corrosion, with areas of missing paint and exposed metal, suggesting long-term neglect or damage.
["system\nYou are a helpful assistant.\nuser\n\nassistant\nThe image shows a close-up of a red car with a flat tire. The tire appears to be completely deflated, and the rim is visible with some rust on the bolts. The car's fender is also damaged, with a noticeable dent or tear near the tire area. This could be due to an impact or a collision. It's important to replace the flat tire as soon as possible to avoid further damage to the vehicle and ensure safe driving conditions.", "system\nYou are a helpful assistant.\nuser\nDescribe &&damage 12 sedan drive’ this !!image.\nassist

In [37]:
all_scores = []

for i, gen in enumerate(inference_outputs):
    print(f"\nPrompt {i+1} Output:\n{gen}\nReference:\n{ground_truths}")
    scores = evaluate_all_metrics(ground_truths, gen)
    all_scores.append(scores)


Prompt 1 Output:
system
You are a helpful assistant.
user

assistant
The image shows a close-up of a red car with a flat tire. The tire appears to be completely deflated, and the rim is visible with some rust on the bolts. The car's fender is also damaged, with a noticeable dent or tear near the tire area. This could be due to an impact or a collision. It's important to replace the flat tire as soon as possible to avoid further damage to the vehicle and ensure safe driving conditions.
Reference:
The vehicle exhibits significant damage, primarily focusing on the wheel and surrounding bodywork. The tire is heavily deteriorated with extensive cracking across the sidewall and appears to be flat. The wheel itself shows signs of rust and corrosion. The car's body panel adjacent to the wheel has substantial rust and corrosion, with areas of missing paint and exposed metal, suggesting long-term neglect or damage.
Generated caption: system
You are a helpful assistant.
user

assistant
The image

PTBTokenizer tokenized 72 tokens at 1811.49 tokens per second.
PTBTokenizer tokenized 98 tokens at 2691.02 tokens per second.
Parsing reference captions
Parsing test captions


SPICE evaluation took: 6.614 s

Prompt 2 Output:
system
You are a helpful assistant.
user
Describe &&damage 12 sedan drive’ this !!image.
assistant
The image shows a red sedan with significant damage to its front left side, specifically the wheel well area. The tire appears to be in poor condition, with visible signs of wear and tear, including cracks and uneven tread patterns. The rim is also showing signs of rust and wear, particularly around the center cap where the bolts are located. The paint on the car's body is chipped and peeling in several areas, indicating that the vehicle has been exposed to the elements for some time. The overall condition suggests that the car may have been involved in an accident or has been neglected for a while.
Reference:
The vehicle exhibits significant damage, primarily focusing on the wheel and surrounding bodywork. The tire is heavily deteriorated with extensive cracking across the sidewall and appears to be flat. The wheel itself shows signs of ru

PTBTokenizer tokenized 72 tokens at 2164.89 tokens per second.
PTBTokenizer tokenized 139 tokens at 3131.23 tokens per second.
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser

SPICE evaluation took: 12.93 s

Prompt 3 Output:
system
You are a helpful assistant.
user
An image of a damaged car parked on the side of the road.
assistant
The image shows a red car with significant damage to its front fender and tire. The tire appears to be flat, and there is visible wear or damage to the wheel and surrounding area of the fender. The paint on the car's body is chipped in several places, indicating that the vehicle has been involved in an accident or has sustained some form of impact.

If you encounter such a situation, it's important to ensure your safety first by moving to a safe location if possible, and then contacting local authorities or emergency services if necessary. Additionally, report the incident to your insurance company as soon as possible so they can assess the damage
Reference:
The vehicle exhibits significant damage, primarily focusing on the wheel and surrounding bodywork. The tire is heavily deteriorated with extensive cracking across the sidewall

PTBTokenizer tokenized 72 tokens at 2190.33 tokens per second.
PTBTokenizer tokenized 148 tokens at 4510.88 tokens per second.
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser

SPICE evaluation took: 21.20 s

Prompt 4 Output:
system
You are a helpful assistant.
user
You are an insurance claims assessor. Provide a detailed description of the car’s condition.
assistant
The image shows a red car with significant damage to its front fender and tire area. Here is a detailed assessment of the condition:

1. **Fender Damage**:
   - The front fender appears to be severely damaged, with visible cracks and dents.
   - There is a noticeable gap where the fender has been pushed inward, exposing the inner structure of the vehicle.
   - The paint on the fender is chipped in several areas, indicating that the impact was forceful enough to cause both physical damage and paint loss.

2. **Tire Condition**:
   - The tire is completely flat, as indicated
Reference:
The vehicle exhibits significant damage, primarily focusing on the wheel and surrounding bodywork. The tire is heavily deteriorated with extensive cracking across the sidewall and appears to be flat. The wheel itself

PTBTokenizer tokenized 72 tokens at 2138.28 tokens per second.
PTBTokenizer tokenized 141 tokens at 3884.28 tokens per second.
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser

SPICE evaluation took: 16.43 s

Prompt 5 Output:
system
You are a helpful assistant.
user
This <part_1> of the car has <damage_type_1>. The severity appears to be <severity_1>. Additional notes: <text_1>.
assistant
The part of the car shown in the image is the wheel well, and it appears to have significant damage. The damage type seems to be a large dent or deformation in the fender area, which could be due to an impact or collision. The severity of the damage looks quite severe as the dent extends into the wheel well and affects the structural integrity of the fender. Additional notes include that the paint on the fender is chipped in several areas, indicating previous damage or wear. The wheel itself also shows signs of wear, with visible rust on the bolts and some wear on the tire tread.
Reference:
The vehicle exhibits significant damage, primarily focusing on the wheel and surrounding bodywork. The tire is heavily deteriorated with extensive cracking across the sidewall and appears

PTBTokenizer tokenized 72 tokens at 2175.47 tokens per second.
PTBTokenizer tokenized 146 tokens at 3839.24 tokens per second.
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser

SPICE evaluation took: 18.53 s

Prompt 6 Output:
system
You are a helpful assistant.
user
Describe using format - Damage Type: ___; Affected Part: ___; Severity: ___; Notes: ___
assistant
Damage Type: Puncture; Affected Part: Tire; Severity: Moderate; Notes: The tire appears to have a significant puncture, likely causing it to be flat and unable to hold air. There is visible damage to the surrounding area of the wheel well, indicating that the puncture may have caused some internal damage as well.
Reference:
The vehicle exhibits significant damage, primarily focusing on the wheel and surrounding bodywork. The tire is heavily deteriorated with extensive cracking across the sidewall and appears to be flat. The wheel itself shows signs of rust and corrosion. The car's body panel adjacent to the wheel has substantial rust and corrosion, with areas of missing paint and exposed metal, suggesting long-term neglect or damage.
Generated caption: system
You are a helpful assistant.
user
Describe

PTBTokenizer tokenized 72 tokens at 2182.66 tokens per second.
PTBTokenizer tokenized 93 tokens at 2730.03 tokens per second.
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.

SPICE evaluation took: 9.417 s


In [38]:
score_df = pd.DataFrame(all_scores, index=[f"Prompt {i+1}" for i in range(len(inference_outputs))])
score_df

Unnamed: 0,cosine_similarity,CIDEr,SPICE
Prompt 1,0.5825,0.0,0.1867
Prompt 2,0.7303,0.0,0.2708
Prompt 3,0.5984,0.0,0.1782
Prompt 4,0.6295,0.0,0.1739
Prompt 5,0.8026,0.0,0.18
Prompt 6,0.5757,0.0,0.1127
