In [None]:
import os
import json
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from unsloth import FastLanguageModel
from transformers import AutoProcessor, TextStreamer
import nltk

# Fetch every NLTK resource the metric cells further down rely on: tokenizer
# models for word_tokenize, WordNet for METEOR, and a POS tagger for the
# noun-overlap proxy. Resource ids differ between NLTK releases (for example
# "punkt" vs "punkt_tab"), so both spellings are requested; with its default
# arguments nltk.download reports an unknown id and returns False rather than
# raising.
for _nltk_resource in (
    "punkt",
    "punkt_tab",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_nltk_resource)

In [None]:
def create_image_caption_dataset(
    image_folder: str,
    captions_json: str,
    caption_strategy: str = 'first'
) -> pd.DataFrame:
    """Build a DataFrame pairing each available image with a single caption.

    Args:
        image_folder: Directory that holds the image files.
        captions_json: Path to a JSON object mapping an image filename to its
            caption(s). A value may be a list of captions or one plain string.
        caption_strategy: ``'first'`` selects the first caption of a list; any
            other value selects one caption at random. Ignored when the JSON
            value is already a single string.

    Returns:
        DataFrame with the columns ``image`` (the picture converted to RGB),
        ``caption`` and ``filename``. The columns are present even when no
        row could be loaded. Filenames missing from ``image_folder`` are
        skipped silently; entries that fail to load (unreadable image, empty
        caption list, ...) are reported on stdout and skipped.
    """
    # Local import: the notebook's import cell does not bring `random` in.
    import random

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(captions_json, 'r', encoding='utf-8') as f:
        captions_data = json.load(f)

    columns = ["image", "caption", "filename"]
    data = []
    for filename, caption_list in captions_data.items():
        image_path = os.path.join(image_folder, filename)
        if not os.path.exists(image_path):
            continue
        try:
            # convert() returns a new image, so the source file handle can be
            # released as soon as the conversion is done.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")

            if isinstance(caption_list, str):
                # Single caption stored directly; indexing or random.choice
                # would yield one character instead of the caption.
                caption = caption_list
            elif caption_strategy == 'first':
                caption = caption_list[0]
            else:
                caption = random.choice(caption_list)

            data.append({"image": image, "caption": caption, "filename": filename})
        except Exception as e:
            print(f"[ERROR] Could not load {filename}: {e}")
    return pd.DataFrame(data, columns=columns)

In [None]:
model_id = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"  # change if needed
#unsloth/Pixtral-12B-2409-bnb-4bit

# Unsloth documents from_pretrained as returning the pair (model, tokenizer);
# unpacking it into more names fails with a ValueError.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    dtype = torch.float16,
    load_in_4bit = True,
    device_map = "auto"
)

# for_inference switches the given model to inference mode in place. Its
# return value is not relied on, so `model` can never be rebound to None on a
# release that returns nothing.
FastLanguageModel.for_inference(model)

# The processor bundles image preprocessing and tokenization for this model.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Locations of the image directory and of the caption JSON file.
image_folder = "/workspace/data/filtered_images"
captions_json = "/workspace/data/merged_output.json"

# Build the image/caption table using the default caption strategy.
df = create_image_caption_dataset(
    image_folder=image_folder,
    captions_json=captions_json,
)
num_pairs = len(df)
print(f"Loaded {num_pairs} image-caption pairs")

In [None]:
# Sanity-check the loaded table: a preview, the column dtypes, and its shape.
preview = df.head()
print("📌 First 5 entries:")
print(preview, "\n")

column_types = df.dtypes
print("📋 Column types:")
print(column_types)

table_shape = df.shape
print("\n🔎 Size")
print(table_shape)

In [None]:
from transformers import TextStreamer

# Shared streamer passed to model.generate in run_vlm_inference so generated
# text is printed as it is produced; skip_prompt=True asks TextStreamer not to
# echo the prompt itself.
streamer = TextStreamer(tokenizer, skip_prompt=True)

def run_vlm_inference(prompt: str, image_index: int, df: pd.DataFrame):
    """
    Run the vision-language model on one image of the dataframe and return its answer.

    The generated text is streamed to stdout while it is produced and is also
    returned, so callers can score it afterwards.

    Args:
        prompt (str): The text prompt to use (can include mask tokens like <text_1>)
        image_index (int): Positional index of the image in the dataframe;
            negative values count from the end, as with ``df.iloc``.
        df (pd.DataFrame): DataFrame returned by create_image_caption_dataset

    Returns:
        str | None: The decoded completion (prompt tokens and special tokens
        removed, surrounding whitespace stripped), or None when
        ``image_index`` does not address a row of ``df``.
    """
    # Reject indices outside both the positive and the negative iloc range.
    if not -len(df) <= image_index < len(df):
        print("[ERROR] Image index out of bounds.")
        return None

    row = df.iloc[image_index]
    image = row["image"]
    filename = row["filename"]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image}
            ]
        }
    ]

    # tokenize=False: the processor call below expects the rendered prompt
    # string, not token ids.
    input_text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    # Place the inputs on whichever device the model was loaded on instead of
    # assuming "cuda".
    inputs = processor(images=image, text=input_text, return_tensors="pt").to(model.device)

    print(f"🔹 Image: {filename}")
    print(f"🧾 Prompt: {prompt}")
    print("📤 Output:")

    output_ids = model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.0,
        top_p=0.95
    )

    print("-" * 80)

    # generate() returns prompt + completion; keep only the newly generated part.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = output_ids[:, prompt_length:]
    return processor.batch_decode(completion_ids, skip_special_tokens=True)[0].strip()


In [None]:
# The six prompting strategies under comparison, in evaluation order.
prompts = [
    # Prompt 1: No Prompt (Image only, no textual instruction provided)
    "",
    # Prompt 2: Noisy Prompt
    "Describe &&damage 12 sedan drive’ this !!image.",
    # Prompt 3: Hand-Crafted ("An image of...")
    "An image of a damaged car parked on the side of the road.",
    # Prompt 4: Descriptive Prompt with Roleplay / Stylistic Instruction
    "You are an insurance claims assessor. Provide a detailed description of the car’s condition.",
    # Prompt 5: Masked Prompt
    (
        "This <part_1> of the car has <damage_type_1>. The severity appears to be <severity_1>. "
        "Additional notes: <text_1>."
    ),
    # Prompt 6: Format-Guided with Sample Answer Structure
    "Describe using format - Damage Type: ___; Affected Part: ___; Severity: ___; Notes: ___",
]
prompt_1, prompt_2, prompt_3, prompt_4, prompt_5, prompt_6 = prompts

# Every prompt is evaluated against the same image (row 0 of the dataframe).
inference_outputs = [
    run_vlm_inference(prompt_text, image_index=0, df=df)
    for prompt_text in prompts
]
output_1, output_2, output_3, output_4, output_5, output_6 = inference_outputs

# Reference caption for that same image.
ground_truths = df.iloc[0]['caption']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; compute_cosine_similarity below uses it to encode
# the prediction and the reference.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_cosine_similarity(pred, ref):
    """Return the cosine similarity between the embeddings of ``pred`` and ``ref``.

    Both texts are encoded in one call to the module-level ``sbert_model``;
    the result is a plain Python float.
    """
    pred_vector, ref_vector = sbert_model.encode([pred, ref])
    similarity_matrix = cosine_similarity([pred_vector], [ref_vector])
    return float(similarity_matrix[0][0])

In [None]:
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score

def compute_meteor(pred, ref):
    """Return the METEOR score of ``pred`` against the single reference ``ref``.

    Both strings are tokenized with ``word_tokenize`` before scoring; the
    result is a plain Python float.
    """
    reference_tokens = word_tokenize(ref)
    hypothesis_tokens = word_tokenize(pred)
    return float(meteor_score([reference_tokens], hypothesis_tokens))

In [None]:
from pycocoevalcap.cider.cider import Cider
import os
import tempfile
import json

def compute_cider(pred, ref):
    """Score one prediction against one reference with the CIDEr scorer.

    Args:
        pred: Generated caption.
        ref: Reference caption.

    Returns:
        float: The corpus-level score returned by ``Cider.compute_score`` for
        this single prediction/reference pair.
    """
    # compute_score takes the {id: [caption, ...]} mappings directly, so no
    # intermediate files are written (and none can be left behind on error).
    references = {"0": [ref]}
    candidates = {"0": [pred]}

    # NOTE(review): CIDEr weights n-grams using statistics of the references
    # passed in; with a single reference the score may be degenerate —
    # confirm, or score all samples in one call.
    score, _ = Cider().compute_score(references, candidates)
    return float(score)

In [None]:
from nltk import word_tokenize, pos_tag

def compute_spice_like(pred, ref):
    """Return the Jaccard overlap of the nouns in ``pred`` and ``ref``.

    Nouns are the tokens whose POS tag starts with "NN". The ratio is rounded
    to three decimals; when neither text contains a noun the result is 0.0.
    """
    def _noun_set(text):
        # Tokenize, tag, and keep only the noun tokens.
        tagged_tokens = pos_tag(word_tokenize(text))
        return {token for token, tag in tagged_tokens if tag.startswith("NN")}

    pred_nouns = _noun_set(pred)
    ref_nouns = _noun_set(ref)

    shared = pred_nouns & ref_nouns
    combined = pred_nouns | ref_nouns
    # Guard against division by zero when both noun sets are empty.
    denominator = len(combined) if combined else 1
    return round(len(shared) / denominator, 3)

In [None]:
def evaluate_inference_metrics(pred, ref):
    """Compute every caption metric for one prediction/reference pair.

    Returns a dict keyed by metric name, in the order: cosine similarity,
    METEOR, CIDEr, and the noun-overlap SPICE proxy.
    """
    metric_functions = (
        ("cosine_similarity", compute_cosine_similarity),
        ("meteor", compute_meteor),
        ("cider", compute_cider),
        ("spice_proxy", compute_spice_like),
    )
    return {
        metric_name: metric_fn(pred, ref)
        for metric_name, metric_fn in metric_functions
    }

In [None]:
# Show the reference caption (taken from row 0 of df) that the generated
# outputs are scored against.
print(ground_truths)

In [None]:
# Score each generated output against the reference caption, echoing both so
# the numbers can be read next to the text they describe.
all_scores = []
for prompt_number, generated_text in enumerate(inference_outputs, start=1):
    print(f"\nPrompt {prompt_number} Output:\n{generated_text}\nReference:\n{ground_truths}")
    all_scores.append(evaluate_inference_metrics(generated_text, ground_truths))

import pandas as pd
# Derive the row labels from the number of scored outputs instead of a fixed
# count, so the index length always matches all_scores.
score_df = pd.DataFrame(
    all_scores,
    index=[f"Prompt {position}" for position in range(1, len(all_scores) + 1)],
)
score_df