In [1]:
import os
import json
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from unsloth import FastLanguageModel
from transformers import AutoProcessor, TextStreamer
import nltk

nltk.download("punkt")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothCPOTrainer: No module named 'UnslothCPOTrainer'. Using tempfile instead!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def create_image_caption_dataset(
    image_folder: str,
    captions_json: str,
    caption_strategy: str = 'first'
) -> pd.DataFrame:
    with open(captions_json, 'r') as f:
        captions_data = json.load(f)

    data = []
    for filename, caption_list in captions_data.items():
        image_path = os.path.join(image_folder, filename)
        if not os.path.exists(image_path):
            continue
        try:
            image = Image.open(image_path).convert("RGB")
            caption = caption_list if caption_strategy == 'first' else random.choice(caption_list)
            data.append({"image": image, "caption": caption, "filename": filename})
        except Exception as e:
            print(f"[ERROR] Could not load {filename}: {e}")
    return pd.DataFrame(data)

In [3]:
model_id = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"  # change if needed
#unsloth/Pixtral-12B-2409-bnb-4bit

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    dtype = torch.float16,
    load_in_4bit = True,
    device_map = "auto"
)

model = FastLanguageModel.for_inference(model)
processor = AutoProcessor.from_pretrained(model_id)

==((====))==  Unsloth 2025.4.7: Fast Qwen2_5_Vl patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A40. Num GPUs = 1. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.97G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [26]:
image_folder = "/workspace/data/test_dataset"
captions_json = "/workspace/data/test_set.json"

df = create_image_caption_dataset(image_folder, captions_json)
print(f"Loaded {len(df)} image-caption pairs")

Loaded 50 image-caption pairs


In [27]:
print("üìå First 5 entries:")
print(df.head(), "\n")

print("üìã Column types:")
print(df.dtypes)

print("\nüîé Size")
print(df.shape)

üìå First 5 entries:
                                               image  \
0  <PIL.Image.Image image mode=RGB size=1000x667 ...   
1  <PIL.Image.Image image mode=RGB size=1000x667 ...   
2  <PIL.Image.Image image mode=RGB size=1000x667 ...   
3  <PIL.Image.Image image mode=RGB size=1000x750 ...   
4  <PIL.Image.Image image mode=RGB size=1000x667 ...   

                                             caption    filename  
0  The car exhibits visible damage including a sc...  000481.jpg  
1  The car exhibits significant damage to the win...  000433.jpg  
2  The car exhibits a flat tire on the rear wheel...  000749.jpg  
3  The car exhibits significant damage. The rear ...  000541.jpg  
4  The car shows scratches along the side panel a...  000424.jpg   

üìã Column types:
image       object
caption     object
filename    object
dtype: object

üîé Size
(50, 3)


In [82]:
import torch
import time
from transformers import TextStreamer
import pandas as pd

streamer = TextStreamer(tokenizer, skip_prompt=True)

def run_vlm_inference(prompt: str, filename: str, df: pd.DataFrame):
    """
    Perform inference on a given image (by filename) from the dataframe using a custom prompt.

    Args:
        prompt (str): The prompt text (can include mask tokens)
        filename (str): Filename of the image in the DataFrame
        df (pd.DataFrame): DataFrame with 'filename' and 'image' columns

    Returns:
        Tuple[str, float, float]: (Generated output, inference time in seconds, VRAM used in GB)
    """
    row = df[df["filename"] == filename]
    if row.empty:
        print(f"[ERROR] No image found with filename: {filename}")
        return None, 0.0, 0.0

    row = row.iloc[0]
    image = row["image"]
    caption = row["caption"]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image}
            ]
        }
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=input_text, return_tensors="pt").to("cuda")

    torch.cuda.reset_peak_memory_stats()
    start_mem = torch.cuda.memory_allocated() / 1024 / 1024 / 1024  # in GB
    start_time = time.time()

    print(f"üîπ Image: {filename}")
    print(f"üßæ Prompt: {prompt}")
    print("üì§ Output:")
    
    # Perform inference
    output_ids = model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.0,
        top_p=0.95
    )

    end_time = time.time()
    end_mem = torch.cuda.max_memory_allocated() / 1024**3

    time_taken = round(end_time - start_time, 3)
    vram_used = round(end_mem - start_mem, 3)
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"‚è±Ô∏è Time taken: {time_taken} sec | üß† VRAM used: {vram_used} GB")
    print("-" * 80)

    return decoded_output, time_taken, vram_used

### Test Images

000404.jpg
000422.jpg
000433.jpg
000481.jpg
000520.jpg
000541.jpg
000698.jpg
000740.jpg
000869.jpg
000889.jpg

In [101]:
filename="000404.jpg"

row = df[df["filename"] == filename]
ground_truth = row.iloc[0]["caption"]

inference_outputs = []
vram_usages = []
inference_times = []

prompts = [
    "",  # Prompt 1: No Prompt
    "Describe &&damage 12 sedan drive‚Äô this !!image.",  # Noisy
    "An image of a damaged car parked on the side of the road.",  # Hand-crafted
    "You are an insurance claims assessor. Provide a detailed description of the car‚Äôs condition.",  # Roleplay
    "This <part_1> of the car has <damage_type_1>. The severity appears to be <severity_1>. Additional notes: <text_1>.",  # Masked
    "Describe using format - Damage Type: ___; Affected Part: ___; Severity: ___; Notes: ___"  # Format-Guided
]

for prompt in prompts:
    output, time_taken, vram = run_vlm_inference(prompt, filename, df=df)
    inference_outputs.append(output)
    vram_usages.append(vram)
    inference_times.append(time_taken)

üîπ Image: 000404.jpg
üßæ Prompt: 
üì§ Output:
The image shows the front end of a maroon car that has been involved in a collision. The hood is crumpled, and there are visible dents and damage to the grille and headlight area. The bumper appears to be broken or severely damaged. In the background, another vehicle can be seen with significant damage as well, indicating that this is likely a multi-vehicle accident. The scene suggests that emergency services may be on the way or have recently responded to the incident.<|im_end|>
‚è±Ô∏è Time taken: 5.515 sec | üß† VRAM used: 0.343 GB
--------------------------------------------------------------------------------
üîπ Image: 000404.jpg
üßæ Prompt: Describe &&damage 12 sedan drive‚Äô this !!image.
üì§ Output:
The image shows a maroon sedan that has sustained significant damage in what appears to be a collision. The front end of the car is severely compromised, with the hood crumpled and the bumper completely detached. The grille is be

In [85]:
from sentence_transformers import SentenceTransformer, util
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
import pandas as pd

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [86]:
def compute_cosine_similarity(reference_captions, generated_caption):
    try:
        total_score = 0.0
        for caption in reference_captions:
            ref_embed = sbert_model.encode(caption, convert_to_tensor=True)
            gen_embed = sbert_model.encode(generated_caption, convert_to_tensor=True)
            score = util.cos_sim(gen_embed, ref_embed).item()
            total_score += score
        avg_score = total_score / len(reference_captions) if reference_captions else 0.0
        return avg_score
    except Exception as e:
        print(f"Error computing cosine similarity: {e}")
        return 0.0

In [107]:
def compute_cider_spice_scores(reference_caption, generated_caption):
    refs = {0: [reference_caption if reference_caption else ""]}
    hypos = {0: [generated_caption if generated_caption else ""]}

    # print(f"Generated caption: {generated_caption}")
    # print(f"Generated hypos: {hypos}")

    ptb = PTBTokenizer()
    refs_tok = ptb.tokenize({i: [{"caption": c} for c in caps] for i, caps in refs.items()})
    hypos_tok = ptb.tokenize({i: [{"caption": hypos[i][0]}] for i in hypos})

    all_scores = {}

    for scorer, name in [(Cider(), "CIDEr"), (Spice(), "SPICE")]:
        try:
            avg_score, _ = scorer.compute_score(refs_tok, hypos_tok)
            if name == "SPICE":
                # SPICE returns dicts per image
                all_scores[name] = avg_score.get("All", {}).get("f", 0.0) if isinstance(avg_score, dict) else avg_score
            else:
                all_scores[name] = avg_score
        except Exception as e:
            print(f"[ERROR] {name} scoring failed: {e}")
            all_scores[name] = 0.0

    return all_scores

In [118]:
from cider import Cider

PICKLE_PATH = "/workspace/data/cardd-df.p"

def compute_cider2_score(reference_caption, generated_caption):
    """
    Computes CIDEr score using new evaluate_cider logic with precomputed DF pickle.

    Args:
        reference_caption (str or list): Ground truth caption(s)
        generated_caption (str): Model output

    Returns:
        float: CIDEr score (averaged if multiple refs)
    """
    refs = {"0": reference_caption if isinstance(reference_caption, list) else [reference_caption]}
    hypos = [{"image_id": "0", "caption": [generated_caption]}]  # Fix: caption should be a list

    cider = Cider()
    score, _ = cider.compute_score(refs, hypos, PICKLE_PATH)

    return score

In [116]:
def evaluate_all_metrics(reference_caption, generated_caption):
    cider_spice_scores = compute_cider_spice_scores(reference_caption, generated_caption)
    cosine_sim = compute_cosine_similarity([reference_caption], generated_caption)
    cider2 = compute_cider2_score(reference_caption, generated_caption)

    return {
        "cosine_similarity": round(cosine_sim, 4),
        #"CIDEr": round(cider_spice_scores.get("CIDEr", 0.0), 4),
        "SPICE": round(cider_spice_scores.get("SPICE", 0.0), 4),
        "CIDEr": round(cider2, 4)
    }

In [110]:
print(ground_truth)
print(inference_outputs)
print(vram_usages)
print(inference_times)

The car exhibits significant damage. The front hood area shows a large dent with visible deformation and creasing. The headlight on the passenger's side is broken, with the lamp housing shattered and the bulb exposed. The front bumper also has a dent near the grille, and the grille itself appears to be slightly misaligned. The passenger side headlight is intact but the surrounding area has a dent. The tire on the driver's side appears to be flat. No scratches, cracks, or shattered glass are present. The overall condition of the car is severely compromised due to these damages.
['system\nYou are a helpful assistant.\nuser\n\nassistant\nThe image shows the front end of a maroon car that has been involved in a collision. The hood is crumpled, and there are visible dents and damage to the grille and headlight area. The bumper appears to be broken or severely damaged. In the background, another vehicle can be seen with significant damage as well, indicating that this is likely a multi-vehic

In [119]:
all_scores = []

for i, gen in enumerate(inference_outputs):
    print(f"\nPrompt {i+1} Output:\n{gen}\nReference:\n{ground_truth}")
    scores = evaluate_all_metrics(ground_truth, gen)
    all_scores.append(scores)


Prompt 1 Output:
system
You are a helpful assistant.
user

assistant
The image shows the front end of a maroon car that has been involved in a collision. The hood is crumpled, and there are visible dents and damage to the grille and headlight area. The bumper appears to be broken or severely damaged. In the background, another vehicle can be seen with significant damage as well, indicating that this is likely a multi-vehicle accident. The scene suggests that emergency services may be on the way or have recently responded to the incident.
Reference:
The car exhibits significant damage. The front hood area shows a large dent with visible deformation and creasing. The headlight on the passenger's side is broken, with the lamp housing shattered and the bulb exposed. The front bumper also has a dent near the grille, and the grille itself appears to be slightly misaligned. The passenger side headlight is intact but the surrounding area has a dent. The tire on the driver's side appears to be

PTBTokenizer tokenized 112 tokens at 2061.73 tokens per second.
PTBTokenizer tokenized 99 tokens at 2409.41 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6 sec].
Loading classif

SPICE evaluation took: 15.30 s

Prompt 2 Output:
system
You are a helpful assistant.
user
Describe &&damage 12 sedan drive‚Äô this !!image.
assistant
The image shows a maroon sedan that has sustained significant damage in what appears to be a collision. The front end of the car is severely compromised, with the hood crumpled and the bumper completely detached. The grille is bent and broken, and the headlights are shattered, indicating severe impact. The hood is lifted, revealing the engine compartment, which also seems to have been affected by the crash. The overall condition of the vehicle suggests it was involved in a high-impact accident, resulting in extensive damage to the front section of the car.
Reference:
The car exhibits significant damage. The front hood area shows a large dent with visible deformation and creasing. The headlight on the passenger's side is broken, with the lamp housing shattered and the bulb exposed. The front bumper also has a dent near the grille, and the 

PTBTokenizer tokenized 112 tokens at 2714.85 tokens per second.
PTBTokenizer tokenized 125 tokens at 2589.70 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5 sec].
Loading classif

SPICE evaluation took: 16.42 s

Prompt 3 Output:
system
You are a helpful assistant.
user
An image of a damaged car parked on the side of the road.
assistant
The image shows a maroon-colored car that has sustained significant damage, likely from a collision. The front end of the vehicle is severely compromised, with the hood crumpled and the bumper and grille visibly bent or broken. The headlights appear to be intact but may have been affected by the impact. There is another vehicle partially visible in the background, suggesting that this might be the aftermath of a multi-vehicle accident.

If you were involved in an accident like this, it's important to follow these steps:

1. **Ensure Safety**: Move to a safe location if possible, and call emergency services if there are injuries.
2. **
Reference:
The car exhibits significant damage. The front hood area shows a large dent with visible deformation and creasing. The headlight on the passenger's side is broken, with the lamp housing sh

PTBTokenizer tokenized 112 tokens at 2745.30 tokens per second.
PTBTokenizer tokenized 146 tokens at 3228.36 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.0 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.4 sec].
Loading classif

SPICE evaluation took: 19.60 s

Prompt 4 Output:
system
You are a helpful assistant.
user
You are an insurance claims assessor. Provide a detailed description of the car‚Äôs condition.
assistant
The image shows a maroon-colored sedan that has sustained significant damage in what appears to be a collision. Here is a detailed assessment of the car's condition:

1. **Front End Damage:**
   - The front bumper and grille are severely damaged, with visible cracks and deformations.
   - The headlights are shattered, indicating they have been broken by impact or have been damaged beyond repair.
   - The hood is crumpled and dented, particularly on the left side near the windshield area, suggesting a forceful impact from the front.

2. **Bodywork:**
   - The hood exhibits extensive dents and scratches,
Reference:
The car exhibits significant damage. The front hood area shows a large dent with visible deformation and creasing. The headlight on the passenger's side is broken, with the lamp housin

PTBTokenizer tokenized 112 tokens at 1812.39 tokens per second.
PTBTokenizer tokenized 141 tokens at 3275.11 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6 sec].
Loading classif

SPICE evaluation took: 18.53 s

Prompt 5 Output:
system
You are a helpful assistant.
user
This <part_1> of the car has <damage_type_1>. The severity appears to be <severity_1>. Additional notes: <text_1>.
assistant
The front part of the car in the image has significant damage, specifically to the hood and the front bumper. The severity of the damage appears to be moderate to severe, as there are noticeable dents and deformations on the hood and the front grille area. Additionally, the headlight is cracked, indicating further damage. The additional notes suggest that the car may require repairs or replacement parts for the damaged components.
Reference:
The car exhibits significant damage. The front hood area shows a large dent with visible deformation and creasing. The headlight on the passenger's side is broken, with the lamp housing shattered and the bulb exposed. The front bumper also has a dent near the grille, and the grille itself appears to be slightly misaligned. The passenger 

PTBTokenizer tokenized 112 tokens at 1858.05 tokens per second.
PTBTokenizer tokenized 108 tokens at 2396.24 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6 sec].
Loading classif

SPICE evaluation took: 15.85 s

Prompt 6 Output:
system
You are a helpful assistant.
user
Describe using format - Damage Type: ___; Affected Part: ___; Severity: ___; Notes: ___
assistant
Damage Type: Bumper and Front End; Affected Part: Bumper, Grille, Headlight, and Hood; Severity: Moderate; Notes: The vehicle has significant dents and damage to the front bumper, grille, headlight, and hood. The paint is chipped in several areas, indicating a moderate level of impact.
Reference:
The car exhibits significant damage. The front hood area shows a large dent with visible deformation and creasing. The headlight on the passenger's side is broken, with the lamp housing shattered and the bulb exposed. The front bumper also has a dent near the grille, and the grille itself appears to be slightly misaligned. The passenger side headlight is intact but the surrounding area has a dent. The tire on the driver's side appears to be flat. No scratches, cracks, or shattered glass are present. The overa

PTBTokenizer tokenized 112 tokens at 2572.20 tokens per second.
PTBTokenizer tokenized 90 tokens at 1851.71 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6 sec].
Loading classif

SPICE evaluation took: 12.06 s


In [120]:
score_df = pd.DataFrame(all_scores, index=[f"Prompt {i+1}" for i in range(len(inference_outputs))])
score_df

Unnamed: 0,cosine_similarity,SPICE,CIDEr
Prompt 1,0.6954,0.1481,1.9053
Prompt 2,0.6801,0.1915,2.5253
Prompt 3,0.7187,0.1237,1.8957
Prompt 4,0.7304,0.26,1.8604
Prompt 5,0.7942,0.2198,2.4488
Prompt 6,0.6262,0.1111,0.8573


In [121]:
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Alignment
from openpyxl.drawing.image import Image as ExcelImage
from openpyxl.utils import get_column_letter
import os

def log_prompt_metrics_to_excel(
    filepath: str,
    model_name: str,
    image_number: str,
    original_caption: str,
    image_path: str,
    prompt_outputs: list,
    scores: list,
    vram_usages: list,
    inference_times: list
):
    if not os.path.exists(filepath):
        wb = Workbook()
        ws = wb.active
        ws.title = "Prompt Metrics"
        headers = [
            "Model", "Image Number", "Image", "Original Caption",
            "prompt", "output", "cosine_similarity", "CIDEr", "SPICE", "time", "vram"
        ]
        ws.append(headers)
    else:
        wb = load_workbook(filepath)
        ws = wb["Prompt Metrics"]

    start_row = ws.max_row + 1
    end_row = start_row + len(prompt_outputs) - 1

    # Write prompt rows
    for i in range(len(prompt_outputs)):
        row_idx = start_row + i
        ws.cell(row=row_idx, column=5, value=f"output_{i+1}")
        ws.cell(row=row_idx, column=6, value=prompt_outputs[i])
        ws.cell(row=row_idx, column=7, value=scores[i].get("cosine_similarity"))
        ws.cell(row=row_idx, column=8, value=scores[i].get("CIDEr"))
        ws.cell(row=row_idx, column=9, value=scores[i].get("SPICE"))
        ws.cell(row=row_idx, column=10, value=inference_times[i])
        ws.cell(row=row_idx, column=11, value=vram_usages[i])
        ws.cell(row=row_idx, column=6).alignment = Alignment(wrap_text=True, vertical="top")

    # Merge vertically for columns 1‚Äì4
    merge_cols = {
        1: model_name,
        2: image_number,
        4: original_caption,
    }

    for col, value in merge_cols.items():
        ws.merge_cells(start_row=start_row, start_column=col, end_row=end_row, end_column=col)
        cell = ws.cell(row=start_row, column=col)
        cell.value = value
        cell.alignment = Alignment(wrap_text=True, vertical="top")

    # Column 3 (Image): insert image and merge
    ws.merge_cells(start_row=start_row, start_column=3, end_row=end_row, end_column=3)
    if os.path.exists(image_path):
        try:
            img = ExcelImage(image_path)
            img.width = 150
            img.height = 150
            ws.add_image(img, f"C{start_row}")
        except Exception as e:
            print(f"[ERROR] Could not insert image: {e}")

    # Auto-width adjustment
    for col in range(1, 12):
        col_letter = get_column_letter(col)
        ws.column_dimensions[col_letter].width = 30 if col in [4, 6] else 15

    wb.save(filepath)
    print(f"[SAVED] Metrics logged with merged vertical cells to {filepath}")

In [122]:
log_prompt_metrics_to_excel(
    filepath="Ablation_Prompt_Results.xlsx",
    model_name=model_id,
    image_number=filename,
    original_caption=df.iloc[0]["caption"],
    image_path=os.path.join("/workspace/data/test_dataset", df.iloc[0]["filename"]),
    prompt_outputs=inference_outputs,
    scores=all_scores,
    vram_usages=vram_usages,
    inference_times=inference_times
)

[SAVED] Metrics logged with merged vertical cells to Ablation_Prompt_Results.xlsx
