In [184]:
import os
import json
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from unsloth import FastLanguageModel
from transformers import AutoProcessor, TextStreamer
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [185]:
def create_image_caption_dataset(
    image_folder: str,
    captions_json: str,
    caption_strategy: str = 'first'
) -> pd.DataFrame:
    with open(captions_json, 'r') as f:
        captions_data = json.load(f)

    data = []
    for filename, caption_list in captions_data.items():
        image_path = os.path.join(image_folder, filename)
        if not os.path.exists(image_path):
            continue
        try:
            image = Image.open(image_path).convert("RGB")
            caption = caption_list if caption_strategy == 'first' else random.choice(caption_list)
            data.append({"image": image, "caption": caption, "filename": filename})
        except Exception as e:
            print(f"[ERROR] Could not load {filename}: {e}")
    return pd.DataFrame(data)

In [3]:
model_id = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"  # change if needed
#unsloth/Pixtral-12B-2409-bnb-4bit

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    dtype = torch.float16,
    load_in_4bit = True,
    device_map = "auto"
)

model = FastLanguageModel.for_inference(model)
processor = AutoProcessor.from_pretrained(model_id)

==((====))==  Unsloth 2025.4.7: Fast Qwen2_5_Vl patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A40. Num GPUs = 1. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.97G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [26]:
image_folder = "/workspace/data/test_dataset"
captions_json = "/workspace/data/test_set.json"

df = create_image_caption_dataset(image_folder, captions_json)
print(f"Loaded {len(df)} image-caption pairs")

Loaded 50 image-caption pairs


In [186]:
print("📌 First 5 entries:")
print(df.head(), "\n")

print("📋 Column types:")
print(df.dtypes)

print("\n🔎 Size")
print(df.shape)

📌 First 5 entries:
                                               image  \
0  <PIL.Image.Image image mode=RGB size=1000x667 ...   
1  <PIL.Image.Image image mode=RGB size=1000x667 ...   
2  <PIL.Image.Image image mode=RGB size=1000x667 ...   
3  <PIL.Image.Image image mode=RGB size=1000x750 ...   
4  <PIL.Image.Image image mode=RGB size=1000x667 ...   

                                             caption    filename  
0  The car exhibits visible damage including a sc...  000481.jpg  
1  The car exhibits significant damage to the win...  000433.jpg  
2  The car exhibits a flat tire on the rear wheel...  000749.jpg  
3  The car exhibits significant damage. The rear ...  000541.jpg  
4  The car shows scratches along the side panel a...  000424.jpg   

📋 Column types:
image       object
caption     object
filename    object
dtype: object

🔎 Size
(50, 3)


In [187]:
import torch
import time
from transformers import TextStreamer
import pandas as pd

streamer = TextStreamer(tokenizer, skip_prompt=True)

def run_vlm_inference(prompt: str, filename: str, df: pd.DataFrame):
    """
    Perform inference on a given image (by filename) from the dataframe using a custom prompt.

    Args:
        prompt (str): The prompt text (can include mask tokens)
        filename (str): Filename of the image in the DataFrame
        df (pd.DataFrame): DataFrame with 'filename' and 'image' columns

    Returns:
        Tuple[str, float, float]: (Generated output, inference time in seconds, VRAM used in GB)
    """
    row = df[df["filename"] == filename]
    if row.empty:
        print(f"[ERROR] No image found with filename: {filename}")
        return None, 0.0, 0.0

    row = row.iloc[0]
    image = row["image"]
    caption = row["caption"]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image}
            ]
        }
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=input_text, return_tensors="pt").to("cuda")

    torch.cuda.reset_peak_memory_stats()
    start_mem = torch.cuda.memory_allocated() / 1024 / 1024 / 1024  # in GB
    start_time = time.time()

    print(f"🔹 Image: {filename}")
    print(f"🧾 Prompt: {prompt}")
    print("📤 Output:")
    
    # Perform inference
    output_ids = model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.0,
        top_p=0.95
    )

    end_time = time.time()
    end_mem = torch.cuda.max_memory_allocated() / 1024**3

    time_taken = round(end_time - start_time, 3)
    vram_used = round(end_mem - start_mem, 3)
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"⏱️ Time taken: {time_taken} sec | 🧠 VRAM used: {vram_used} GB")
    print("-" * 80)

    return decoded_output, time_taken, vram_used

### Test Images

000404.jpg
000422.jpg
000433.jpg
000481.jpg
000520.jpg
000541.jpg
000698.jpg
000740.jpg
000869.jpg

In [297]:
filename="000889.jpg"

row = df[df["filename"] == filename]
ground_truth = row.iloc[0]["caption"]

inference_outputs = []
vram_usages = []
inference_times = []

prompts = [
    "",  # Prompt 1: No Prompt
    "Describe &&damage 12 sedan drive’ this !!image.",  # Noisy
    "An image of a damaged car parked on the side of the road.",  # Hand-crafted
    "You are an insurance claims assessor. Provide a detailed description of the car’s condition.",  # Roleplay
    "This <part_1> of the car has <damage_type_1>. The severity appears to be <severity_1>. Additional notes: <text_1>.",  # Masked
    "Describe using format - Damage Type: ___; Affected Part: ___; Severity: ___; Notes: ___"  # Format-Guided
]

for prompt in prompts:
    output, time_taken, vram = run_vlm_inference(prompt, filename, df=df)
    inference_outputs.append(output)
    vram_usages.append(vram)
    inference_times.append(time_taken)

🔹 Image: 000889.jpg
🧾 Prompt: 
📤 Output:
The image shows the side window of a car that has been shattered, likely due to an impact or explosion. The glass is broken into small pieces, and some of it appears to be still adhering to the window frame. The interior of the car is partially visible through the broken window, showing the steering wheel and dashboard. The background outside the car includes trees and a cloudy sky, suggesting the photo was taken outdoors on a day with overcast weather.<|im_end|>
⏱️ Time taken: 5.082 sec | 🧠 VRAM used: 0.343 GB
--------------------------------------------------------------------------------
🔹 Image: 000889.jpg
🧾 Prompt: Describe &&damage 12 sedan drive’ this !!image.
📤 Output:
The image shows the side window of a sedan that has been severely damaged. The glass is shattered into numerous pieces, with large chunks missing and the remaining glass broken into smaller fragments. The damage appears to be extensive, likely caused by an impact or explos

In [189]:
from sentence_transformers import SentenceTransformer, util
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
import pandas as pd

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [190]:
def compute_cosine_similarity(reference_captions, generated_caption):
    try:
        total_score = 0.0
        for caption in reference_captions:
            ref_embed = sbert_model.encode(caption, convert_to_tensor=True)
            gen_embed = sbert_model.encode(generated_caption, convert_to_tensor=True)
            score = util.cos_sim(gen_embed, ref_embed).item()
            total_score += score
        avg_score = total_score / len(reference_captions) if reference_captions else 0.0
        return avg_score
    except Exception as e:
        print(f"Error computing cosine similarity: {e}")
        return 0.0

In [191]:
def compute_cider_spice_scores(reference_caption, generated_caption):
    refs = {0: [reference_caption if reference_caption else ""]}
    hypos = {0: [generated_caption if generated_caption else ""]}

    # print(f"Generated caption: {generated_caption}")
    # print(f"Generated hypos: {hypos}")

    ptb = PTBTokenizer()
    refs_tok = ptb.tokenize({i: [{"caption": c} for c in caps] for i, caps in refs.items()})
    hypos_tok = ptb.tokenize({i: [{"caption": hypos[i][0]}] for i in hypos})

    all_scores = {}

    for scorer, name in [(Cider(), "CIDEr"), (Spice(), "SPICE")]:
        try:
            avg_score, _ = scorer.compute_score(refs_tok, hypos_tok)
            if name == "SPICE":
                # SPICE returns dicts per image
                all_scores[name] = avg_score.get("All", {}).get("f", 0.0) if isinstance(avg_score, dict) else avg_score
            else:
                all_scores[name] = avg_score
        except Exception as e:
            print(f"[ERROR] {name} scoring failed: {e}")
            all_scores[name] = 0.0

    return all_scores

In [192]:
from cider import Cider

PICKLE_PATH = "/workspace/data/cardd-df.p"

def compute_cider2_score(reference_caption, generated_caption):
    """
    Computes CIDEr score using new evaluate_cider logic with precomputed DF pickle.

    Args:
        reference_caption (str or list): Ground truth caption(s)
        generated_caption (str): Model output

    Returns:
        float: CIDEr score (averaged if multiple refs)
    """
    refs = {"0": reference_caption if isinstance(reference_caption, list) else [reference_caption]}
    hypos = [{"image_id": "0", "caption": [generated_caption]}]  # Fix: caption should be a list

    cider = Cider()
    score, _ = cider.compute_score(refs, hypos, PICKLE_PATH)

    return score

In [193]:
def evaluate_all_metrics(reference_caption, generated_caption):
    cider_spice_scores = compute_cider_spice_scores(reference_caption, generated_caption)
    cosine_sim = compute_cosine_similarity([reference_caption], generated_caption)
    cider2 = compute_cider2_score(reference_caption, generated_caption)

    return {
        "cosine_similarity": round(cosine_sim, 4),
        #"CIDEr": round(cider_spice_scores.get("CIDEr", 0.0), 4),
        "SPICE": round(cider_spice_scores.get("SPICE", 0.0), 4),
        "CIDEr": round(cider2, 4)
    }

In [298]:
print(ground_truth)
print(inference_outputs)
print(vram_usages)
print(inference_times)

The car exhibits significant damage to the driver's side window, which is shattered extensively. The glass is fragmented and broken, with large pieces missing, indicating a severe impact. 
['system\nYou are a helpful assistant.\nuser\n\nassistant\nThe image shows the side window of a car that has been shattered, likely due to an impact or explosion. The glass is broken into small pieces, and some of it appears to be still adhering to the window frame. The interior of the car is partially visible through the broken window, showing the steering wheel and dashboard. The background outside the car includes trees and a cloudy sky, suggesting the photo was taken outdoors on a day with overcast weather.', 'system\nYou are a helpful assistant.\nuser\nDescribe &&damage 12 sedan drive’ this !!image.\nassistant\nThe image shows the side window of a sedan that has been severely damaged. The glass is shattered into numerous pieces, with large chunks missing and the remaining glass broken into small

In [299]:
all_scores = []

for i, gen in enumerate(inference_outputs):
    print(f"\nPrompt {i+1} Output:\n{gen}\nReference:\n{ground_truth}")
    scores = evaluate_all_metrics(ground_truth, gen)
    all_scores.append(scores)


Prompt 1 Output:
system
You are a helpful assistant.
user

assistant
The image shows the side window of a car that has been shattered, likely due to an impact or explosion. The glass is broken into small pieces, and some of it appears to be still adhering to the window frame. The interior of the car is partially visible through the broken window, showing the steering wheel and dashboard. The background outside the car includes trees and a cloudy sky, suggesting the photo was taken outdoors on a day with overcast weather.
Reference:
The car exhibits significant damage to the driver's side window, which is shattered extensively. The glass is fragmented and broken, with large pieces missing, indicating a severe impact. 


PTBTokenizer tokenized 34 tokens at 553.38 tokens per second.
PTBTokenizer tokenized 98 tokens at 2178.72 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Parsing test captions


SPICE evaluation took: 750.4 ms

Prompt 2 Output:
system
You are a helpful assistant.
user
Describe &&damage 12 sedan drive’ this !!image.
assistant
The image shows the side window of a sedan that has been severely damaged. The glass is shattered into numerous pieces, with large chunks missing and the remaining glass broken into smaller fragments. The damage appears to be extensive, likely caused by an impact or explosion. The interior of the car is partially visible through the broken window, showing the steering wheel and dashboard. The overall condition of the vehicle suggests significant structural damage beyond just the window.
Reference:
The car exhibits significant damage to the driver's side window, which is shattered extensively. The glass is fragmented and broken, with large pieces missing, indicating a severe impact. 


PTBTokenizer tokenized 34 tokens at 733.48 tokens per second.
PTBTokenizer tokenized 107 tokens at 2108.39 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6

SPICE evaluation took: 9.464 s

Prompt 3 Output:
system
You are a helpful assistant.
user
An image of a damaged car parked on the side of the road.
assistant
The image shows a car with significant damage to its side window, which appears to be shattered into pieces. The broken glass is scattered across the window and the surrounding area. This type of damage could be caused by various factors such as vandalism, an accident, or an impact from an object. If this were your vehicle, it would be advisable to contact insurance to report the damage and arrange for repairs. Additionally, ensure that the vehicle is safe to drive before attempting to move it.
Reference:
The car exhibits significant damage to the driver's side window, which is shattered extensively. The glass is fragmented and broken, with large pieces missing, indicating a severe impact. 


PTBTokenizer tokenized 34 tokens at 848.23 tokens per second.
PTBTokenizer tokenized 119 tokens at 2897.14 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5

SPICE evaluation took: 13.52 s

Prompt 4 Output:
system
You are a helpful assistant.
user
You are an insurance claims assessor. Provide a detailed description of the car’s condition.
assistant
The image shows a vehicle with significant damage to its rear passenger-side window. The glass is shattered, with large pieces of broken glass scattered across the interior and exterior surfaces of the window frame. The damage appears to be extensive, likely caused by an impact or explosion. The surrounding area of the window frame looks intact, suggesting that the breakage did not extend into the structural components of the car.

The steering wheel and dashboard are visible through the broken window, indicating that the driver's side window is also affected but not as severely as the rear passenger-side window. There are no visible signs of damage to the car's bodywork outside the window
Reference:
The car exhibits significant damage to the driver's side window, which is shattered extensively. 

PTBTokenizer tokenized 34 tokens at 784.22 tokens per second.
PTBTokenizer tokenized 150 tokens at 2743.64 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5

SPICE evaluation took: 16.26 s

Prompt 5 Output:
system
You are a helpful assistant.
user
This <part_1> of the car has <damage_type_1>. The severity appears to be <severity_1>. Additional notes: <text_1>.
assistant
The part of the car shown in the image is the side window, which has been shattered. The severity of the damage appears to be significant, as the glass is completely broken and scattered across the interior of the car. Additional notes include that the damage seems to have occurred recently, possibly due to an impact or explosion, given the sharp edges of the broken glass.
Reference:
The car exhibits significant damage to the driver's side window, which is shattered extensively. The glass is fragmented and broken, with large pieces missing, indicating a severe impact. 


PTBTokenizer tokenized 34 tokens at 726.60 tokens per second.
PTBTokenizer tokenized 102 tokens at 1818.95 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5

SPICE evaluation took: 10.30 s

Prompt 6 Output:
system
You are a helpful assistant.
user
Describe using format - Damage Type: ___; Affected Part: ___; Severity: ___; Notes: ___
assistant
Damage Type: Shattering; Affected Part: Rear Window; Severity: Moderate; Notes: The rear window has a significant crack and shattered glass, indicating a moderate level of damage likely caused by an impact or explosion.
Reference:
The car exhibits significant damage to the driver's side window, which is shattered extensively. The glass is fragmented and broken, with large pieces missing, indicating a severe impact. 


PTBTokenizer tokenized 34 tokens at 733.14 tokens per second.
PTBTokenizer tokenized 72 tokens at 1777.53 tokens per second.


[ERROR] CIDEr scoring failed: Cider.compute_score() missing 1 required positional argument: 'pfile_path'


Parsing reference captions
Parsing test captions


SPICE evaluation took: 756.5 ms


In [300]:
score_df = pd.DataFrame(all_scores, index=[f"Prompt {i+1}" for i in range(len(inference_outputs))])
score_df

Unnamed: 0,cosine_similarity,SPICE,CIDEr
Prompt 1,0.7445,0.2143,1.3744
Prompt 2,0.8491,0.2769,1.479
Prompt 3,0.7759,0.2581,1.491
Prompt 4,0.8,0.2558,1.6658
Prompt 5,0.8208,0.2642,1.1502
Prompt 6,0.6326,0.1356,0.5604


In [197]:
import pandas as pd
from openpyxl import Workbook, load_workbook
from openpyxl.drawing.image import Image as ExcelImage
from openpyxl.styles import Alignment
from PIL import Image as PILImage
import os

def log_prompt_metrics_to_excel(
    filename: str,
    model_name: str,
    inference_outputs: list,
    metrics: list,
    inference_times: list,
    vram_usages: list,
    df: pd.DataFrame,
    output_excel_path: str = "prompt_tuning_results_cardd_qwen.xlsx"
):
    row = df[df["filename"] == filename].iloc[0]
    original_caption = row["caption"]
    image_path = os.path.join("/workspace/data/test_dataset", filename)

    # Load or create workbook
    if os.path.exists(output_excel_path):
        wb = load_workbook(output_excel_path)
        ws = wb.active
    else:
        wb = Workbook()
        ws = wb.active
        ws.title = "Prompt Evaluation"
        headers = [
            "Model", "Image Number", "Image", "Original Caption",
            "Prompt", "Output", "Inference Time (s)", "VRAM Used (GB)",
            "CIDEr", "SPICE", "Cosine Similarity"
        ]
        ws.append(headers)

    # Find the next empty row
    start_row = ws.max_row + 1

    # Append data
    for i in range(6):
        ws.append([
            model_name,
            filename,
            "",  # placeholder for image
            original_caption,
            f"Prompt {i+1}",
            inference_outputs[i],
            inference_times[i],
            vram_usages[i],
            metrics[i]["CIDEr"],
            metrics[i]["SPICE"],
            metrics[i]["cosine_similarity"]
        ])

    # Merge A–D columns across the 6 rows
    for col in ["A", "B", "C", "D"]:
        ws.merge_cells(f"{col}{start_row}:{col}{start_row + 5}")

    # Apply alignment
    align_top_wrap = Alignment(wrap_text=True, vertical="top")
    align_top = Alignment(vertical="top")

    for row in range(start_row, start_row + 6):
        for col_letter in ["A", "B", "C", "E"]:
            ws[f"{col_letter}{row}"].alignment = align_top
        for col_letter in ["D", "F"]:
            ws[f"{col_letter}{row}"].alignment = align_top_wrap
        ws.row_dimensions[row].height = 120

    # Set column widths
    ws.column_dimensions["D"].width = 40  # Original Caption
    ws.column_dimensions["F"].width = 40  # Output

    # Embed image
    if os.path.exists(image_path):
        print(f"[INFO] Inserting image: {image_path}")
        try:
            img = ExcelImage(image_path)
            img.width = 150
            img.height = 150
            img.anchor = f"C{start_row}"
            ws.add_image(img)
        except Exception as e:
            print(f"[ERROR] Could not insert image: {e}")

    wb.save(output_excel_path)
    print(f"✅ Logged metrics for {filename} to {output_excel_path}")

In [301]:
log_prompt_metrics_to_excel(
    filename=filename,
    model_name=model_id,
    inference_outputs=inference_outputs,
    metrics=all_scores,
    inference_times=inference_times,
    vram_usages=vram_usages,
    df=df
)

[INFO] Inserting image: /workspace/data/test_dataset/000889.jpg
✅ Logged metrics for 000889.jpg to prompt_tuning_results_cardd_qwen.xlsx
