In [16]:
import os
import pandas as pd

In [17]:
# Root directory holding the merged experiment outputs (exp_status.csv plus one
# sub-directory per experiment). It defaults to the original local mount; set
# the MED_VLM_EXP_ROOT environment variable to run the notebook on another
# machine without editing this cell.
exp_root = os.environ.get(
    "MED_VLM_EXP_ROOT",
    "/media/yesindeed/DATADRIVE1/mount/remote_cse/experiments/med_vlm_benchmark/merged",
)

# Dataset name -> path string.
# NOTE(review): PathVQA and VQA-RAD hold the literal string "None", not the
# None object, so a truthiness check on these values will pass — confirm that
# consumers compare against the string.
datasets = {
    "SLAKE": "/research/d5/gds/yzhong22/datasets/SLAKE/imgs",
    "PathVQA": "None",
    "VQA-RAD": "None",
    "Harvard-FairVLMed10k": "/research/d5/gds/yzhong22/datasets/Harvard-FairVLMed10k",
}

In [18]:
# Load the experiment status table that sits directly under the experiment
# root, then display it as the cell output.
index_csv_path = os.path.join(exp_root, "exp_status.csv")
df_index = pd.read_csv(index_csv_path)
df_index

Unnamed: 0,model,task,dataset,model_type,trainable_module,path,have_eval_result,have_prediction,have_gpt_score,model_family
0,Qwen2-VL,vqa,SLAKE,general,,vqa/SLAKE/Qwen2-VL/eval_seed0/Qwen2-VL-7B-Inst...,1,1,1,Qwen
1,Qwen25-VL,vqa,SLAKE,general,,vqa/SLAKE/Qwen25-VL/eval_seed0/Qwen2.5-VL-7B-I...,1,1,1,Qwen
2,Gemma3,vqa,SLAKE,general,,vqa/SLAKE/Gemma3/eval_seed0/gemma-3-4b-it,1,1,1,Gemma
3,MedGemma,vqa,SLAKE,medical,,vqa/SLAKE/MedGemma/eval_seed0/medgemma-4b-it,1,1,1,Gemma
4,InternVL3,vqa,SLAKE,general,,vqa/SLAKE/InternVL3/eval_seed0/InternVL3-8B-hf,1,1,1,Intern
...,...,...,...,...,...,...,...,...,...,...
83,VILA,vqa,VQA-RAD,general,ML,vqa/VQA-RAD/VILA1.5/eval_seed0/1epoch-lora8,1,1,1,VILA
84,VILA-M3,vqa,VQA-RAD,medical,ML,vqa/VQA-RAD/VILA-M3/eval_seed0/1epoch-lora8,1,1,1,VILA
85,Lingshu,vqa,Harvard-FairVLMed10k,medical,ML,vqa/Harvard-FairVLMed10k/Lingshu/eval_seed0/1e...,1,1,1,Qwen
86,VILA,vqa,Harvard-FairVLMed10k,general,ML,vqa/Harvard-FairVLMed10k/VILA1.5/eval_seed0/1e...,1,1,1,VILA


In [19]:
# An experiment counts as finished only when all three status flags equal 1;
# every other row is collected here so missing artefacts are easy to spot.
status_columns = ["have_eval_result", "have_prediction", "have_gpt_score"]
is_finished = df_index[status_columns].eq(1).all(axis=1)
unfinished_df = df_index.loc[~is_finished]
unfinished_df

Unnamed: 0,model,task,dataset,model_type,trainable_module,path,have_eval_result,have_prediction,have_gpt_score,model_family


In [20]:
from pathlib import Path
import sys

# Make the directory three levels up importable so the project-local `eval`
# package resolves. The path is relative, so it is interpreted against the
# kernel's current working directory. The membership guard keeps re-running
# this cell from appending duplicate entries to sys.path.
project_root = "../../../"
if project_root not in sys.path:
    sys.path.append(project_root)

In [21]:
from eval.utils import normalize_word
from eval.metrics import (
    calculate_exactmatch,
    calculate_f1score,
    calculate_appearance_with_normalization,
    calculate_bertscore,
    calculate_meteor,
)
from torchmetrics.functional.text import bleu_score, rouge_score


def process_tokens(text):
    """Return the set of distinct whitespace-separated tokens in ``text``.

    Empty-string tokens are excluded from the result.
    """
    return {token for token in text.split() if token != ""}

In [22]:
# Metrics recorded for closed-ended questions.
closed_metrics = ["exact_match", "recall", "precision", "f1_score", "accuracy"]

# Metrics recorded for open-ended questions: the n-gram overlap scores, then
# the same five token-level metrics used for closed questions, then the
# GPT-based score. "bertscore" and "meteor" are deliberately left disabled.
open_metrics = (
    [f"bleu{order}" for order in range(1, 5)]
    + ["rouge1", "rouge2", "rougeL"]
    + closed_metrics
    # + ["bertscore", "meteor"]
    + ["gpt_score"]
)

# Metrics recorded for every question regardless of its type.
overall_metrics = ["exact_match", "recall", "precision", "f1_score", "gpt_score"]

# Flat list of result column names: "<metric>_<group>", grouped as
# open, closed, overall (in that order).
all_metrics = [
    f"{metric}_{group}"
    for group, metrics in (
        ("open", open_metrics),
        ("closed", closed_metrics),
        ("overall", overall_metrics),
    )
    for metric in metrics
]

In [23]:
import tqdm
import json
import numpy as np

# Experiments whose result path contains any of these fragments are skipped.
skip_list = ["PathVQA/o3/eval_seed0/none"]

# Defined up front so the name exists even when every experiment is skipped
# below; it is re-created for each experiment that is actually scored.
all_results = []

# For every experiment in `df_index` that has predictions but no cached
# per-example CSV yet, score each prediction and write the per-case metrics to
# "<exp_root>/<path>/each_example_result.csv".
#
# After the loop, `predictions`, `gpt_scores`, `case_dict` and
# `df_case_result` hold the values of the LAST experiment that was scored.
# They are undefined if nothing was scored in this run.
for i in tqdm.tqdm(range(len(df_index))):
    item = df_index.iloc[i]

    path = item["path"]
    save_file = os.path.join(exp_root, path, "each_example_result.csv")

    # Guard clauses: explicitly skipped, already cached, or nothing to score.
    if any(x in save_file for x in skip_list):
        continue
    if os.path.exists(save_file):
        continue
    if not item["have_prediction"]:
        continue

    all_results = []

    with open(os.path.join(exp_root, path, "predictions.json"), "r") as file:
        predictions = json.load(file)

    if item["have_gpt_score"]:
        with open(os.path.join(exp_root, path, "deekseep_review.json"), "r") as file:
            gpt_scores = json.load(file)

        # NOTE(review): review entries are matched to predictions purely by
        # position (gpt_scores[i_case]); equal lengths do not prove that both
        # files are in the same order — confirm against the review writer.
        assert len(predictions) == len(gpt_scores), path
    else:
        gpt_scores = None

    for i_case, pred in enumerate(predictions):
        output = pred["prediction"]
        answer = pred["answer"]
        question_type = pred["question_type"]

        output_l, answer_l = output.lower(), answer.lower()

        output_normed = normalize_word(output_l)
        answer_normed = normalize_word(answer_l)

        # Every metric column starts as NaN; only the columns that apply to
        # this question type are filled in below.
        case_dict = {"question_type": question_type}
        for k in all_metrics:
            case_dict[k] = np.nan

        # Metric values for THIS case only. Looking names up here instead of
        # in globals() means a metric that was not computed raises KeyError
        # rather than silently reusing a stale value from an earlier case.
        scores = {}
        scores["f1_score"], scores["precision"], scores["recall"] = calculate_f1score(output_l, answer_l)
        scores["exact_match"] = calculate_exactmatch(output_l, answer_l)

        if question_type == "open":
            for n_gram in (1, 2, 3, 4):
                scores[f"bleu{n_gram}"] = bleu_score(
                    [output_normed], [[answer_normed]], n_gram=n_gram
                ).item()

            rouge_scores = rouge_score(output_normed, answer_normed)
            for rouge_name in ("rouge1", "rouge2", "rougeL"):
                scores[rouge_name] = rouge_scores[f"{rouge_name}_fmeasure"].item()

            # An open answer counts as correct when token recall is >= 0.75.
            scores["accuracy"] = float(scores["recall"] >= 0.75)

            if gpt_scores is not None:
                # The stored review score is divided by 100 before recording.
                scores["gpt_score"] = float(gpt_scores[i_case]["score"]) / 100.0
            else:
                scores["gpt_score"] = np.nan

            for m in open_metrics:
                case_dict[f"{m}_open"] = scores[m]
        elif question_type == "closed":
            # A closed answer counts as correct when the lower-cased reference
            # answer appears as a substring of the lower-cased prediction.
            scores["accuracy"] = 1 if answer_l in output_l else 0
            for m in closed_metrics:
                case_dict[f"{m}_closed"] = scores[m]
            # Closed questions have no review score of their own; the overall
            # gpt_score column reuses the accuracy value.
            scores["gpt_score"] = scores["accuracy"]
        else:
            raise Exception(f"Unknown question type {question_type}")

        for m in overall_metrics:
            case_dict[f"{m}_overall"] = scores[m]

        # Sanity checks: the columns of the other question type must remain
        # NaN, and every closed metric must be filled for closed questions.
        if question_type == "open":
            for m in closed_metrics:
                assert np.isnan(case_dict[f"{m}_closed"])
        else:
            for m in open_metrics:
                assert np.isnan(case_dict[f"{m}_open"])
            for m in closed_metrics:
                assert not np.isnan(case_dict[f"{m}_closed"])

        all_results.append(case_dict)

    df_case_result = pd.DataFrame(all_results)
    df_case_result.to_csv(save_file, index=False)

 30%|██▉       | 26/88 [00:00<00:00, 115.60it/s]

100%|██████████| 88/88 [00:04<00:00, 21.67it/s] 


In [23]:
# Debug check: print every position in [0, expected_case_count) that has no
# entry in `gpt_scores`. `gpt_scores` is whatever the scoring loop loaded last,
# so this cell is only meaningful right after that loop has processed a file.
# NOTE(review): 451 is hard-coded — presumably the number of cases in the
# experiment being inspected; confirm or derive it from the data.
expected_case_count = 451

indexes = [int(s["index"]) for s in gpt_scores]
seen_indexes = set(indexes)  # set membership avoids rescanning the list per lookup

for i in range(expected_case_count):
    if i not in seen_indexes:
        print(i)

32
97
111


In [39]:
# Display the per-case metrics of the last experiment the scoring loop wrote.
# The name is only bound when the loop scored at least one experiment.
df_case_result

Unnamed: 0,question_type,bleu1_open,bleu2_open,bleu3_open,bleu4_open,rouge1_open,rouge2_open,rougeL_open,exact_match_open,recall_open,...,gpt_score_open,exact_match_closed,recall_closed,precision_closed,f1_score_closed,accuracy_closed,exact_match_overall,recall_overall,precision_overall,f1_score_overall
0,open,0.444491,0.427712,0.407688,0.382983,0.521739,0.476190,0.521739,0.600000,0.500000,...,65,,,,,,0.600000,0.500000,0.700000,0.583333
1,closed,,,,,,,,,,...,,1.0,1.0,1.0,1.0,1.0,1.000000,1.000000,1.000000,1.000000
2,open,0.268128,0.211974,0.000000,0.000000,0.333333,0.200000,0.333333,0.400000,0.285714,...,25,,,,,,0.400000,0.285714,0.400000,0.333333
3,closed,,,,,,,,,,...,,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,open,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,100,,,,,,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3989,closed,,,,,,,,,,...,,1.0,1.0,1.0,1.0,1.0,1.000000,1.000000,1.000000,1.000000
3990,open,0.307692,0.226455,0.167054,0.000000,0.400000,0.222222,0.400000,0.307692,0.571429,...,45,,,,,,0.307692,0.571429,0.307692,0.400000
3991,closed,,,,,,,,,,...,,1.0,1.0,1.0,1.0,1.0,1.000000,1.000000,1.000000,1.000000
3992,open,0.625000,0.517549,0.446952,0.365555,0.714286,0.500000,0.714286,0.625000,0.833333,...,90,,,,,,0.625000,0.833333,0.625000,0.714286


In [8]:
# Inspect the metric dict of the last case handled by the scoring loop.
# The name is only bound when the loop scored at least one case.
case_dict

{'question_type': 'open',
 'bleu1_open': 0.05263158306479454,
 'bleu2_open': 0.0,
 'bleu3_open': 0.0,
 'bleu4_open': 0.0,
 'rouge1_open': 0.095238097012043,
 'rouge2_open': 0.0,
 'rougeL_open': 0.095238097012043,
 'exact_match_open': 0.05263157894736842,
 'recall_open': 1.0,
 'precision_open': 0.10526315789473684,
 'f1_score_open': 0.1904761904761905,
 'accuracy_open': 1.0,
 'bertscore_open': 0.7257148623466492,
 'meteor_open': 0.0,
 'gpt_score_open': nan,
 'exact_match_closed': nan,
 'recall_closed': nan,
 'precision_closed': nan,
 'f1_score_closed': nan,
 'accuracy_closed': nan,
 'exact_match_overall': 0.05263157894736842,
 'recall_overall': 1.0,
 'precision_overall': 0.10526315789473684,
 'f1_score_overall': 0.1904761904761905}

In [19]:
# Preview the first few prediction records from the last file the scoring loop
# loaded, instead of dumping the whole list into the notebook output.
predictions[:3]

[{'image_path': '/research/d5/gds/yzhong22/datasets/SLAKE/imgs/xmlab102/source.jpg',
  'question_type': 'open',
  'qs': 'What modality is used to take this image?',
  'answer': 'CT',
  'prediction': "The image is a computed tomography (CT) scan. CT scans use X-rays to create detailed images of the body's internal structures."},
 {'image_path': '/research/d5/gds/yzhong22/datasets/SLAKE/imgs/xmlab102/source.jpg',
  'question_type': 'open',
  'qs': 'Which part of the body does this image belong to?',
  'answer': 'Chest',
  'prediction': 'This image is a cross-sectional view of the chest, typically obtained from a computed tomography (CT) scan. It shows the thoracic cavity, including the lungs, heart, and other structures within the chest.'},
 {'image_path': '/research/d5/gds/yzhong22/datasets/SLAKE/imgs/xmlab102/source.jpg',
  'question_type': 'open',
  'qs': 'What is the main organ in the image?',
  'answer': 'Lung, Spinal Cord',
  'prediction': 'The main organ in the image is the heart.