In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
# from torchmetrics.classification import MultilabelAUROC
from typing import TypeAlias, Union
import re


pathology_dict_type: TypeAlias = dict[set]

Dataset agnostic code

In [3]:
def find_pathologies_in_sentences(text, pathologies):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    detected_pathologies = set()

    negation_keywords = r"\bno\b|\bnot\b|\bwithout\b|\babsent\b"

    for sentence in sentences:
        lower_sentence = sentence.lower()
        pathologies_in_sentence = set()

        for pathology in pathologies:
            if pathology.lower() in lower_sentence:
                pathologies_in_sentence.add(pathology)

        # Check for negation in the sentence
        negation_match = re.search(negation_keywords, lower_sentence)

        # Extract pathologies in 'no xxx or yyy' construction
        if negation_match:
            ns = negation_match.end()
            to_end_sentence = lower_sentence[ns:]
            # Look for immediate pathologies after 'no' possibly linked by 'or'/'and'
            excluded_pathologies = [p for p in pathologies_in_sentence if p.lower() in to_end_sentence]

            # Remove the pathologies that are excluded by negation
            pathologies_in_sentence.difference_update(excluded_pathologies)

        detected_pathologies.update(pathologies_in_sentence)

    return detected_pathologies    

def build_image_id_to_pathology_dict(file_path, pathologies, header = False,pathologies_check = True, sentences = False ) -> pathology_dict_type:
    image_ids_to_pathologies = defaultdict(set)
    
    pathologies = [pathology.lower() for pathology in pathologies]
    with open(file_path, 'r') as f:
        if header:
            header = f.readline() # skip header
        for line in f.readlines():

            if not sentences:
                line = line.strip().split(',')
                for pathology in line[1:]:
                    if pathologies_check and pathology.lower() in pathologies:
                        image_ids_to_pathologies[line[0]].add(pathology.lower().strip())
                    else:
                        image_ids_to_pathologies[line[0]].add(pathology.lower().strip())
            
            else:
                image_id = line.split(',')[0]
                text = (',').join(line.split(',')[1:])
                detected_pathologies = find_pathologies_in_sentences(text, pathologies)
                if len(detected_pathologies) == 0:
                    detected_pathologies.add('no finding')
                image_ids_to_pathologies[image_id] = detected_pathologies
                
    return image_ids_to_pathologies

In [13]:
def normalize_labels(labels, synonym_mapping):
    normalized_labels = set()
    for label in labels:
        found = False
        for synonyms in synonym_mapping:
            if label in synonyms:
                normalized_labels.add(frozenset(synonyms))  # Adding the frozenset of synonyms
                found = True
                break
        if not found:
            normalized_labels.add(label)
    return normalized_labels


def contains_no_finding(label_set, synonym_mappings):
    for synonyms in synonym_mappings:
        if "no finding" in synonyms:
            if frozenset(synonyms) in label_set:
                return True
    return False


def convert_probabilities_to_labels(file_path: Path,pathologies, threshold = True, threshold_val = 0.5, top_k = False, k = 1) -> pathology_dict_type:
    image_ids_to_pathologies = defaultdict(set)
    pathologies = [pathology.lower() for pathology in pathologies]

    with open(file_path, 'r') as f:
        # header contains mapping of pathology index to pathology name
        header = f.readline()
        pathology_names = header.strip().split(',')[1:]
        for line in f.readlines():
            line = line.strip().split(',')
            image_id = line[0]
            probabilities = np.array([float(prob) for prob in line[1:]])
            for i, prob in enumerate(probabilities):
                pathology = pathology_names[i].lower()
                if pathology not in pathologies:
                    continue

                if threshold:
                    if prob >= threshold_val:
                        image_ids_to_pathologies[image_id].add(pathology)
                elif top_k:
                    if i in np.argsort(probabilities)[-k:]:
                        if probabilities[i] >= threshold_val: # threshold_val should be 0.5 when threshold flag is False
                            image_ids_to_pathologies[image_id].add(pathology)
                else:
                    return ValueError("Invalid arguments for convert_probabilities_to_labels")
            
            if len(image_ids_to_pathologies[image_id]) == 0:
                image_ids_to_pathologies[image_id].add("no finding")

    return image_ids_to_pathologies


def calculate_accuracy_metrics(gt_path: Path, preds: Union[Path, pathology_dict_type], gt_pathologies: set, pred_pathologies = None , synonym_mappings = [], threshold = True, threshold_val = 0.5, k = 1):
    """
    Pre-reqs: assume header of probability files will have the pathology names (i.e. image_id, pathology1, pathology2, ...)
    
    Args:
    gt_path: Path to the ground truth file with probabilities
    preds: Path to the predictions file with probabilities or a dictionary of image_id to pathologies
    gt_pathologies: Set of pathologies to consider in the ground truth, all others will be ignored
    pred_pathologies: Set of pathologies to consider in the predictions, all others will be ignored
    synonym_mappings: List of lists of synonyms for each pathology
    threshold: Flag to determine whether to apply a threshold to the probabilities
    threshold_val: Threshold value to apply to the probabilities
    k: Number of top k pathologies to consider (only if threshold is False)
    """
    
    ground_truth = convert_probabilities_to_labels(gt_path,pathologies = gt_pathologies)
    if pred_pathologies is None:
        pred_pathologies = gt_pathologies
    
    if isinstance(preds, Path):
        predictions = convert_probabilities_to_labels(preds, pathologies = pred_pathologies, threshold = threshold, threshold_val = threshold_val, top_k = True, k = k)
    else:
        predictions = preds # Since CheXagent does not give probability bounds on outputs
        
    # Normalize each set of pathologies in the ground truth and predictions using the provided synonym mappings
    normalized_ground_truth = {image_id: normalize_labels(gt_labels, synonym_mappings) for image_id, gt_labels in ground_truth.items()}
    normalized_predictions = {image_id: normalize_labels(pred_labels, synonym_mappings) for image_id, pred_labels in predictions.items()}

    exact_matches = 0
    exact_no_finding = 0
    exact_one_pathology = 0
    exact_multiple_pathologies = 0

    total_no_finding = 0
    total_one_pathology = 0
    total_multiple_pathologies = 0

    correct_matches = 0
    correct_no_finding = 0
    correct_pathology = 0

    correct_top_k_pathology_match = 0

    for image_id, ground_truth_labels in normalized_ground_truth.items():
        pred_labels = normalized_predictions.get(image_id, set())

        is_no_finding = "no finding" in ground_truth_labels or contains_no_finding(ground_truth_labels, synonym_mappings)
        if is_no_finding:
            total_no_finding += 1
        elif len(ground_truth_labels) == 1:
            total_one_pathology += 1
        else:  # Assuming any non-empty set of labels greater than 1 is 'multiple pathologies'
            total_multiple_pathologies += 1
        
        # Calculate exact matches (Metric 1)
        if ground_truth_labels == pred_labels:
            exact_matches += 1
            # Update metrics for matches (Metric 2)
            if "no finding" in ground_truth_labels or contains_no_finding(ground_truth_labels, synonym_mappings):
                exact_no_finding += 1
            elif len(ground_truth_labels) == 1:
                exact_one_pathology += 1
            else:
                exact_multiple_pathologies += 1
        
        # Calculate individual correct matches (Metric 3)
        matched_pathologies = ground_truth_labels.intersection(pred_labels)
        correct_matches += len(matched_pathologies)
        if len(matched_pathologies) > 0:
            correct_top_k_pathology_match += 1
        
        # Update metrics for matches (Metric 4)
        for pathology in matched_pathologies:
            if "no finding" in ground_truth_labels or contains_no_finding({pathology}, synonym_mappings):
                correct_no_finding += 1
            else:
                correct_pathology += 1
    
    # Calculate metrics
    n = len(ground_truth)
    exact_matches_percentage = exact_matches / n
    exact_no_finding_percentage = exact_no_finding / total_no_finding if total_no_finding > 0 else 0
    exact_one_pathology_percentage = exact_one_pathology / total_one_pathology if total_one_pathology > 0 else 0
    exact_multiple_pathologies_percentage = exact_multiple_pathologies / total_multiple_pathologies if total_multiple_pathologies > 0 else 0
    
    print(f"\n\nDataset Characteristics:")
    print(f"No finding proportion: {total_no_finding/n:.2f}")
    print(f"One pathology proportion: {total_one_pathology/n:.2f}")
    print(f"Multiple pathologies proportion: {total_multiple_pathologies/n:.2f}")
    
    print(f"\nExact Matches Characteristics:")
    print(f"Exact matches: {exact_matches_percentage:.2f}")
    print(f"Exact no finding: {exact_no_finding_percentage:.2f}")
    print(f"Exact one pathology: {exact_one_pathology_percentage:.2f}")
    print(f"Exact multiple pathologies: {exact_multiple_pathologies_percentage:.2f}")
    
    total_num_of_pathologies = sum([len(pathologies) for pathologies in ground_truth.values()])
    correct_matches_percentage = correct_matches / total_num_of_pathologies
    correct_no_finding_percentage = correct_no_finding / total_no_finding if total_no_finding > 0 else 0
    correct_pathology_percentage = correct_pathology / (total_num_of_pathologies - total_no_finding) if total_num_of_pathologies - total_no_finding > 0 else 0
    
    print(f"\nIndividual Pathology Characteristics:")
    print(f"Correct matches: {correct_matches_percentage:.2f}")
    print(f"Correct no finding: {correct_no_finding_percentage:.2f}")
    print(f"Correct pathology: {correct_pathology_percentage:.2f}")

    print(f"\nTop K Pathology Match Characteristics:")
    print(f"Top K Pathology Match: {correct_top_k_pathology_match/n:.2f}")

    return exact_matches_percentage, exact_no_finding_percentage, exact_one_pathology_percentage, exact_multiple_pathologies_percentage, correct_matches_percentage, correct_no_finding_percentage, correct_pathology_percentage


def calculate_auroc(ground_truth:Path, predictions:Path, average = "weighted", specific_pathologies = {}, remap_patholgies = {}):
    df_ground_truth = pd.read_csv(ground_truth)
    df_predictions = pd.read_csv(predictions)

    if len(remap_patholgies) > 0:
        # convert keys in remap_pathologies to their corresponding values in df_predictions
        df_predictions = df_predictions.rename(columns=remap_patholgies)

    # Sort both DataFrames based on 'image_id' to align them
    df_ground_truth = df_ground_truth.sort_values('image_id').reset_index(drop=True)
    df_predictions = df_predictions.sort_values('image_id').reset_index(drop=True)

    # Verify that the sorted DataFrames have aligned 'image_id'
    assert np.array_equal(df_ground_truth['image_id'], df_predictions['image_id']), "The image_ids are not aligned!"

    # Convert the ground_truth columns and predictions to tensors
    labels = df_ground_truth.iloc[:, 1:]  # Exclude the 'image_id' column
    predictions = df_predictions.iloc[:, 1:]  # Exclude the 'image_id' column

    if len(specific_pathologies) > 0:
        specific_pathologies = [pathology for pathology in specific_pathologies]
        # select only the specific pathologies columns
        labels = labels[specific_pathologies]
        predictions = predictions[specific_pathologies]

    assert all(labels.columns == predictions.columns), "Mismatch in columns between the labels and predictions dataframes"

    labels_tensor = torch.tensor(labels.values).int()
    predictions_tensor = torch.tensor(predictions.values).float()

    ml_auroc = MultilabelAUROC(num_labels=len(labels.columns),average=average,thresholds = None)
    calc_value = ml_auroc(predictions_tensor, labels_tensor)
    print(f"\nROC AUC: {calc_value}")
    return calc_value

CheXagent General

In [5]:
chexagent_synonym_mappings = [
    {'no finding', 'no pathologies'}, 
    {'enlarged cardiomediastinum', 'enlarged cardiac silhouette'},
    {'edema','pulmonary edema/hazy opacity'}
]

## CheXpert specific

In [6]:
# CheXpert paths
cheXpert_test_ground_truth_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/CheXpert/test.csv")

cheXpert_layer_norm_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/probe_results/chexpert_layer_norm_predictions.csv")
cheXpert_q_former_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/probe_results/chexpert_q_former_predictions.csv")
cheXpert_cheXagent_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/identify_pathologies")
cheXpert_cheXagent_half_temperature_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/chexpert_identify_pathologies_0.5")

# CheXpert pathologies
cheXpert_pathologies = ['No Finding','Enlarged Cardiomediastinum','Cardiomegaly','Lung Opacity',
        'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis','Pneumothorax',
        'Pleural Effusion','Pleural Other','Fracture','Support Devices']

### CheXpert Probes Accuracies

In [14]:
cheXpert_small_q_former_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/q_former_small")
threshold = False
threshold_val = 0.5
k = 1
results = []

for file in cheXpert_small_q_former_folder.iterdir():
    if file.is_file():
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path, file, cheXpert_pathologies,threshold=threshold, threshold_val=threshold_val, k=k))

print(all(x == results[0] for x in results))



Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.31
Exact no finding: 0.90
Exact one pathology: 0.48
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.28
Correct no finding: 0.90
Correct pathology: 0.21

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.72


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.31
Exact no finding: 0.90
Exact one pathology: 0.48
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.28
Correct no finding: 0.90
Correct pathology: 0.21

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.72


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exa

In [21]:
chexpert_layer_norm_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/layer_norm")
chexpert_q_former_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/q_former")

results = []
threshold = False
threshold_val = 0.7
k = 1
layer_norm = False

if layer_norm:
    print("CheXpert Layer Norm Probe")
    for file in chexpert_layer_norm_folder.iterdir():
        if file.is_file():
            results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path, file, cheXpert_pathologies, threshold = threshold, threshold_val = threshold_val, k = k))
else:
    print("CheXpert Q Former Probe")
    for file in chexpert_q_former_folder.iterdir():
        if file.is_file():
            results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path, file, cheXpert_pathologies,threshold = threshold, threshold_val = threshold_val, k=k))

print(all(x == results[0] for x in results))        


CheXpert Q Former Probe


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.30
Exact no finding: 0.96
Exact one pathology: 0.36
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.25
Correct no finding: 0.96
Correct pathology: 0.17

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.64


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.30
Exact no finding: 0.96
Exact one pathology: 0.36
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.25
Correct no finding: 0.96
Correct pathology: 0.17

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.64


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologi

In [7]:
cheXpert_test_ground_dict = build_image_id_to_pathology_dict(cheXpert_test_ground_truth_path,cheXpert_pathologies, header = False)

cheXpert_cheXagent_predictions = build_image_id_to_pathology_dict(cheXpert_cheXagent_predictions_path,cheXpert_pathologies, header = False)
cheXpert_cheXagent_half_temperature_predictions = build_image_id_to_pathology_dict(cheXpert_cheXagent_half_temperature_predictions_path,cheXpert_pathologies, header = False)


### CheXpert Probe ROC-AUC

In [23]:
layer_norm_probabilities_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/layer_norm/chexpert_layer_norm_predictions_probabilities_0.csv")
q_former_probabilities_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/q_former/chexpert_q_former_predictions_probabilities_0.csv")
q_former_small_probabilities_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/q_former_small/chexpert_small_q_former_predictions_probabilities_0.csv")
print("CheXpert Layer Norm:")
_ = calculate_auroc(cheXpert_test_ground_truth_path, layer_norm_probabilities_path)
print("CheXpert Q-Former:")
_ = calculate_auroc(cheXpert_test_ground_truth_path, q_former_probabilities_path)
print("CheXpert Q-Former Small:")
_ = calculate_auroc(cheXpert_test_ground_truth_path, q_former_small_probabilities_path)

CheXpert Layer Norm:

ROC AUC: 0.8526307940483093
CheXpert Q-Former:

ROC AUC: 0.8319408893585205
CheXpert Q-Former Small:

ROC AUC: 0.8319401144981384


### CheXpert XRV models Accuracies

In [23]:
cheXpert_xrv_224_chex_folder_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/xrvs/res224-chex")
cheXpert_xrv_224_folder_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/xrvs/res224-all")
cheXpert_xrv_512_folder_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/xrvs/res512-all")

results = []
threshold = False
threshold_val = 0.6
k = 1

print("CheXagent XRV 224 :")
for file in cheXpert_xrv_512_folder_path.iterdir():
    if file.is_file():
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path, file, cheXpert_pathologies, threshold = threshold, threshold_val = threshold_val, k = k))
print(all(x == results[0] for x in results))

CheXagent XRV 224 :


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.24
Exact no finding: 0.98
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.11
Correct no finding: 0.98
Correct pathology: 0.01

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.27


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.24
Exact no finding: 0.98
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.11
Correct no finding: 0.98
Correct pathology: 0.01

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.27


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies p

### CheXpert XRV models ROC-AUC

In [10]:
xrv_chexpert_specific_pathologies = {"Atelectasis","Consolidation","Pneumothorax","Edema","Pleural Effusion","Pneumonia","Cardiomegaly","Lung Lesion","Fracture","Lung Opacity","Enlarged Cardiomediastinum"}
cheXpert_xrv_224 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/xrvs/res224-all/densenet121-res224-all_test_0.csv")
cheXpert_xrv_224_chex = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/xrvs/res224-chex/densenet121-res224-chex_test_0.csv")
cheXpert_xrv_512 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/xrvs/res512-all/resnet50-res512-all_test_0.csv")

print("CheXagent XRV 224")
_ = calculate_auroc(cheXpert_test_ground_truth_path, cheXpert_xrv_224, specific_pathologies = xrv_chexpert_specific_pathologies)

print("CheXagent XRV 224 Chex:")
_ = calculate_auroc(cheXpert_test_ground_truth_path, cheXpert_xrv_224_chex, specific_pathologies = xrv_chexpert_specific_pathologies)

print("CheXagent XRV 512:")
_ = calculate_auroc(cheXpert_test_ground_truth_path, cheXpert_xrv_512, specific_pathologies = xrv_chexpert_specific_pathologies)


CheXagent XRV 224

ROC AUC: 0.841437578201294
CheXagent XRV 224 Chex:

ROC AUC: 0.815412700176239
CheXagent XRV 512:

ROC AUC: 0.6200760006904602


### CheXpert CheXagent

#### CheXpert CheXagent Temp 1

In [11]:
chexagent_responses_pathologies_temp_1_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_1/pathologies")
chexagent_responses_findings_temp_1_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_1/findings")
chexagent_responses_abnormalities_temp_1_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_1/abnormalities")

In [12]:
print("CheXagent Pathologies (Temp 1):")

results = []

for file in chexagent_responses_pathologies_temp_1_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = True)
        result = calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings)
        results.append(result)

print(all(x == results[0] for x in results))

CheXagent Pathologies (Temp 1):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.12
Exact no finding: 0.43
Exact one pathology: 0.06
Exact multiple pathologies: 0.01

Individual Pathology Characteristics:
Correct matches: 0.45
Correct no finding: 0.43
Correct pathology: 0.46


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.12
Exact no finding: 0.43
Exact one pathology: 0.06
Exact multiple pathologies: 0.01

Individual Pathology Characteristics:
Correct matches: 0.45
Correct no finding: 0.43
Correct pathology: 0.46


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.12
Exact no finding: 0.43
Exact one pathology: 0.06
Exact m

In [13]:
print("CheXagent Findings (Temp 1):")

results = []

for file in chexagent_responses_findings_temp_1_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = True)
        result = calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings)
        results.append(result)
        
print(all(x == results[0] for x in results))

CheXagent Findings (Temp 1):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.07
Correct no finding: 0.00
Correct pathology: 0.08


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.07
Correct no finding: 0.00
Correct pathology: 0.08


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact mult

In [14]:
print("CheXagent Abnormalities (Temp 1):")

results = []

for file in chexagent_responses_abnormalities_temp_1_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = True)
        result = calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings)
        results.append(result)
        
print(all(x == results[0] for x in results))

CheXagent Abnormalities (Temp 1):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.08
Exact no finding: 0.18
Exact one pathology: 0.12
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.44
Correct no finding: 0.18
Correct pathology: 0.47


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.08
Exact no finding: 0.18
Exact one pathology: 0.12
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.44
Correct no finding: 0.18
Correct pathology: 0.47


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.08
Exact no finding: 0.18
Exact one pathology: 0.12
Exact

#### CheXpert CheXagent Temp 0.5

In [15]:
chexagent_responses_pathologies_temp_0_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_0.5/pathologies")
chexagent_responses_findings_temp_0_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_0.5/findings")
chexagent_responses_abnormalities_temp_0_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_0.5/abnormalities")

print("CheXagent Pathologies (Temp 0.5):")
results = []
for file in chexagent_responses_pathologies_temp_0_5_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = False)
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

CheXagent Pathologies (Temp 0.5):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.16
Exact no finding: 0.55
Exact one pathology: 0.04
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.39
Correct no finding: 0.55
Correct pathology: 0.38


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.16
Exact no finding: 0.55
Exact one pathology: 0.03
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.39
Correct no finding: 0.55
Correct pathology: 0.38


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.16
Exact no finding: 0.56
Exact one pathology: 0.03
Exact

In [16]:
print("CheXagent Findings (Temp 0.5):")
results = []
for file in chexagent_responses_findings_temp_0_5_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = False)
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))


CheXagent Findings (Temp 0.5):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.02
Correct no finding: 0.00
Correct pathology: 0.02


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.02
Correct no finding: 0.00
Correct pathology: 0.02


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact mu

In [17]:
print("CheXagent Abnormalities (Temp 0.5):")   
results = []
for file in chexagent_responses_abnormalities_temp_0_5_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = False)
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

CheXagent Abnormalities (Temp 0.5):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.10
Exact no finding: 0.27
Exact one pathology: 0.13
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.39
Correct no finding: 0.27
Correct pathology: 0.40


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.11
Exact no finding: 0.27
Exact one pathology: 0.14
Exact multiple pathologies: 0.03

Individual Pathology Characteristics:
Correct matches: 0.39
Correct no finding: 0.27
Correct pathology: 0.41


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.11
Exact no finding: 0.28
Exact one pathology: 0.13
Exa

#### CheXpert CheXagent Temp 1.5

In [18]:
chexagent_responses_pathologies_temp_1_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_1.5/pathologies")
chexagent_responses_findings_temp_1_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_1.5/findings")
chexagent_responses_abnormalities_temp_1_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/chexagent/temperature_1.5/abnormalities")

print("CheXagent Pathologies (Temp 1.5):")  
results = []
for file in chexagent_responses_pathologies_temp_1_5_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = False)
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

CheXagent Pathologies (Temp 1.5):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.45
Exact one pathology: 0.05
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.43
Correct no finding: 0.45
Correct pathology: 0.43


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.48
Exact one pathology: 0.04
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.44
Correct no finding: 0.48
Correct pathology: 0.44


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.47
Exact one pathology: 0.05
Exact

In [19]:
print("CheXagent Findings (Temp 1.5):")
results = []
for file in chexagent_responses_findings_temp_1_5_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = False)
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

CheXagent Findings (Temp 1.5):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.03
Correct no finding: 0.00
Correct pathology: 0.03


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.03
Correct no finding: 0.00
Correct pathology: 0.03


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.01
Exact mu

In [20]:
print("CheXagent Abnormalities (Temp 1.5):")
results = []
for file in chexagent_responses_abnormalities_temp_1_5_folder.iterdir():
    if file.is_file():
        chexagent_responses_dict = build_image_id_to_pathology_dict(file,cheXpert_pathologies, header = False)
        results.append(calculate_accuracy_metrics(cheXpert_test_ground_truth_path,chexagent_responses_dict, cheXpert_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

CheXagent Abnormalities (Temp 1.5):


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.06
Exact no finding: 0.17
Exact one pathology: 0.07
Exact multiple pathologies: 0.01

Individual Pathology Characteristics:
Correct matches: 0.42
Correct no finding: 0.17
Correct pathology: 0.45


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.08
Exact no finding: 0.24
Exact one pathology: 0.09
Exact multiple pathologies: 0.01

Individual Pathology Characteristics:
Correct matches: 0.44
Correct no finding: 0.24
Correct pathology: 0.46


Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.08
Exact no finding: 0.22
Exact one pathology: 0.08
Exa

### VinDr Probe on CheXpert using ROC-AUC

In [26]:
# overlapping pathologies between CheXpert and VinDr-CXR
chexpert_vindr_overlapping_pathologies = ['Atelectasis','Cardiomegaly','Consolidation','Lung Opacity','No Finding','Pleural Effusion','Pneumothorax']
chexpert_predictions_by_vindr_layer_norm = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/chexpert_predictions_by_vindr_layer_norm_model.csv")
chexpert_small_predictions_by_vindr_layer_norm = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXpert/probes/chexpert_small_predictions_by_vindr_layer_norm_model.csv")

_ = calculate_auroc(cheXpert_test_ground_truth_path, chexpert_predictions_by_vindr_layer_norm, specific_pathologies = chexpert_vindr_overlapping_pathologies)
_ = calculate_auroc(cheXpert_test_ground_truth_path, chexpert_small_predictions_by_vindr_layer_norm, specific_pathologies = chexpert_vindr_overlapping_pathologies)


ROC AUC: 0.858841061592102

ROC AUC: 0.858841061592102


## VinDr specific

In [24]:
#VinDr paths
vinDr_test_ground_truth_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/VinDr-CXR/test_set_three_splits/VinDr_test_test_split_with_one_hot_labels.csv")
vinDr_chexagent_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/CheXagent/test_split_identify_pathologies")
vinDr_chexagent_predictions_half_temperature_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/CheXagent/test_split_identify_pathologies_0.5")

vindr_pathologies = ["Aortic enlargement", "Atelectasis", "Calcification", "Cardiomegaly",
            "Clavicle fracture", "Consolidation", "Emphysema", "Enlarged PA",
            "ILD", "Infiltration", "Lung Opacity", "Lung cavity", "Lung cyst",
            "Mediastinal shift","Nodule/Mass", "Pleural effusion", "Pleural thickening",
            "Pneumothorax", "Pulmonary fibrosis","Rib fracture", "Other lesion",
            "No finding"] 


### VinDr Probes Accuracies

In [26]:
vindr_layer_norm_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/probes/layer_norm")
vindr_q_former_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/probes/q_former")

results = []
threshold = False
threshold_val = 0.4
k = 1
layer_norm = False

if layer_norm:
    print("VinDr Layer Norm Probe")
    for file in vindr_layer_norm_folder.iterdir():
        if file.is_file():
            results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path, file, vindr_pathologies, threshold = threshold, threshold_val = threshold_val, k = k))
else:
    print("Q Former Probe")
    for file in vindr_q_former_folder.iterdir():
        if file.is_file():
            results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path, file, vindr_pathologies,threshold = threshold, threshold_val = threshold_val, k=k))

print(all(x == results[0] for x in results))

Q Former Probe


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.73
Exact no finding: 0.97
Exact one pathology: 0.52
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.63
Correct no finding: 0.97
Correct pathology: 0.28

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.86


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.73
Exact no finding: 0.97
Exact one pathology: 0.52
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.63
Correct no finding: 0.97
Correct pathology: 0.28

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.86


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies propor

### VinDr Probe ROC-AUC

In [23]:
vindr_layer_norm_probabilities_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/probes/layer_norm/vindr_layer_norm_predictions_probabilities_0.csv")
vindr_q_former_probabilities_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/probes/q_former/vindr_q_former_predictions_probabilities_0.csv")
print("VinDr Layer Norm:")
_ = calculate_auroc(vinDr_test_ground_truth_path, vindr_layer_norm_probabilities_path)
print("VinDr Q-Former:")
_ = calculate_auroc(vinDr_test_ground_truth_path, vindr_q_former_probabilities_path)


VinDr Layer Norm:

ROC AUC: 0.9575596451759338
VinDr Q-Former:

ROC AUC: 0.9631940126419067




### VinDr XRV models Accuracies

In [28]:
vindr_xrv_224_folder_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/xrvs/res224-all")
vindr_xrv_512_folder_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/xrvs/res512-all")

xrv_vindr_pathologies = ["Atelectasis","Cardiomegaly","Consolidation","Pleural effusion","Emphysema","Infiltration","Lung Opacity","Pneumothorax"]

results = []
threshold = False
threshold_val = 0.5
k = 1

# for file in vindr_xrv_224_folder_path.iterdir():
#     if file.is_file():
#         results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path, file, vindr_pathologies,xrv_vindr_pathologies, threshold = threshold, threshold_val = threshold_val, k = k))

for file in vindr_xrv_512_folder_path.iterdir():
    if file.is_file():
        results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path, file, vindr_pathologies,xrv_vindr_pathologies, threshold = threshold, threshold_val = threshold_val, k = k))

print(all(x == results[0] for x in results))



Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.68
Exact no finding: 0.99
Exact one pathology: 0.04
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.50
Correct no finding: 0.99
Correct pathology: 0.02

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.68


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.68
Exact no finding: 0.99
Exact one pathology: 0.04
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.50
Correct no finding: 0.99
Correct pathology: 0.02

Top K Pathology Match Characteristics:
Top K Pathology Match: 0.68


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exa

### VinDr XRV models ROC-AUC

In [25]:
vindr_xrv_224 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/xrvs/res224-all/densenet121-res224-all_test_0.csv")
vindr_xrv_512 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/xrvs/res512-all/resnet50-res512-all_test_0.csv")

print("CheXagent XRV 224")
_ = calculate_auroc(vinDr_test_ground_truth_path, vindr_xrv_224, specific_pathologies = xrv_vindr_pathologies)

print("CheXagent XRV 512:")
_ = calculate_auroc(vinDr_test_ground_truth_path, vindr_xrv_512, specific_pathologies = xrv_vindr_pathologies)


CheXagent XRV 224

ROC AUC: 0.9120920896530151
CheXagent XRV 512:

ROC AUC: 0.8694754242897034


### VinDr CheXagent

#### VinDr CheXagent Temp 1

In [26]:
vindr_chexagent_responses_pathologies_temp_1_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_1/pathologies")
vindr_chexagent_responses_findings_temp_1_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_1/findings")
vindr_chexagent_responses_abnormalities_temp_1_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_1/abnormalities")

In [27]:
print("VinDr Chexagent Pathologies (Temp 1):")
results = []

for file in vindr_chexagent_responses_pathologies_temp_1_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = True)
        result = calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings)
        results.append(result)

print(all(x == results[0] for x in results))

VinDr Chexagent Pathologies (Temp 1):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.19
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.15
Correct no finding: 0.19
Correct pathology: 0.12




Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.19
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.15
Correct no finding: 0.19
Correct pathology: 0.12


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.19
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.15
Correct no finding: 0.19
Correct pathology: 0.12
True


In [28]:
print("VinDr Chexagent Findings (Temp 1):")
results = []

for file in vindr_chexagent_responses_findings_temp_1_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = True)
        result = calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings)
        results.append(result)

print(all(x == results[0] for x in results))

VinDr Chexagent Findings (Temp 1):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.07
Correct no finding: 0.00
Correct pathology: 0.14


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.07
Correct no finding: 0.00
Correct pathology: 0.14


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exac

In [29]:
print("VinDr Chexagent Abnormalities (Temp 1):")
results = []

for file in vindr_chexagent_responses_abnormalities_temp_1_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = True)
        result = calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings)
        results.append(result)

print(all(x == results[0] for x in results))

VinDr Chexagent Abnormalities (Temp 1):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.04
Exact no finding: 0.06
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.08
Correct no finding: 0.06
Correct pathology: 0.10


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.04
Exact no finding: 0.06
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.08
Correct no finding: 0.06
Correct pathology: 0.10


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.04
Exact no finding: 0.06
Exact one pathology: 0.00

#### VinDr CheXagent Temp 0.5 (no header)

In [30]:
vindr_chexagent_responses_pathologies_temp_0_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_0.5/pathologies")
vindr_chexagent_responses_findings_temp_0_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_0.5/findings")
vindr_chexagent_responses_abnormalities_temp_0_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_0.5/abnormalities")

In [31]:
print("VinDr Chexagent Pathologies (Temp 0.5):")
results = []
for file in vindr_chexagent_responses_pathologies_temp_0_5_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = False)
        results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))


VinDr Chexagent Pathologies (Temp 0.5):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.25
Exact no finding: 0.37
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.24
Correct no finding: 0.37
Correct pathology: 0.10


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.26
Exact no finding: 0.37
Exact one pathology: 0.04
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.24
Correct no finding: 0.37
Correct pathology: 0.11


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.25
Exact no finding: 0.36
Exact one pathology: 0.02

In [32]:
print("VinDr Chexagent Findings (Temp 0.5):")
results = []
for file in vindr_chexagent_responses_findings_temp_0_5_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = False)
        results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

VinDr Chexagent Findings (Temp 0.5):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.05
Correct no finding: 0.00
Correct pathology: 0.09


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.04
Correct no finding: 0.00
Correct pathology: 0.08


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Ex

In [33]:
print("VinDr Chexagent Abnormalities (Temp 0.5):")
results = []
for file in vindr_chexagent_responses_abnormalities_temp_0_5_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = False)
        results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

VinDr Chexagent Abnormalities (Temp 0.5):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.07
Exact no finding: 0.10
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.09
Correct no finding: 0.10
Correct pathology: 0.08


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.09
Exact no finding: 0.12
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.10
Correct no finding: 0.12
Correct pathology: 0.08


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.07
Exact no finding: 0.10
Exact one pathology: 0.

#### VinDr CheXagent Temp 1.5 (no header)

In [34]:
vindr_chexagent_responses_pathologies_temp_1_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_1.5/pathologies")
vindr_chexagent_responses_findings_temp_1_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_1.5/findings")
vindr_chexagent_responses_abnormalities_temp_1_5_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/chexagent/temperature_1.5/abnormalities")

In [35]:
print("VinDr Chexagent Pathologies (Temp 1.5):")
results = []
for file in vindr_chexagent_responses_pathologies_temp_1_5_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = False)
        results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

VinDr Chexagent Pathologies (Temp 1.5):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.17
Exact no finding: 0.25
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.18
Correct no finding: 0.25
Correct pathology: 0.12




Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.18
Exact no finding: 0.26
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.18
Correct no finding: 0.26
Correct pathology: 0.10


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.18
Exact no finding: 0.25
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.19
Correct no finding: 0.25
Correct pathology: 0.12
False


In [36]:
print("VinDr Chexagent Findings (Temp 1.5):")
results = []
for file in vindr_chexagent_responses_findings_temp_1_5_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = False)
        results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

VinDr Chexagent Findings (Temp 1.5):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.03
Correct no finding: 0.00
Correct pathology: 0.06


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.02
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.04
Correct no finding: 0.00
Correct pathology: 0.08


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.00
Exact no finding: 0.00
Exact one pathology: 0.00
Ex

In [37]:
print("VinDr Chexagent Abnormalities (Temp 1.5):")
results = []
for file in vindr_chexagent_responses_abnormalities_temp_1_5_folder.iterdir():
    if file.is_file():
        vindr_chexagent_responses_dict = build_image_id_to_pathology_dict(file,vindr_pathologies, header = False)
        results.append(calculate_accuracy_metrics(vinDr_test_ground_truth_path,vindr_chexagent_responses_dict, vindr_pathologies, synonym_mappings = chexagent_synonym_mappings))

print(all(x == results[0] for x in results))

VinDr Chexagent Abnormalities (Temp 1.5):


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.07
Exact no finding: 0.10
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.10
Correct no finding: 0.10
Correct pathology: 0.11


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.05
Exact no finding: 0.08
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.09
Correct no finding: 0.08
Correct pathology: 0.11


Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.12
Multiple pathologies proportion: 0.20

Exact Matches Characteristics:
Exact matches: 0.04
Exact no finding: 0.06
Exact one pathology: 0.

### CheXpert Probe on Vindr using ROC-AUC

In [12]:
# overlapping pathologies between CheXpert and VinDr-CXR
chexpert_vindr_overlapping_pathologies = ['Atelectasis','Cardiomegaly','Consolidation','Lung Opacity','No finding','Pleural effusion','Pneumothorax']
vindr_predictions_by_chexpert_layer_norm = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/probes/vindr_predictions_by_chexpert_layer_norm_probe.csv")

_ = calculate_auroc(vinDr_test_ground_truth_path, vindr_predictions_by_chexpert_layer_norm, specific_pathologies = chexpert_vindr_overlapping_pathologies)


ROC AUC: 0.9115731120109558


In [None]:
# print("VinDr by CheXpert Layer Norm on intersection of pathologies:")
# _ = calculate_accuracy_metrics_prev(vinDr_test_ground_truth, vinDr_by_chexpert_layer_norm, ignore_pathologies = ignore_pathologies)
# synonym_mappings_vindr_chexpert = [
#     {'enlarged cardiomediastinum' , 'aortic enlargement'},
#     {'fracture', 'clavicle fracture', 'rib fracture'},
#     {'pleural other', 'pleural thickening'}   
# ]
# # update the ignore pathologies to exclude the synonyms
# ignore_pathologies_without_synonyms = ignore_pathologies - set.union(*[set(synonyms) for synonyms in synonym_mappings_vindr_chexpert])

# print("VinDr by CheXpert Layer Norm on intersection of pathologies with synonyms:")
# _ = calculate_accuracy_metrics_prev(vinDr_test_ground_truth, vinDr_by_chexpert_layer_norm, ignore_pathologies = ignore_pathologies_without_synonyms, synonym_mappings = synonym_mappings_vindr_chexpert)


## VinDr Image Text Reasoning

In [8]:
vindr_itr_test_gt_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/evaluation_datasets/VinDr/image_text_reasoning_datasets/test_pathology_left_or_right")
vindr_image_id_to_pathology_to_side = defaultdict(dict)
with open(vindr_itr_test_gt_path, 'r') as f:
    for line in f:
        image_id, pathology, side = line.strip().split(',')
        vindr_image_id_to_pathology_to_side[image_id][pathology.lower()] = side.strip().lower()

chexagent_untuned_itr_predictions_folder = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/VinDr/left_or_right/no_tuning_4934e91451945c8218c267aae9c34929a7677829/test_VinDr_pathology_only")

ignore_pathologies_lateral_locations = {"cardiomegaly","aortic enlargement"}

for file in chexagent_untuned_itr_predictions_folder.iterdir():
    total_scans = 0 
    if file.is_file():
        with open(file, 'r') as f:
            correct = 0
            for index, line in enumerate(f):
                if index == 0:
                    continue    
                
                image_id = line.strip().split(',')[0]
                pathology = line.strip().split(',')[1].lower()
                lateral_location = line.strip().split(',')[2].lower().strip()
                if pathology in ignore_pathologies_lateral_locations:
                    continue
                
                if vindr_image_id_to_pathology_to_side[image_id][pathology] == lateral_location:
                    correct += 1

                total_scans += 1
            print(f"Accuracy for {file.name}: {correct/total_scans:.2f}")
            print(f"Total scans for {file.name}: {total_scans}")


Accuracy for RIGHT_or_LEFT_responses: 0.45
Total scans for RIGHT_or_LEFT_responses: 1573
Accuracy for LEFT_or_RIGHT_responses: 0.53
Total scans for LEFT_or_RIGHT_responses: 1573
Accuracy for clearer_LEFT_or_RIGHT_GUIDED_PROMPT_responses: 0.50
Total scans for clearer_LEFT_or_RIGHT_GUIDED_PROMPT_responses: 1573
Accuracy for LEFT_or_RIGHT_GUIDED_PROMPT_responses: 0.51
Total scans for LEFT_or_RIGHT_GUIDED_PROMPT_responses: 1573


## CheXbench Evaluations 

In [7]:
def eval_cheXbench_image_text_reasoning(gt_path, pred_path, gt_header = True, pred_header = True):
    image_text_reasoning_predictions = build_image_id_to_pathology_dict(pred_path, cheXpert_pathologies, header = gt_header, pathologies_check= False)

    total_left_vs_right = 0
    total_mild_vs_severe = 0
    total_upper_vs_lower = 0

    correct_left_vs_right = 0
    correct_mild_vs_severe = 0
    correct_upper_vs_lower = 0

    with open(gt_path, 'r') as f:
        if gt_header:
            f.readline() # skip header

        for line in f.readlines():
            components = line.split(',')
            image_id = components[2]
            correct_option = components[4]
            option_0 = components[5]
            option_1 = components[6]

            gt = option_0 if correct_option == 0 else option_1

            possibilities = image_text_reasoning_predictions.get(image_id)
            correct = any([gt in possibility for possibility in possibilities])

            if ("left" in option_0 and "right" in option_1) or ("right" in option_0 and "left" in option_1):
                total_left_vs_right += 1
                correct_left_vs_right = correct_left_vs_right + 1 if correct else correct_left_vs_right

            elif ("mild" in option_0 and "severe" in option_1) or ("severe" in option_0 and "mild" in option_1):
                total_mild_vs_severe += 1
                correct_mild_vs_severe = correct_mild_vs_severe + 1 if correct else correct_mild_vs_severe
            
            else:
                total_upper_vs_lower += 1
                correct_upper_vs_lower = correct_upper_vs_lower + 1 if correct else correct_upper_vs_lower
            
    print(f"Image Text Reasoning Task: {str(pred_path).split('/')[-3:]}")

    left_vs_right_accuracy = correct_left_vs_right/total_left_vs_right
    mild_vs_severe_accuracy = correct_mild_vs_severe/total_mild_vs_severe
    upper_vs_lower_accuracy = correct_upper_vs_lower/total_upper_vs_lower
    overall_accuracy = (correct_left_vs_right + correct_mild_vs_severe + correct_upper_vs_lower)/(total_left_vs_right + total_mild_vs_severe + total_upper_vs_lower)

    print(f"Left vs Right: {left_vs_right_accuracy:.2f}")
    print(f"Mild vs Severe: {mild_vs_severe_accuracy:.2f}")
    print(f"Upper vs Lower: {upper_vs_lower_accuracy:.2f}")
    
    print(f"Overall: {overall_accuracy:.2f}")
    return left_vs_right_accuracy, mild_vs_severe_accuracy, upper_vs_lower_accuracy, overall_accuracy

In [13]:
image_text_reasoning_evaluation_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXbench/openi_itr_responses_answer_guidance")
image_text_reasoning_evaluation_path_unguided = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXbench/openi_itr_responses_unguided")
image_text_reasoning_gt = Path('/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/CheXbench/image_text_reasoning_task')


image_text_reasoning_unguided_evaluation_folder_0_5 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXbench/temperature_0.5/open_itr_unguided_responses")
image_text_reasoning_unguided_evaluation_folder_1 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXbench/temperature_1/openi_itr_unguided_responses")
image_text_reasoning_unguided_evaluation_folder_1_5 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/cheXagent/evaluation/CheXbench/temperature_1.5/open_itr_unguided_responses")

results = []
for file in image_text_reasoning_unguided_evaluation_folder_1_5.iterdir():
    if file.is_file():  
        results.append(eval_cheXbench_image_text_reasoning(image_text_reasoning_gt, file, gt_header = True, pred_header = True))

print(all(x == results[0] for x in results))

# eval_cheXbench_image_text_reasoning(image_text_reasoning_gt, image_text_reasoning_evaluation_path)
# eval_cheXbench_image_text_reasoning(image_text_reasoning_gt, image_text_reasoning_evaluation_path_unguided)

Image Text Reasoning Task: ['temperature_1.5', 'open_itr_unguided_responses', 'openi_itr_unguided_responses_0']
Left vs Right: 0.53
Mild vs Severe: 0.64
Upper vs Lower: 0.66
Overall: 0.56
Image Text Reasoning Task: ['temperature_1.5', 'open_itr_unguided_responses', 'openi_itr_unguided_responses_2']
Left vs Right: 0.54
Mild vs Severe: 0.64
Upper vs Lower: 0.66
Overall: 0.57
Image Text Reasoning Task: ['temperature_1.5', 'open_itr_unguided_responses', 'openi_itr_unguided_responses_1']
Left vs Right: 0.53
Mild vs Severe: 0.67
Upper vs Lower: 0.62
Overall: 0.56
False
