In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict

Dataset agnostic code

In [15]:
def build_image_id_to_pathology_dict(file_path, pathologies, gt_header = False,pathologies_check = True ):
    image_ids_to_pathologies = defaultdict(set)
    pathologies = [pathology.lower() for pathology in pathologies]
    with open(file_path, 'r') as f:
        if gt_header:
            header = f.readline() # skip header
        for line in f.readlines():
            line = line.strip().split(',')

            for pathology in line[1:]:
                if pathologies_check and pathology.lower() in pathologies:
                    image_ids_to_pathologies[line[0]].add(pathology.lower())
                else:
                    image_ids_to_pathologies[line[0]].add(pathology.lower())

    return image_ids_to_pathologies

In [3]:
def normalize_labels(labels, synonym_mapping):
    normalized_labels = set()
    for label in labels:
        found = False
        for synonyms in synonym_mapping:
            if label in synonyms:
                normalized_labels.add(frozenset(synonyms))  # Adding the frozenset of synonyms
                found = True
                break
        if not found:
            normalized_labels.add(label)
    return normalized_labels

def contains_no_finding(label_set, synonym_mappings):
    for synonyms in synonym_mappings:
        if "no finding" in synonyms:
            if frozenset(synonyms) in label_set:
                return True
    return False

def calculate_metrics(ground_truth, predictions, synonym_mappings = [], ignore_pathologies = []):

    if len(ignore_pathologies) > 0:
        # remove the pathologies to ignore from the ground truth and predictions
        for image_id in ground_truth:
            ground_truth[image_id] = ground_truth[image_id] - set(ignore_pathologies)
        for image_id in predictions:
            predictions[image_id] = predictions[image_id] - set(ignore_pathologies)

    # Normalize each set of pathologies in the ground truth and predictions using the provided synonym mappings
    normalized_ground_truth = {image_id: normalize_labels(gt_labels, synonym_mappings) for image_id, gt_labels in ground_truth.items()}
    normalized_predictions = {image_id: normalize_labels(pred_labels, synonym_mappings) for image_id, pred_labels in predictions.items()}

    exact_matches = 0
    exact_no_finding = 0
    exact_one_pathology = 0
    exact_multiple_pathologies = 0

    total_no_finding = 0
    total_one_pathology = 0
    total_multiple_pathologies = 0


    correct_matches = 0
    correct_no_finding = 0
    correct_pathology = 0

    for image_id, ground_truth_labels in normalized_ground_truth.items():
        pred_labels = normalized_predictions.get(image_id, set())

        is_no_finding = "no finding" in ground_truth_labels or contains_no_finding(ground_truth_labels, synonym_mappings)
        if is_no_finding:
            total_no_finding += 1
        elif len(ground_truth_labels) == 1:
            total_one_pathology += 1
        else:  # Assuming any non-empty set of labels greater than 1 is 'multiple pathologies'
            total_multiple_pathologies += 1
        
        # Calculate exact matches (Metric 1)
        if ground_truth_labels == pred_labels:
            exact_matches += 1
            # Update metrics for matches (Metric 2)
            if "no finding" in ground_truth_labels or contains_no_finding(ground_truth_labels, synonym_mappings):
                exact_no_finding += 1
            elif len(ground_truth_labels) == 1:
                exact_one_pathology += 1
            else:
                exact_multiple_pathologies += 1
        
        # Calculate individual correct matches (Metric 3)
        matched_pathologies = ground_truth_labels.intersection(pred_labels)
        correct_matches += len(matched_pathologies)
        
        # Update metrics for matches (Metric 4)
        for pathology in matched_pathologies:
            if "no finding" in ground_truth_labels or contains_no_finding({pathology}, synonym_mappings):
                correct_no_finding += 1
            else:
                correct_pathology += 1
    
    # Calculate metrics
    n = len(ground_truth)
    exact_matches_percentage = exact_matches / n
    exact_no_finding_percentage = exact_no_finding / total_no_finding if total_no_finding > 0 else 0
    exact_one_pathology_percentage = exact_one_pathology / total_one_pathology if total_one_pathology > 0 else 0
    exact_multiple_pathologies_percentage = exact_multiple_pathologies / total_multiple_pathologies if total_multiple_pathologies > 0 else 0
    
    print(f"Dataset Characteristics:")
    print(f"No finding proportion: {total_no_finding/n:.2f}")
    print(f"One pathology proportion: {total_one_pathology/n:.2f}")
    print(f"Multiple pathologies proportion: {total_multiple_pathologies/n:.2f}")
    
    print(f"\nExact Matches Characteristics:")
    print(f"Exact matches: {exact_matches_percentage:.2f}")
    print(f"Exact no finding: {exact_no_finding_percentage:.2f}")
    print(f"Exact one pathology: {exact_one_pathology_percentage:.2f}")
    print(f"Exact multiple pathologies: {exact_multiple_pathologies_percentage:.2f}")
    
    total_num_of_pathologies = sum([len(pathologies) for pathologies in ground_truth.values()])
    correct_matches_percentage = correct_matches / total_num_of_pathologies
    correct_no_finding_percentage = correct_no_finding / total_no_finding if total_no_finding > 0 else 0
    correct_pathology_percentage = correct_pathology / (total_num_of_pathologies - total_no_finding) if total_num_of_pathologies - total_no_finding > 0 else 0
    
    print(f"\nIndividual Pathology Characteristics:")
    print(f"Correct matches: {correct_matches_percentage:.2f}")
    print(f"Correct no finding: {correct_no_finding_percentage:.2f}")
    print(f"Correct pathology: {correct_pathology_percentage:.2f}")

    return exact_matches / n, exact_no_finding / n, exact_one_pathology / n, exact_multiple_pathologies / n, correct_matches / total_num_of_pathologies, correct_no_finding / total_num_of_pathologies, correct_pathology / total_num_of_pathologies


## CheXpert specific

In [4]:
# CheXpert paths
cheXpert_test_ground_truth_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/CheXpert/test_written_pathologies")
cheXpert_layer_norm_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/probe_results/chexpert_layer_norm_predictions.csv")
cheXpert_q_former_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/probe_results/chexpert_q_former_predictions.csv")
cheXpert_cheXagent_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/identify_pathologies")
cheXpert_cheXagent_half_temperature_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/chexpert_identify_pathologies_0.5")

cheXpert_xrv_224_chex_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/torchxrayvision/evaluations/CheXpert_evaluation_results/test_set/xrv_224_chex.txt")
cheXpert_xrv_224_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/torchxrayvision/evaluations/CheXpert_evaluation_results/test_set/xrv_224.txt")
cheXpert_xrv_512_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/torchxrayvision/evaluations/CheXpert_evaluation_results/test_set/xrv_512.txt")


# CheXpert pathologies
cheXpert_pathologies = ['No Finding','Enlarged Cardiomediastinum','Cardiomegaly','Lung Opacity',
        'Lung Lesion','Edema','Consolidation','Pneumonia','Atelectasis','Pneumothorax',
        'Pleural Effusion','Pleural Other','Fracture','Support Devices']

In [6]:
cheXpert_test_ground_truth = build_image_id_to_pathology_dict(cheXpert_test_ground_truth_path,cheXpert_pathologies, gt_header = False)
cheXpert_layer_norm_predictions = build_image_id_to_pathology_dict(cheXpert_layer_norm_predictions_path,cheXpert_pathologies, gt_header = False)
cheXpert_q_former_predictions = build_image_id_to_pathology_dict(cheXpert_q_former_predictions_path,cheXpert_pathologies, gt_header = False)
cheXpert_cheXagent_predictions = build_image_id_to_pathology_dict(cheXpert_cheXagent_predictions_path,cheXpert_pathologies, gt_header = False)
cheXpert_cheXagent_half_temperature_predictions = build_image_id_to_pathology_dict(cheXpert_cheXagent_half_temperature_predictions_path,cheXpert_pathologies, gt_header = False)

cheXpert_xrv_224_chex = build_image_id_to_pathology_dict(cheXpert_xrv_224_chex_path,cheXpert_pathologies, gt_header = False)
cheXpert_xrv_224 = build_image_id_to_pathology_dict(cheXpert_xrv_224_path,cheXpert_pathologies, gt_header = False)
cheXpert_xrv_512 = build_image_id_to_pathology_dict(cheXpert_xrv_512_path,cheXpert_pathologies, gt_header = False)

In [7]:
print("CheXpert Layer Norm:")
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_layer_norm_predictions)

CheXpert Layer Norm:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.22
Exact no finding: 0.61
Exact one pathology: 0.26
Exact multiple pathologies: 0.03

Individual Pathology Characteristics:
Correct matches: 0.54
Correct no finding: 0.62
Correct pathology: 0.54


In [8]:
print("CheXpert Q-Former:")
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_q_former_predictions)


CheXpert Q-Former:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.29
Exact no finding: 0.90
Exact one pathology: 0.31
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.44
Correct no finding: 0.91
Correct pathology: 0.39


In [9]:
print("CheXagent:")
synonym_mappings = [
    {'no finding', 'no pathologies'}, 
    {'enlarged cardiomediastinum', 'enlarged cardiac silhouette'},
    {'edema','pulmonary edema/hazy opacity'}
]
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_cheXagent_predictions, synonym_mappings)

CheXagent:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.12
Exact no finding: 0.43
Exact one pathology: 0.06
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.20
Correct no finding: 0.43
Correct pathology: 0.18


In [10]:
print("CheXagent Half Temperature:")
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_cheXagent_half_temperature_predictions, synonym_mappings)

CheXagent Half Temperature:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.12
Exact no finding: 0.43
Exact one pathology: 0.06
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.20
Correct no finding: 0.43
Correct pathology: 0.18


In [9]:
print("XRV-224-CheX:")
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_xrv_224_chex)

XRV-224-CheX:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.07
Exact no finding: 0.30
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.64
Correct no finding: 0.30
Correct pathology: 0.68


In [11]:
print("XRV-224:")
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_xrv_224)

XRV-224:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.01
Exact no finding: 0.05
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.64
Correct no finding: 0.05
Correct pathology: 0.70


In [11]:
print("XRV-512:")
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_xrv_512)

XRV-512:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.18
Multiple pathologies proportion: 0.57

Exact Matches Characteristics:
Exact matches: 0.23
Exact no finding: 0.94
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.11
Correct no finding: 0.94
Correct pathology: 0.02


## VinDr specific

In [12]:
#VinDr paths
vinDr_test_ground_truth_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/VinDr-CXR/test_set_three_splits/VinDr_test_test_split_with_labels.csv")
vinDr_layer_norm_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/probe_results/vindr_layer_norm_predictions.csv")
vinDr_q_former_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/probe_results/vindr_q_former_predictions.csv")
vinDr_chexagent_predictions_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/CheXagent/test_split_identify_pathologies")
vinDr_chexagent_predictions_half_temperature_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/CheXagent/test_split_identify_pathologies_0.5")
vinDr_chexagent_predictions_half_temperature_path_2 = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/CheXagent/test_split_identify_pathologies_0.5_2")

vinDr_xrv_224_chex_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/torchxrayvision/evaluations/VinDr_evaluation_results/xrv_224_chex.txt")
vinDr_xrv_224_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/torchxrayvision/evaluations/VinDr_evaluation_results/xrv_224.txt")
vinDr_xrv_512_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/torchxrayvision/evaluations/VinDr_evaluation_results/xrv_512.txt")


vindr_pathologies = ["Aortic enlargement", "Atelectasis", "Calcification", "Cardiomegaly",
            "Clavicle fracture", "Consolidation", "Emphysema", "Enlarged PA",
            "ILD", "Infiltration", "Lung Opacity", "Lung cavity", "Lung cyst",
            "Mediastinal shift","Nodule/Mass", "Pleural effusion", "Pleural thickening",
            "Pneumothorax", "Pulmonary fibrosis","Rib fracture", "Other lesion",
            "No finding"] 


In [14]:
vinDr_test_ground_truth = build_image_id_to_pathology_dict(vinDr_test_ground_truth_path, vindr_pathologies, gt_header = False)
vinDr_layer_norm_predictions = build_image_id_to_pathology_dict(vinDr_layer_norm_predictions_path, vindr_pathologies, gt_header = False)
vinDr_q_former_predictions = build_image_id_to_pathology_dict(vinDr_q_former_predictions_path, vindr_pathologies, gt_header = False)
vinDr_chexagent_predictions = build_image_id_to_pathology_dict(vinDr_chexagent_predictions_path, vindr_pathologies, gt_header = False)
vinDr_chexagent_predictions_half_temperature = build_image_id_to_pathology_dict(vinDr_chexagent_predictions_half_temperature_path, vindr_pathologies, gt_header = False)
vinDr_chexagent_predictions_half_temperature_path_2 = build_image_id_to_pathology_dict(vinDr_chexagent_predictions_half_temperature_path_2, vindr_pathologies, gt_header = False)

vinDr_xrv_224_chex = build_image_id_to_pathology_dict(vinDr_xrv_224_chex_path, vindr_pathologies, gt_header = False)
vinDr_xrv_224 = build_image_id_to_pathology_dict(vinDr_xrv_224_path, vindr_pathologies, gt_header = False)
vinDr_xrv_512 = build_image_id_to_pathology_dict(vinDr_xrv_512_path, vindr_pathologies, gt_header = False)


In [15]:
print("VinDr Layer Norm:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_layer_norm_predictions)

VinDr Layer Norm:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.09
Multiple pathologies proportion: 0.23

Exact Matches Characteristics:
Exact matches: 0.70
Exact no finding: 0.98
Exact one pathology: 0.34
Exact multiple pathologies: 0.02

Individual Pathology Characteristics:
Correct matches: 0.59
Correct no finding: 0.99
Correct pathology: 0.27


In [16]:
print("VinDr Q-Former:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_q_former_predictions)

VinDr Q-Former:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.09
Multiple pathologies proportion: 0.23

Exact Matches Characteristics:
Exact matches: 0.71
Exact no finding: 0.97
Exact one pathology: 0.46
Exact multiple pathologies: 0.03

Individual Pathology Characteristics:
Correct matches: 0.62
Correct no finding: 0.98
Correct pathology: 0.33


In [17]:
print("VinDr CheXagent:")
synonym_mappings = [
    {'no finding', 'no pathologies'}, 
    {'enlarged cardiomediastinum', 'enlarged cardiac silhouette'},
    {'edema','pulmonary edema/hazy opacity'}
]
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_chexagent_predictions, synonym_mappings)

VinDr CheXagent:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.09
Multiple pathologies proportion: 0.23

Exact Matches Characteristics:
Exact matches: 0.17
Exact no finding: 0.25
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.14
Correct no finding: 0.25
Correct pathology: 0.05


In [18]:
print("VinDr CheXagent Half Temperature:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_chexagent_predictions_half_temperature, synonym_mappings)

VinDr CheXagent Half Temperature:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.09
Multiple pathologies proportion: 0.23

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.19
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.11
Correct no finding: 0.19
Correct pathology: 0.05


In [28]:
print("XRV-224-CheX:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_xrv_224_chex)

XRV-224-CheX:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.09
Multiple pathologies proportion: 0.23

Exact Matches Characteristics:
Exact matches: 0.33
Exact no finding: 0.48
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.34
Correct no finding: 0.48
Correct pathology: 0.23


In [29]:
print("XRV-224:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_xrv_224)

XRV-224:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.09
Multiple pathologies proportion: 0.23

Exact Matches Characteristics:
Exact matches: 0.13
Exact no finding: 0.19
Exact one pathology: 0.00
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.23
Correct no finding: 0.19
Correct pathology: 0.26


In [30]:
print("XRV-512:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_xrv_512)

XRV-512:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.09
Multiple pathologies proportion: 0.23

Exact Matches Characteristics:
Exact matches: 0.66
Exact no finding: 0.96
Exact one pathology: 0.05
Exact multiple pathologies: 0.00

Individual Pathology Characteristics:
Correct matches: 0.44
Correct no finding: 0.96
Correct pathology: 0.01


## Testing CheXpert models on the VinDr dataset

In [30]:
## testing CheXpert models on VinDr data
vinDr_by_chexpert_layer_norm_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/VinDr/probe_results/vindr_predictions_by_chexpert_post_layer_norm.csv")
vinDr_by_chexpert_layer_norm = build_image_id_to_pathology_dict(vinDr_by_chexpert_layer_norm_path, cheXpert_pathologies, gt_header = False)

# ignore all pathologies not in the intersection of CheXpert and VinDr pathologies
# lowercase both pathology sets
cheXpert_pathologies_lower_case = {pathology.lower() for pathology in cheXpert_pathologies}
vindr_pathologies_lower_case = {pathology.lower() for pathology in vindr_pathologies}
intersection_pathologies = cheXpert_pathologies_lower_case.intersection(vindr_pathologies_lower_case)
ignore_pathologies = cheXpert_pathologies_lower_case.union(vindr_pathologies_lower_case) - intersection_pathologies


In [31]:
print("VinDr by CheXpert Layer Norm on intersection of pathologies:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_by_chexpert_layer_norm, ignore_pathologies = ignore_pathologies)

VinDr by CheXpert Layer Norm on intersection of pathologies:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.17
Multiple pathologies proportion: 0.15

Exact Matches Characteristics:
Exact matches: 0.58
Exact no finding: 0.73
Exact one pathology: 0.30
Exact multiple pathologies: 0.24

Individual Pathology Characteristics:
Correct matches: 0.68
Correct no finding: 0.77
Correct pathology: 0.46


In [55]:
synonym_mappings_vindr_chexpert = [
    {'enlarged cardiomediastinum' , 'aortic enlargement'},
    {'fracture', 'clavicle fracture', 'rib fracture'},
    {'pleural other', 'pleural thickening'}   
]
# update the ignore pathologies to exclude the synonyms
ignore_pathologies_without_synonyms = ignore_pathologies - set.union(*[set(synonyms) for synonyms in synonym_mappings_vindr_chexpert])

print("VinDr by CheXpert Layer Norm on intersection of pathologies with synonyms:")
_ = calculate_metrics(vinDr_test_ground_truth, vinDr_by_chexpert_layer_norm, ignore_pathologies = ignore_pathologies_without_synonyms, synonym_mappings = synonym_mappings_vindr_chexpert)


VinDr by CheXpert Layer Norm on intersection of pathologies with synonyms:
Dataset Characteristics:
No finding proportion: 0.68
One pathology proportion: 0.17
Multiple pathologies proportion: 0.15

Exact Matches Characteristics:
Exact matches: 0.58
Exact no finding: 0.73
Exact one pathology: 0.30
Exact multiple pathologies: 0.24

Individual Pathology Characteristics:
Correct matches: 0.68
Correct no finding: 0.77
Correct pathology: 0.46


## Testing VinDr models on CheXpert

In [57]:
cheXpert_by_vindr_layer_norm_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXpert/probe_results/chexpert_predictions_by_vindr_model.csv")
cheXpert_by_vindr_layer_norm = build_image_id_to_pathology_dict(cheXpert_by_vindr_layer_norm_path, vindr_pathologies, gt_header = False)

print("CheXpert by VinDr Layer Norm on intersection of pathologies:")
_ = calculate_metrics(cheXpert_test_ground_truth, cheXpert_by_vindr_layer_norm, ignore_pathologies = ignore_pathologies_without_synonyms, synonym_mappings = synonym_mappings_vindr_chexpert)

CheXpert by VinDr Layer Norm on intersection of pathologies:
Dataset Characteristics:
No finding proportion: 0.25
One pathology proportion: 0.14
Multiple pathologies proportion: 0.62

Exact Matches Characteristics:
Exact matches: 0.19
Exact no finding: 0.65
Exact one pathology: 0.04
Exact multiple pathologies: 0.05

Individual Pathology Characteristics:
Correct matches: 0.59
Correct no finding: 0.69
Correct pathology: 0.57


## CheXbench Evaluations 

In [26]:
image_text_reasoning_evaluation_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXbench/openi_itr_responses_answer_guidance")
image_text_reasoning_evaluation_path_unguided = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/evaluation/CheXbench/openi_itr_responses_unguided")
image_text_reasoning_gt = Path('/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/CheXbench/image_text_reasoning_task')

def eval_cheXbench_image_text_reasoning(gt_path, pred_path, gt_header = True, pred_header = True):
    image_text_reasoning_predictions = build_image_id_to_pathology_dict(pred_path, cheXpert_pathologies, gt_header = gt_header, pathologies_check= False)

    total_left_vs_right = 0
    total_mild_vs_severe = 0
    total_upper_vs_lower = 0

    correct_left_vs_right = 0
    correct_mild_vs_severe = 0
    correct_upper_vs_lower = 0

    with open(gt_path, 'r') as f:
        if gt_header:
            f.readline() # skip header

        for line in f.readlines():
            components = line.split(',')
            image_id = components[2]
            correct_option = components[4]
            option_0 = components[5]
            option_1 = components[6]

            gt = option_0 if correct_option == 0 else option_1

            possibilities = image_text_reasoning_predictions.get(image_id)
            correct = any([gt in possibility for possibility in possibilities])

            if ("left" in option_0 and "right" in option_1) or ("right" in option_0 and "left" in option_1):
                total_left_vs_right += 1
                correct_left_vs_right = correct_left_vs_right + 1 if correct else correct_left_vs_right

            elif ("mild" in option_0 and "severe" in option_1) or ("severe" in option_0 and "mild" in option_1):
                total_mild_vs_severe += 1
                correct_mild_vs_severe = correct_mild_vs_severe + 1 if correct else correct_mild_vs_severe
            
            else:
                total_upper_vs_lower += 1
                correct_upper_vs_lower = correct_upper_vs_lower + 1 if correct else correct_upper_vs_lower
            
    print(f"Image Text Reasoning Task: {str(pred_path).split('/')[-3:]}")
    print(f"Left vs Right: {correct_left_vs_right/total_left_vs_right:.2f}")
    print(f"Mild vs Severe: {correct_mild_vs_severe/total_mild_vs_severe:.2f}")
    print(f"Upper vs Lower: {correct_upper_vs_lower/total_upper_vs_lower:.2f}")

    print(f"Overall: {(correct_left_vs_right + correct_mild_vs_severe + correct_upper_vs_lower)/(total_left_vs_right + total_mild_vs_severe + total_upper_vs_lower):.2f}")

eval_cheXbench_image_text_reasoning(image_text_reasoning_gt, image_text_reasoning_evaluation_path)
eval_cheXbench_image_text_reasoning(image_text_reasoning_gt, image_text_reasoning_evaluation_path_unguided)

Image Text Reasoning Task: ['evaluation', 'CheXbench', 'openi_itr_responses_answer_guidance']
Left vs Right: 0.54
Mild vs Severe: 0.60
Upper vs Lower: 0.66
Overall: 0.56
Image Text Reasoning Task: ['evaluation', 'CheXbench', 'openi_itr_responses_unguided']
Left vs Right: 0.56
Mild vs Severe: 0.78
Upper vs Lower: 0.66
Overall: 0.60
