In [13]:
import pandas as pd
from pathlib import Path
import pickle
import numpy as np
from collections import defaultdict

Dataset-agnostic helpers (shared by VinDr and CheXpert)

In [14]:
def create_embeddings_only_dataframe_from_collect_embeddings_dict_path(collected_embeddings_dict_path):
    """Load a pickled embeddings dict and flatten it into a DataFrame.

    The pickle is expected to hold a mapping ``image_id -> dict`` where each
    inner dict provides the keys ``patch_embeddings``, ``post_layer_norm``,
    ``q_former`` and ``language_projection``. Any other keys are ignored.

    Args:
        collected_embeddings_dict_path: Path to the pickle file.

    Returns:
        A DataFrame with one row per image and the columns ``image_id``,
        ``patch_embeddings``, ``post_layer_norm``, ``q_former`` and
        ``language_projection`` (in that order).

    Raises:
        KeyError: If an entry is missing one of the four embedding keys.
    """
    embedding_columns = ["patch_embeddings", "post_layer_norm", "q_former", "language_projection"]

    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager guarantees the handle is closed after loading.
    with open(collected_embeddings_dict_path, "rb") as f:
        collected_embeddings_dict = pickle.load(f)

    rows = [
        {"image_id": image_id, **{column: embeddings[column] for column in embedding_columns}}
        for image_id, embeddings in collected_embeddings_dict.items()
    ]
    return pd.DataFrame(rows, columns=["image_id"] + embedding_columns)


VinDr-specific: pathology location labels and left/right one-hot encoding

In [2]:
def create_locations_only_dataframe_from_reference_file_VinDr(reference_file_path):
    """Build a per-image table of pathology labels and their locations.

    The reference file is read as text: the first line is skipped as a header
    and every following non-blank line must contain exactly three
    comma-separated fields, ``image_id,label,location``.

    Args:
        reference_file_path: Path to the reference file.

    Returns:
        A DataFrame with one row per image id (in order of first appearance)
        and the columns ``image_id``, one boolean column per label, and one
        ``"<label> position"`` column per label. Labels absent for an image
        are ``False`` with position ``"none"``. Label columns are sorted
        alphabetically so the layout is identical across runs.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    labels = set()
    image_id_to_label_and_location = defaultdict(list)
    with open(reference_file_path, "r") as f:
        f.readline()  # skip header
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            image_id, label, location = line.split(",")
            image_id_to_label_and_location[image_id].append((label, location))
            labels.add(label)

    # Iterating a set of strings has no stable order between interpreter runs;
    # sort once so the resulting column order is deterministic.
    ordered_labels = sorted(labels)
    position_columns = [f"{label} position" for label in ordered_labels]
    column_names = ["image_id"] + ordered_labels + position_columns

    default_row_data = {label: False for label in ordered_labels}
    default_row_data.update({position_column: "none" for position_column in position_columns})

    rows = []
    for image_id, label_and_location_list in image_id_to_label_and_location.items():
        row = default_row_data.copy()
        row["image_id"] = image_id
        for label, location in label_and_location_list:
            row[label] = True
            row[f"{label} position"] = location
        rows.append(row)

    return pd.DataFrame(rows, columns=column_names)


def merge_embeddings_and_locations(embeddings_df, locations_df):
    """Inner-join the embeddings table with the locations table on ``image_id``.

    Only image ids present in both inputs appear in the result.
    """
    return pd.merge(left=embeddings_df, right=locations_df, how="inner", on="image_id")

def convert_merged_embeddings_and_locations_to_one_hot_encoding_by_pathology_location(merged_embeddings_and_locations):
    """Replace textual ``"<pathology> position"`` columns with left/right indicators.

    Every column whose name contains ``'position'`` (except
    ``'No finding position'``) is turned into two integer columns,
    ``'left <pathology>'`` and ``'right <pathology>'``. An indicator is 1 when
    the position text contains the substring ``'left'`` / ``'right'``
    respectively (both can be 1 for the same row), otherwise 0.

    Args:
        merged_embeddings_and_locations: DataFrame containing ``image_id``,
            the four embedding columns and the position columns. It is not
            modified.

    Returns:
        A new DataFrame with ``image_id``, the four embedding columns and,
        for each pathology in input column order, its ``left``/``right``
        indicator columns. The original position columns and any other
        columns are not included.
    """
    columns_to_keep = ['image_id', 'patch_embeddings', 'post_layer_norm', 'q_former', 'language_projection']
    position_columns = [
        col for col in merged_embeddings_and_locations.columns
        if 'position' in col and col != 'No finding position'
    ]

    # Work on an explicit copy so new columns are never written into a view
    # of the caller's frame (avoids SettingWithCopy issues).
    one_hot_df = merged_embeddings_and_locations[columns_to_keep].copy()

    for col in position_columns:
        condition_name = col.replace(' position', '')
        # astype(str) makes missing / non-string cells harmless: their text
        # form contains neither 'left' nor 'right'.
        locations = merged_embeddings_and_locations[col].astype(str)
        # Assign positionally (to_numpy) so rows are matched one-to-one even
        # when the index contains duplicate labels.
        one_hot_df[f'left {condition_name}'] = (
            locations.str.contains('left', regex=False).to_numpy().astype('int64')
        )
        one_hot_df[f'right {condition_name}'] = (
            locations.str.contains('right', regex=False).to_numpy().astype('int64')
        )

    return one_hot_df


In [11]:
# Build the VinDr frame: embeddings joined with per-pathology left/right indicators.
collected_embeddings_dict_path = Path('/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/embeddings/VinDr/embeddings_only_dict/test_all_embeddings_dict.pkl')
reference_locations_file_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/VinDr-CXR/image_text_reasoning_datasets/test_all_left_or_right")
collected_embeddings_df = create_embeddings_only_dataframe_from_collect_embeddings_dict_path(collected_embeddings_dict_path)
locations_df = create_locations_only_dataframe_from_reference_file_VinDr(reference_locations_file_path)
# Keep the merge result under its own name: binding it to `merge_embeddings_and_locations`
# would replace the function with a DataFrame and make this cell fail when re-run.
merged_embeddings_and_locations = merge_embeddings_and_locations(collected_embeddings_df, locations_df)
merged_df_with_one_hot_pathology_locations = convert_merged_embeddings_and_locations_to_one_hot_encoding_by_pathology_location(merged_embeddings_and_locations)

CheXpert-specific: pathology labels read from the CheXpert CSV

In [7]:
def convert_CheXpert_csv_to_df(chexpert_csv_path):
    """Read a CheXpert label CSV into a DataFrame with integer pathology columns.

    Missing values in the 14 pathology columns are replaced by 0 and the
    columns are cast to ``int``. The ``Path`` column is shortened to its last
    three ``/``-separated components.
    """
    def _keep_last_three_components(path):
        return '/'.join(path.split('/')[-3:])

    # 'Age' is deliberately not converted: it is not present in the test set.
    pathology_columns = [
        'No Finding',
        'Enlarged Cardiomediastinum',
        'Cardiomegaly',
        'Lung Opacity',
        'Lung Lesion',
        'Edema',
        'Consolidation',
        'Pneumonia',
        'Atelectasis',
        'Pneumothorax',
        'Pleural Effusion',
        'Pleural Other',
        'Fracture',
        'Support Devices',
    ]

    with open(chexpert_csv_path, "r") as csv_file:
        chexpert_df = pd.read_csv(csv_file)

    filled_pathologies = chexpert_df[pathology_columns].fillna(0)
    chexpert_df[pathology_columns] = filled_pathologies.astype(int)
    chexpert_df["Path"] = chexpert_df["Path"].apply(_keep_last_three_components)
    return chexpert_df

def merge_CheXpert_embeddings_and_locations(embeddings_df, chexpert_df):
    """Inner-join embeddings with CheXpert labels and drop the join key ``Path``.

    Rows are matched where ``embeddings_df["image_id"]`` equals
    ``chexpert_df["Path"]``; ``image_id`` is kept in the result.
    """
    joined = pd.merge(
        left=embeddings_df,
        right=chexpert_df,
        how="inner",
        left_on="image_id",
        right_on="Path",
    )
    return joined.drop(columns=["Path"])

In [15]:
# Inputs for the CheXpert run: the pickled embeddings dict and the label CSV.
embeddings_CheXpert_dict_path = Path('/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/embeddings/CheXpert-small/embeddings_only_dict/chexpert_train_5002_10001_dict.pkl')
# NOTE(review): despite the "test" in its name, this path points at a train sample CSV.
# A sibling file ".../CheXpert/train_sample_paths/5,000.csv" was used here previously.
chexpert_test_csv_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/CheXpert/train_sample_paths/10,000.csv")

# Load both tables, then join them on the image path.
embeddings_CheXpert_df = create_embeddings_only_dataframe_from_collect_embeddings_dict_path(embeddings_CheXpert_dict_path)
chexpert_pathology_df = convert_CheXpert_csv_to_df(chexpert_test_csv_path)
merged_CheXpert_df = merge_CheXpert_embeddings_and_locations(embeddings_CheXpert_df, chexpert_pathology_df)

In [17]:
# Preview the first rows of the embeddings-only frame (image_id plus the four embedding columns).
embeddings_CheXpert_df.head()

Unnamed: 0,image_id,patch_embeddings,post_layer_norm,q_former,language_projection
0,patient15975/study3/view1_frontal.jpg,[[[[-0.1558 -0.1578 -0.1583 -0.1562 -0...,"[[1.273, 0.5244, -0.03088, 0.05408, 1.359, 0.5...","[[[1.019, -0.471, 1.163, 0.6694, 0.00214, -0.4...","[[[0.515, -0.3674, -0.3328, 0.7563, 0.07947, -..."
1,patient15988/study1/view1_frontal.jpg,[[[[-0.2012 -0.2012 -0.2756 -0.3809 -0.285...,"[[-0.3794, -1.029, -0.0471, -0.9253, 1.742, 1....","[[[-0.3745, -0.03934, 0.7383, 0.0373, -0.4219,...","[[[0.4111, -0.0527, -0.743, 0.785, -0.1957, -0..."
2,patient15989/study3/view1_frontal.jpg,[[[[-0.1804 -0.1804 -0.1804 -0.1786 -0.177...,"[[-0.11426, 0.9746, 0.5996, -0.8228, 2.998, 0....","[[[-1.077, -0.3274, 0.03067, -0.2573, -0.2255,...","[[[-0.2815, 0.2198, -0.4973, 0.252, -0.8877, 0..."
3,patient15997/study1/view1_frontal.jpg,[[[[-0.2102 -0.2035 -0.204 -0.1553 -0...,"[[1.23, 0.082, -0.04407, -0.9585, 2.607, 0.221...","[[[-0.3474, 0.575, -0.508, 0.1376, -0.6655, -0...","[[[-0.5073, 0.3545, -0.05658, 0.882, -0.1976, ..."
4,patient15997/study2/view1_frontal.jpg,[[[[-0.1991 -0.2136 -0.1929 -0.2062 -0.190...,"[[-0.5054, 1.371, 0.3254, -1.27, 1.829, 0.652,...","[[[-0.2583, -0.3179, -0.0903, 0.5796, -1.316, ...","[[[0.2734, 0.1741, -0.3115, 0.9272, -0.3972, 0..."


In [16]:
# Preview the first rows of the merged frame (embeddings joined with the CheXpert CSV columns).
merged_CheXpert_df.head()

Unnamed: 0,image_id,patch_embeddings,post_layer_norm,q_former,language_projection,Sex,Age,Frontal/Lateral,AP/PA,No Finding,...,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,patient15975/study3/view1_frontal.jpg,[[[[-0.1558 -0.1578 -0.1583 -0.1562 -0...,"[[1.273, 0.5244, -0.03088, 0.05408, 1.359, 0.5...","[[[1.019, -0.471, 1.163, 0.6694, 0.00214, -0.4...","[[[0.515, -0.3674, -0.3328, 0.7563, 0.07947, -...",Female,78,Frontal,AP,0,...,0,1,0,0,0,0,1,0,0,0
1,patient15988/study1/view1_frontal.jpg,[[[[-0.2012 -0.2012 -0.2756 -0.3809 -0.285...,"[[-0.3794, -1.029, -0.0471, -0.9253, 1.742, 1....","[[[-0.3745, -0.03934, 0.7383, 0.0373, -0.4219,...","[[[0.4111, -0.0527, -0.743, 0.785, -0.1957, -0...",Female,51,Frontal,AP,0,...,0,0,0,0,0,0,1,0,0,1
2,patient15989/study3/view1_frontal.jpg,[[[[-0.1804 -0.1804 -0.1804 -0.1786 -0.177...,"[[-0.11426, 0.9746, 0.5996, -0.8228, 2.998, 0....","[[[-1.077, -0.3274, 0.03067, -0.2573, -0.2255,...","[[[-0.2815, 0.2198, -0.4973, 0.252, -0.8877, 0...",Male,75,Frontal,AP,0,...,0,1,0,0,0,0,0,0,0,1
3,patient15997/study1/view1_frontal.jpg,[[[[-0.2102 -0.2035 -0.204 -0.1553 -0...,"[[1.23, 0.082, -0.04407, -0.9585, 2.607, 0.221...","[[[-0.3474, 0.575, -0.508, 0.1376, -0.6655, -0...","[[[-0.5073, 0.3545, -0.05658, 0.882, -0.1976, ...",Female,54,Frontal,PA,0,...,0,0,0,0,0,0,1,0,0,1
4,patient15997/study2/view1_frontal.jpg,[[[[-0.1991 -0.2136 -0.1929 -0.2062 -0.190...,"[[-0.5054, 1.371, 0.3254, -1.27, 1.829, 0.652,...","[[[-0.2583, -0.3179, -0.0903, 0.5796, -1.316, ...","[[[0.2734, 0.1741, -0.3115, 0.9272, -0.3972, 0...",Female,54,Frontal,AP,1,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Save the merged CheXpert embeddings/labels DataFrame (merged_CheXpert_df) to a pickle file.
save_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/embeddings/CheXpert-small/collated_train_5002_10001_df.pkl")
# Use a context manager so the file is flushed and closed as soon as the dump finishes.
with open(save_path, "wb") as f:
    pickle.dump(merged_CheXpert_df, f)