In [31]:
import pandas as pd
from pathlib import Path
import pickle
import numpy as np
from collections import defaultdict

In [55]:
def create_embeddings_only_dataframe_from_collect_embeddings_dict_path(collected_embeddings_dict_path):
    collected_embeddings_dict = pickle.load(open(collected_embeddings_dict_path, "rb"))
    rows = []

    for index, (key, value) in enumerate(collected_embeddings_dict.items()):
        row = {
            "image_id": key,
            "patch_embeddings": value["patch_embeddings"],
            "post_layer_norm": value["post_layer_norm"],
            "q_former": value["q_former"],
            "language_projection": value["language_projection"],
        }
        rows.append(row)
    df_embeddings = pd.DataFrame(rows, columns=["image_id", "patch_embeddings", "post_layer_norm", "q_former", "language_projection"])

    return df_embeddings


def create_locations_only_dataframe_from_reference_file(reference_file_path):
    labels = set()
    image_id_to_label_and_location = defaultdict(list)
    with open(reference_file_path, "r") as f:
        f.readline() # skip header
        for line in f:
            image_id, label, location = line.strip().split(",")
            image_id_to_label_and_location[image_id].append((label, location))
            labels.add(label)
    
    column_names = ["image_id"] + [f"{label}" for label in labels] + [f"{label} position" for label in labels]
    default_row_data = {label: False for label in labels}
    default_row_data.update({f"{label} position": "none" for label in labels})
    rows = []

    for image_id, label_and_location_list in image_id_to_label_and_location.items():
        row = default_row_data.copy()
        row["image_id"] = image_id
        for label, location in label_and_location_list:
            row[label] = True
            row[f"{label} position"] = location
        rows.append(row)

    df_locations_only = pd.DataFrame(rows, columns=column_names)
    return df_locations_only


def merge_embeddings_and_locations(embeddings_df, locations_df):
    merged = pd.merge(embeddings_df, locations_df, on="image_id")
    return merged

def convert_merged_embeddings_and_locations_to_one_hot_encoding_by_pathology_location(merged_embeddings_and_locations):
    columns_to_keep = ['image_id', 'patch_embeddings', 'post_layer_norm', 'q_former', 'language_projection']
    position_columns = [col for col in merged_embeddings_and_locations.columns if 'position' in col and col != 'No finding position']
    merged_df_with_one_hot_pathology_locations = merged_embeddings_and_locations[columns_to_keep + position_columns]

    # 2. Generate 'left XXX' and 'right XXX' columns and initialize with zeros
    for col in position_columns:
        condition_name = col.replace(' position', '')
        merged_df_with_one_hot_pathology_locations[f'left {condition_name}'] = 0
        merged_df_with_one_hot_pathology_locations[f'right {condition_name}'] = 0

    # 3. Iterate over the DataFrame and set 1 for left/right as necessary
    for index, row in merged_df_with_one_hot_pathology_locations.iterrows():
        for pos_col in position_columns:
            condition_name = pos_col.replace(' position', '')
            if 'left' in row[pos_col]:
                merged_df_with_one_hot_pathology_locations.loc[index, f'left {condition_name}'] = 1
                # new_df.at[index, f'left {condition_name}'] = 1
            if 'right' in row[pos_col]:
                merged_df_with_one_hot_pathology_locations.loc[index, f'right {condition_name}'] = 1
                # new_df.at[index, f'right {condition_name}'] = 1

    # Dropping the original position columns
    merged_df_with_one_hot_pathology_locations.drop(columns=position_columns, inplace=True)
    return merged_df_with_one_hot_pathology_locations


In [56]:
collected_embeddings_dict_path = Path('/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/train_pathology_unanimous_agreement_random_radiologist.pkl')
reference_locations_file_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/VinDr-CXR/image_text_reasoning_datasets/train_pathology_left_or_right_unaninmous_agreement_random_radiologist")
collected_embeddings_df = create_embeddings_only_dataframe_from_collect_embeddings_dict_path(collected_embeddings_dict_path)
locations_df = create_locations_only_dataframe_from_reference_file(reference_locations_file_path)
merge_embeddings_and_locations = merge_embeddings_and_locations(collected_embeddings_df, locations_df)
merged_df_with_one_hot_pathology_locations = convert_merged_embeddings_and_locations_to_one_hot_encoding_by_pathology_location(merge_embeddings_and_locations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_with_one_hot_pathology_locations[f'left {condition_name}'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_with_one_hot_pathology_locations[f'right {condition_name}'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_with_one_hot_pathology_locations[f'left {conditi

In [61]:
# save merge_embeddings_and_locations to pickle 
save_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/collated_train_pathology_only.pkl")
pickle.dump(merge_embeddings_and_locations, open(save_path, "wb"))

save_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/collated_train_pathology_only_one_hot.pkl")
pickle.dump(merged_df_with_one_hot_pathology_locations, open(save_path, "wb"))