In [1]:
import pandas as pd
import numpy as np
import os
import json
from PIL import Image
import matplotlib.pyplot as plt
from hergen.utils.constants import CHEXPERT_COMPETITION_TASKS, CHEXPERT_TASKS, CHEXPERT_UNCERTAIN_MAPPINGS

In [2]:
# TODO: change your path here:
mimic_dir = "/data1/r20user2/CXR_dataset/mimic_data/2.0.0"
mimic_annotation_dir = "/data1/r20user2/CXR_dataset/knowledge_graph"
mimic_img_dir = os.path.join(mimic_dir, "files")

In [3]:
master_df = pd.read_csv(os.path.join(mimic_dir, "master.csv"))
# keep one image one study
# master_df.drop_duplicates(subset=["subject_id", "study_id"], inplace=True)

chexpert_df = pd.read_csv(os.path.join(mimic_dir, "mimic-cxr-2.0.0-chexpert.csv"))
chexpert_df.fillna(0, inplace=True)

master_df["report"] = master_df["impression"] + " " + master_df["findings"]

In [4]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232421 entries, 0 to 232420
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Path          232421 non-null  object 
 1   dicom_id      232421 non-null  object 
 2   subject_id    232421 non-null  int64  
 3   study_id      232421 non-null  object 
 4   ViewPosition  232421 non-null  object 
 5   StudyDate     232421 non-null  int64  
 6   StudyTime     232421 non-null  float64
 7   impression    232421 non-null  object 
 8   findings      232421 non-null  object 
 9   split         232421 non-null  object 
 10  report        232421 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 19.5+ MB


In [5]:
master_df["split"].value_counts()

split
train    227459
test       3082
valid      1880
Name: count, dtype: int64

In [6]:
# # only use training set
# split = "train"
# split_df = master_df[master_df["split"] == split]
# split_df["imgpath"] = split_df["Path"].apply(lambda x: os.path.join(mimic_img_dir, x))
# split_df["study_id"] = split_df["study_id"].apply(lambda x: int(x[1:]))

master_df["study_id"] = master_df["study_id"].apply(lambda x: int(x[1:]))
merged_df = pd.merge(master_df, chexpert_df, how="left", on=["subject_id", "study_id"])

In [7]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232421 entries, 0 to 232420
Data columns (total 25 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Path                        232421 non-null  object 
 1   dicom_id                    232421 non-null  object 
 2   subject_id                  232421 non-null  int64  
 3   study_id                    232421 non-null  int64  
 4   ViewPosition                232421 non-null  object 
 5   StudyDate                   232421 non-null  int64  
 6   StudyTime                   232421 non-null  float64
 7   impression                  232421 non-null  object 
 8   findings                    232421 non-null  object 
 9   split                       232421 non-null  object 
 10  report                      232421 non-null  object 
 11  Atelectasis                 232421 non-null  float64
 12  Cardiomegaly                232421 non-null  float64
 13  Consolidation 

In [8]:
np.random.seed(42)

train_df = merged_df.loc[merged_df["split"] == "train"]
print(merged_df.shape)

task_dfs = []
for i, t in enumerate(CHEXPERT_COMPETITION_TASKS):
    index = np.zeros(14)
    index[i] = 1
    df_task = train_df[
        (train_df["Atelectasis"] == index[0])
        & (train_df["Cardiomegaly"] == index[1])
        & (train_df["Consolidation"] == index[2])
        & (train_df["Edema"] == index[3])
        & (train_df["Pleural Effusion"] == index[4])
        & (train_df["Enlarged Cardiomediastinum"] == index[5])
        & (train_df["Lung Lesion"] == index[7])
        & (train_df["Lung Opacity"] == index[8])
        & (train_df["Pneumonia"] == index[9])
        & (train_df["Pneumothorax"] == index[10])
        & (train_df["Pleural Other"] == index[11])
        & (train_df["Fracture"] == index[12])
        & (train_df["Support Devices"] == index[13])
    ]
    df_task = df_task.sample(n=200, random_state=42)
    task_dfs.append(df_task)
df_200 = pd.concat(task_dfs)

mimic_df_200 = df_200[["subject_id", "study_id", "dicom_id", "ViewPosition", "report"] + CHEXPERT_TASKS]
mimic_df_200.to_csv("mimic-cxr-5x200-val-meta.csv", index=False)

merged_df = merged_df[~merged_df["Path"].isin(df_200["Path"])]
print(merged_df.shape)

(232421, 25)
(231421, 25)


In [9]:
annotation_file = os.path.join(mimic_annotation_dir, "mimic_annotation.json")

with open(annotation_file) as f:
    all_examples = json.load(f)

In [10]:
len(all_examples["test"])

3858

In [11]:
merged_df = merged_df.loc[:, ["dicom_id", "ViewPosition", "StudyDate", "StudyTime", "split"] + CHEXPERT_TASKS]

In [12]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231421 entries, 0 to 232420
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   dicom_id                    231421 non-null  object 
 1   ViewPosition                231421 non-null  object 
 2   StudyDate                   231421 non-null  int64  
 3   StudyTime                   231421 non-null  float64
 4   split                       231421 non-null  object 
 5   No Finding                  231421 non-null  float64
 6   Enlarged Cardiomediastinum  231421 non-null  float64
 7   Cardiomegaly                231421 non-null  float64
 8   Lung Lesion                 231421 non-null  float64
 9   Lung Opacity                231421 non-null  float64
 10  Edema                       231421 non-null  float64
 11  Consolidation               231421 non-null  float64
 12  Pneumonia                   231421 non-null  float64
 13  Atelectasis        

In [13]:
merged_df["split"].value_counts()

split
train    226459
test       3082
valid      1880
Name: count, dtype: int64

In [14]:
merged_df["split"] = merged_df["split"].replace({"valid": "val"})

In [15]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231421 entries, 0 to 232420
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   dicom_id                    231421 non-null  object 
 1   ViewPosition                231421 non-null  object 
 2   StudyDate                   231421 non-null  int64  
 3   StudyTime                   231421 non-null  float64
 4   split                       231421 non-null  object 
 5   No Finding                  231421 non-null  float64
 6   Enlarged Cardiomediastinum  231421 non-null  float64
 7   Cardiomegaly                231421 non-null  float64
 8   Lung Lesion                 231421 non-null  float64
 9   Lung Opacity                231421 non-null  float64
 10  Edema                       231421 non-null  float64
 11  Consolidation               231421 non-null  float64
 12  Pneumonia                   231421 non-null  float64
 13  Atelectasis        

In [16]:
new_dataset_ds = dict()
for split in ["train", "val", "test"]:
    print(split)
    
    examples = all_examples[split]
    print(len(examples))
    dataset_as_dfs = pd.DataFrame(examples)

    split_df = merged_df[merged_df["split"] == split]
    split_df.drop(columns=["split"], inplace=True)

    # merge json with metadata
    df = pd.merge(dataset_as_dfs, split_df, left_on="id", right_on="dicom_id", how="left")
    # only keep frontal view
    frontal_df = df.loc[df["ViewPosition"].isin(["PA", "AP"])]
    
    frontal_df.drop(columns=["dicom_id"], inplace=True)
    frontal_df.drop_duplicates(subset=["subject_id", "study_id", "report"], inplace=True)
    frontal_df.sort_values(by=["subject_id", "StudyDate", "StudyTime"], inplace=True)
    frontal_df.reset_index(drop=True, inplace=True)

    new_dataset_ds[split] = frontal_df.to_dict("records")
    print(len(frontal_df))
    

train
270790


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split_df.drop(columns=["split"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frontal_df.drop(columns=["dicom_id"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frontal_df.drop_duplicates(subset=["subject_id", "study_id", "report"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

145471
val
2130
1151
test
3858
2210


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split_df.drop(columns=["split"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frontal_df.drop(columns=["dicom_id"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frontal_df.drop_duplicates(subset=["subject_id", "study_id", "report"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

In [18]:
temporal_cxr_dir = os.path.join(mimic_annotation_dir, "../temporal_CXR")
os.makedirs(temporal_cxr_dir, exist_ok=True)
with open(os.path.join(temporal_cxr_dir, "mimic_annotation.json"), "w") as f:
    json.dump(new_dataset_ds, f)

In [24]:
temporal_dataset_ds = dict()
for split in new_dataset_ds.keys():
    examples = new_dataset_ds[split]
    dataset_as_dfs = pd.DataFrame(examples)
    print(f"Number of original {split}: {len(dataset_as_dfs)}")

    subject_cnts = dataset_as_dfs["subject_id"].value_counts()
    # only keep subjects with at least 2 images
    cur_subject_cnts = subject_cnts[subject_cnts >= 2]
    dataset_as_dfs = dataset_as_dfs.loc[dataset_as_dfs["subject_id"].isin(
        cur_subject_cnts.index.tolist())]
    
    temporal_dataset_ds[split] = dataset_as_dfs.to_dict("records")
    print(f"Number of temporal {split}: {len(dataset_as_dfs)}")

Number of original train: 145471
Number of temporal train: 111750
Number of original val: 1151
Number of temporal val: 897
Number of original test: 2210
Number of temporal test: 2183


In [25]:
with open(os.path.join(temporal_cxr_dir, "longitudinal_mimic_annotation.json"), "w") as f:
    json.dump(temporal_dataset_ds, f)