In [1]:
import pandas as pd

In [2]:
# CheXpert images can be found: https://stanfordaimi.azurewebsites.net/datasets/8cbd9ed4-2eb9-4565-affc-111cf4f7ebe2
data_df = pd.read_csv('../data/train_cheXbert.csv')

# Demographic labels can be found: https://stanfordaimi.azurewebsites.net/datasets/192ada7c-4d43-466e-b8bb-b81992bb80cf
demo_df = pd.DataFrame(pd.read_excel("../data/CHEXPERT_DEMO.xlsx", engine='openpyxl')) #pip install openpyxl


In [3]:
#select row 0 2z
demo_df[:0]

Unnamed: 0,PATIENT,GENDER,AGE_AT_CXR,PRIMARY_RACE,ETHNICITY


In [4]:
data_df[:0]

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding


In [5]:
print("Number of images: " + str(len(data_df)))

Number of images: 223414


In [6]:
print("Number of patients: " + str(len(demo_df)))

Number of patients: 65401


In [7]:
split =  data_df.Path.str.split("/", expand = True)
data_df["patient_id"] = split[2]
demo_df = demo_df.rename(columns={'PATIENT': 'patient_id'})
combine_df = data_df.merge(demo_df, on="patient_id")

In [8]:
combine_df.PRIMARY_RACE.value_counts()

PRIMARY_RACE
White                                        102402
Other                                         28095
White, non-Hispanic                           22154
Asian                                         20434
Unknown                                       15186
Black or African American                      9909
Race and Ethnicity Unknown                     8716
Other, Hispanic                                3621
Native Hawaiian or Other Pacific Islander      2809
Asian, non-Hispanic                            2793
Black, non-Hispanic                            2000
White, Hispanic                                 922
Other, non-Hispanic                             566
American Indian or Alaska Native                457
Patient Refused                                 405
Pacific Islander, non-Hispanic                  337
Native American, non-Hispanic                    55
Black, Hispanic                                  52
Asian, Hispanic                                  37

In [9]:
combine_df.insert(3, "race", "")

In [10]:
mask = (combine_df.PRIMARY_RACE.str.contains("Black", na=False))
combine_df.loc[mask, "race"] = "BLACK/AFRICAN AMERICAN"

mask = (combine_df.PRIMARY_RACE.str.contains("White", na=False))
combine_df.loc[mask, "race"] = "WHITE"

mask = (combine_df.PRIMARY_RACE.str.contains("Asian", na=False))
combine_df.loc[mask, "race"] = "ASIAN"

In [11]:
#remove all labels that are not asian, black or white
combine_df = combine_df[combine_df.race.isin(['ASIAN','BLACK/AFRICAN AMERICAN','WHITE'])]

In [12]:
combine_df.ETHNICITY.value_counts()

ETHNICITY
Non-Hispanic/Non-Latino    149268
Unknown                      6546
Hispanic/Latino              4726
Patient Refused               160
Not Hispanic                   15
Hispanic                        1
Name: count, dtype: int64

In [13]:
#keep only non-hispanic labels
combine_df = combine_df[combine_df.ETHNICITY.isin(["Non-Hispanic/Non-Latino","Not Hispanic"])]

In [14]:
#frontal images only (AP/PA)
combine_df = combine_df[combine_df["Frontal/Lateral"]=="Frontal"]

In [15]:
print("Total images after inclusion/exclusion criteria: " + str(len(combine_df)))

Total images after inclusion/exclusion criteria: 127130


In [16]:
print("Total patients after inclusion/exclusion criteria: " + str(combine_df.patient_id.nunique()))

Total patients after inclusion/exclusion criteria: 42884


In [17]:
from sklearn.utils import shuffle
data_df = combine_df
data_df.insert(5, "split","none", True)
unique_sub_id = data_df.patient_id.unique()

train_percent, valid_percent, test_percent = 0.60, 0.10, 0.30

unique_sub_id = shuffle(unique_sub_id)
value1 = (round(len(unique_sub_id)*train_percent))
value2 = (round(len(unique_sub_id)*valid_percent))
value3 = value1 + value2
value4 = (round(len(unique_sub_id)*test_percent))

In [18]:
print("Patients in training set: " + str(value1))

Patients in training set: 25730


In [19]:
print("Patients in validation set: " + str(value2))

Patients in validation set: 4288


In [20]:
print("Patients in testing set: " + str(value4))

Patients in testing set: 12865


In [21]:
data_df = shuffle(data_df)

train_sub_id = unique_sub_id[:value1]
validate_sub_id = unique_sub_id[value1:value3]
test_sub_id = unique_sub_id[value3:]

In [22]:
data_df.loc[data_df.patient_id.isin(train_sub_id), "split"]="train"
data_df.loc[data_df.patient_id.isin(validate_sub_id), "split"]="validate"
data_df.loc[data_df.patient_id.isin(test_sub_id), "split"]="test"

In [23]:
data_df.split.value_counts(normalize=True)

split
train       0.599898
test        0.300346
validate    0.099756
Name: proportion, dtype: float64

In [24]:
data_df.race.value_counts()

race
WHITE                     99037
ASIAN                     18830
BLACK/AFRICAN AMERICAN     9263
Name: count, dtype: int64

In [25]:
data_df.race.value_counts(normalize=True)

race
WHITE                     0.779021
ASIAN                     0.148116
BLACK/AFRICAN AMERICAN    0.072862
Name: proportion, dtype: float64

In [26]:
#clean up
data_df = data_df.sort_values(by=['Path'])
data_df = data_df.reset_index()
data_df = data_df.drop(columns=['index'])

In [28]:
data_df = data_df[data_df["AP/PA"].isin(['AP','PA'])]

In [29]:
train_df = data_df[data_df.split=="train"]
validation_df = data_df[data_df.split=="validate"]
test_df = data_df[data_df.split=="test"]

In [30]:
#False indicates no patient_id shared between groups
import numpy as np
unique_train_id = train_df.patient_id.unique()
unique_validation_id = validation_df.patient_id.unique()
unique_test_id = test_df.patient_id.unique()
all_id = np.concatenate((unique_train_id, unique_validation_id, unique_test_id), axis=None)

def contains_duplicates(X):
    return len(np.unique(X)) != len(X)

contains_duplicates(all_id)

False