In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
# Folder holding the raw CheXpert metadata files, relative to this notebook.
data_dir = '../data/'

# The demographics sheet contains extra information about race and ethnicity.
# Its columns are renamed for convenience.
demo_column_names = {
    'PRIMARY_RACE': 'race',
    'PATIENT': 'patient_id',
    'GENDER': 'sex',
    'AGE_AT_CXR': 'age',
    'ETHNICITY': 'ethnicity',
}
df_demo = (
    pd.read_excel(data_dir + 'CHEXPERT_DEMO.xlsx', engine='openpyxl')
    .rename(columns=demo_column_names)
    # sex and age are discarded from the demographics table; the image-level
    # table loaded later carries its own sex/age columns.
    .drop(['sex', 'age'], axis=1)
)
print("df demo data")
print(df_demo.shape)
df_demo.head()


df demo data
(65401, 3)


Unnamed: 0,patient_id,race,ethnicity
0,patient24428,White,Non-Hispanic/Non-Latino
1,patient48289,Other,Hispanic/Latino
2,patient33856,White,Non-Hispanic/Non-Latino
3,patient41673,Unknown,Unknown
4,patient48493,White,Non-Hispanic/Non-Latino


In [3]:
# Image-level label table read from train_cheXbert.csv.
chexbert_train_csv = data_dir + 'train_cheXbert.csv'
df_img_data_original_chexbert = pd.read_csv(chexbert_train_csv)
print("df_img_data_original_chexbert")
print(df_img_data_original_chexbert.shape)
df_img_data_original_chexbert.head()

df_img_data_original_chexbert
(223414, 19)


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,,,,,,,,,0.0,,,,1.0,1.0
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,1.0,,,-1.0,,,,,,1.0,,
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,,,1.0,,,-1.0,,,,,,1.0,,
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,1.0,,,,0.0,,,,,


In [4]:
# Lowercase the Age/Sex column names so they follow the naming used for the
# demographics table. rename() returns a new frame, so the originally loaded
# table is left untouched.
df_img_data = df_img_data_original_chexbert.rename(
    columns={'Age': 'age', 'Sex': 'sex'}
)
print(df_img_data.shape)
df_img_data.head()

(223414, 19)


Unnamed: 0,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,,,,,,,,,0.0,,,,1.0,1.0
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,1.0,,,-1.0,,,,,,1.0,,
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,,,1.0,,,-1.0,,,,,,1.0,,
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,1.0,,,,0.0,,,,,


In [5]:
# Break every image path into its "/"-separated components, one column per
# component; the patient ID is read from one of these columns afterwards.
split = df_img_data['Path'].str.split("/", expand=True)
print(split.shape)
split.head()

(223414, 5)


Unnamed: 0,0,1,2,3,4
0,CheXpert-v1.0,train,patient00001,study1,view1_frontal.jpg
1,CheXpert-v1.0,train,patient00002,study2,view1_frontal.jpg
2,CheXpert-v1.0,train,patient00002,study1,view1_frontal.jpg
3,CheXpert-v1.0,train,patient00002,study1,view2_lateral.jpg
4,CheXpert-v1.0,train,patient00003,study1,view1_frontal.jpg


In [6]:
# Attach the patient ID, i.e. path component 2 ("patientNNNNN"), as a column.
df_img_data = df_img_data.assign(patient_id=split[2])
print(df_img_data.shape)
df_img_data.head()

(223414, 20)


Unnamed: 0,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,patient_id
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,,,,,,,,,0.0,,,,1.0,1.0,patient00001
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,,patient00002
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,1.0,,,-1.0,,,,,,1.0,,,patient00002
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,,,1.0,,,-1.0,,,,,,1.0,,,patient00002
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,1.0,,,,0.0,,,,,,patient00003


In [7]:
# Join demographics onto the image table by patient ID. This is an inner join
# (the pandas default), so images without a demographics record are dropped.
df_cxr = pd.merge(df_demo, df_img_data, on="patient_id", how="inner")
print(df_cxr.shape)
df_cxr.head()

(222561, 22)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,-1.0,,,,,,,,1.0,
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,0.0,0.0,,,,-1.0,,,,
2,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view2...,Male,61,Lateral,,,,...,0.0,0.0,,,,-1.0,,,,
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,,,,,1.0,,,1.0,
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,1.0,,,1.0,,,,,1.0,


In [8]:
# Distribution of the raw race values, before any harmonisation
df_cxr['race'].value_counts()

race
White                                        102402
Other                                         28095
White, non-Hispanic                           22154
Asian                                         20434
Unknown                                       15186
Black or African American                      9909
Race and Ethnicity Unknown                     8716
Other, Hispanic                                3621
Native Hawaiian or Other Pacific Islander      2809
Asian, non-Hispanic                            2793
Black, non-Hispanic                            2000
White, Hispanic                                 922
Other, non-Hispanic                             566
American Indian or Alaska Native                457
Patient Refused                                 405
Pacific Islander, non-Hispanic                  337
Native American, non-Hispanic                    55
Black, Hispanic                                  52
Asian, Hispanic                                  37
Native 

In [9]:
white = 'White'
asian = 'Asian'
black = 'Black'

# Collapse the free-text race values into three canonical group names: any
# value that contains the group name (e.g. "White, non-Hispanic",
# "Black or African American") is rewritten to the bare name. Values matching
# none of the groups are left unchanged here; no rows are removed in this cell.
# The groups are processed in the order Black, White, Asian.
for race_group in (black, white, asian):
    # regex=False: the group names are plain substrings, not patterns.
    mask = df_cxr['race'].str.contains(race_group, na=False, regex=False)
    df_cxr.loc[mask, 'race'] = race_group

print(df_cxr.shape)
df_cxr.head()

(222561, 22)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,-1.0,,,,,,,,1.0,
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,0.0,0.0,,,,-1.0,,,,
2,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view2...,Male,61,Lateral,,,,...,0.0,0.0,,,,-1.0,,,,
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,,,,,1.0,,,1.0,
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,1.0,,,1.0,,,,,1.0,


In [10]:
# Keep only rows whose harmonised race is Asian, Black or White
kept_races = [asian, black, white]
df_cxr = df_cxr[df_cxr['race'].isin(kept_races)]
print(df_cxr.shape)
df_cxr.head()

(160724, 22)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,-1.0,,,,,,,,1.0,
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,0.0,0.0,,,,-1.0,,,,
2,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view2...,Male,61,Lateral,,,,...,0.0,0.0,,,,-1.0,,,,
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,,,,,1.0,,,1.0,
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,1.0,,,1.0,,,,,1.0,


In [11]:
# Race distribution once only White, Black and Asian rows remain
df_cxr['race'].value_counts()

race
White    125491
Asian     23272
Black     11961
Name: count, dtype: int64

In [12]:
# Ethnicity value counts
df_cxr['ethnicity'].value_counts()

ethnicity
Non-Hispanic/Non-Latino    149268
Unknown                      6546
Hispanic/Latino              4726
Patient Refused               160
Not Hispanic                   15
Hispanic                        1
Name: count, dtype: int64

In [13]:
# Keep only rows explicitly recorded as non-Hispanic. Besides Hispanic/Latino
# rows, this also removes every other ethnicity value (e.g. unknown or refused).
non_hispanic_values = ["Non-Hispanic/Non-Latino", "Not Hispanic"]
df_cxr = df_cxr[df_cxr['ethnicity'].isin(non_hispanic_values)]
print(df_cxr.shape)
df_cxr.head()

(149283, 22)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,-1.0,,,,,,,,1.0,
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,0.0,0.0,,,,-1.0,,,,
2,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view2...,Male,61,Lateral,,,,...,0.0,0.0,,,,-1.0,,,,
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,,,,,1.0,,,1.0,
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,1.0,,,1.0,,,,,1.0,


In [14]:
# Ethnicity value counts after filtering
df_cxr['ethnicity'].value_counts()

ethnicity
Non-Hispanic/Non-Latino    149268
Not Hispanic                   15
Name: count, dtype: int64

In [15]:
# Restrict the data to frontal views; every other view type is discarded
is_frontal = df_cxr["Frontal/Lateral"] == "Frontal"
df_cxr = df_cxr[is_frontal]
print(df_cxr.shape)
df_cxr.head()

(127130, 22)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,-1.0,,,,,,,,1.0,
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,0.0,0.0,,,,-1.0,,,,
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,,,,,1.0,,,1.0,
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,1.0,,,1.0,,,,,1.0,
5,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study45/view1...,Male,61,Frontal,AP,,,...,1.0,,,,,1.0,,,1.0,


In [16]:
# assign race label
# Encode race as an integer class id (White=0, Asian=1, Black=2).
# Mapping the string column yields an integer column directly, instead of
# copying the strings and overwriting them value-by-value with ints (which
# leaves a mixed object column). assign() returns a new frame, so nothing is
# written into a filtered slice. Only the three mapped races are expected at
# this point because the frame was filtered with isin([asian, black, white]);
# any other value would come out as NaN.
race_to_label = {white: 0, asian: 1, black: 2}
df_cxr = df_cxr.assign(race_label=df_cxr['race'].map(race_to_label))
print(df_cxr.shape)
df_cxr.head()

(127130, 23)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,race_label
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,,,,,,,,1.0,,0
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,0.0,,,,-1.0,,,,,0
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,,,,1.0,,,1.0,,0
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,,,1.0,,,,,1.0,,0
5,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study45/view1...,Male,61,Frontal,AP,,,...,,,,,1.0,,,1.0,,0


In [17]:
# assign sex label
# Encode sex as an integer class id (Male=0, Female=1) by mapping the string
# column rather than copying it and overwriting strings with ints in place.
# The sex column has not been filtered before this cell, so a value other than
# 'Male'/'Female' is possible; the nullable 'Int64' dtype keeps the column
# integer-typed and marks such rows as <NA> instead of leaving the raw string
# inside the label column.
sex_to_label = {'Male': 0, 'Female': 1}
df_cxr = df_cxr.assign(
    sex_label=df_cxr['sex'].map(sex_to_label).astype('Int64')
)
print(df_cxr.shape)
df_cxr.head()

(127130, 24)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,race_label,sex_label
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,,,,,,,1.0,,0,0
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,,,,-1.0,,,,,0,0
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,,,1.0,,,1.0,,0,0
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,,1.0,,,,,1.0,,0,0
5,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study45/view1...,Male,61,Frontal,AP,,,...,,,,1.0,,,1.0,,0,0


In [18]:
# Finding columns of the label table. The order is significant: a later cell
# selects entries by position (index 0 is 'No Finding', index 10 is
# 'Pleural Effusion').
labels = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
]

In [19]:

# assign disease label
# Three-way disease class per image:
#   'Pleural Effusion' (1) if the Pleural Effusion column equals 1,
#   'No Finding'       (0) else if the No Finding column equals 1,
#   'Other'            (2) for everything else.
# Pleural Effusion takes precedence over No Finding, so it is listed first
# (np.select uses the first matching condition).
# The class names are built in a fresh string array rather than written into a
# float copy of the 'No Finding' column, which is an incompatible-dtype
# assignment in pandas. As a consequence, rows whose 'No Finding' value is
# neither 1 nor missing are now labelled 'Other' instead of keeping that raw
# number as their disease/disease_label.
no_finding = labels[0]
pleural_effusion = labels[10]
disease = np.select(
    [df_cxr[pleural_effusion] == 1, df_cxr[no_finding] == 1],
    [pleural_effusion, no_finding],
    default='Other',
)
disease_to_label = {no_finding: 0, pleural_effusion: 1, 'Other': 2}
df_cxr = df_cxr.assign(
    disease=disease,
    disease_label=lambda frame: frame['disease'].map(disease_to_label),
)
print(df_cxr.shape)
df_cxr.head()


(127130, 26)


  df_cxr.loc[df_cxr[labels[0]] == 1, 'disease'] = labels[0]


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,race_label,sex_label,disease,disease_label
0,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study22/view1...,Male,59,Frontal,AP,,,...,,,,,1.0,,0,0,Other,2
1,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study39/view1...,Male,61,Frontal,PA,,,...,,-1.0,,,,,0,0,Other,2
3,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study61/view1...,Male,61,Frontal,AP,,,...,,1.0,,,1.0,,0,0,Pleural Effusion,1
4,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study35/view1...,Male,60,Frontal,AP,,,...,,,,,1.0,,0,0,Other,2
5,patient24428,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient24428/study45/view1...,Male,61,Frontal,AP,,,...,,1.0,,,1.0,,0,0,Pleural Effusion,1


In [20]:
# clean data
# Keep only images whose projection is AP or PA, order the rows by image path
# and renumber the index from 0. The split file loaded further down is joined
# on this index, so the row order produced here must not change.
has_projection = df_cxr["AP/PA"].isin(['AP', 'PA'])
df_cxr = (
    df_cxr[has_projection]
    .sort_values(by=['Path'])
    .reset_index(drop=True)
)
print(df_cxr.shape)
df_cxr.head()


(127118, 26)


Unnamed: 0,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,race_label,sex_label,disease,disease_label
0,patient00002,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,...,,,,1.0,,,0,1,Other,2
1,patient00002,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,...,,-1.0,,1.0,,,0,1,Other,2
2,patient00003,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,...,0.0,,,,,,0,0,Other,2
3,patient00004,Black,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00004/study1/view1_...,Female,20,Frontal,PA,0.0,,...,,0.0,,,,1.0,2,1,No Finding,0
4,patient00005,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00005/study1/view1_...,Male,33,Frontal,PA,1.0,0.0,...,,0.0,,,1.0,1.0,0,0,No Finding,0


In [21]:
# Race distribution of the cleaned table
df_cxr['race'].value_counts()

race
White    99027
Asian    18830
Black     9261
Name: count, dtype: int64

In [22]:
# Load the stored split assignment, using its 'index' column as the row index
split_csv = data_dir + 'chexpert_split_2021_08_20_filtered.csv'
df_data_split = pd.read_csv(split_csv, index_col='index')
print(df_data_split.shape)
df_data_split.head()



(127118, 2)


Unnamed: 0_level_0,Unnamed: 0,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,train
1,1,train
2,2,train
3,3,train
4,4,train


In [23]:
# Attach the stored split assignment to every image row. The two tables are
# matched purely on their index, so they must describe the same rows.

# Remove split columns left behind by an earlier run of this cell; without
# this, re-running the cell appends a second 'split' column.
df_cxr = df_cxr.drop(columns=df_data_split.columns, errors='ignore')
assert len(df_data_split) == len(df_cxr), (
    "split file and df_cxr have different numbers of rows"
)

# concat(axis=1) outer-joins on the index, so any index value present in only
# one table would add a row; the length check catches that misalignment.
df_cxr_with_split = pd.concat([df_cxr, df_data_split], axis=1)
assert len(df_cxr_with_split) == len(df_cxr), (
    "split file index does not line up with df_cxr"
)
df_cxr = df_cxr_with_split
df_cxr.head()

Unnamed: 0.1,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Pleural Other,Fracture,Support Devices,No Finding,race_label,sex_label,disease,disease_label,Unnamed: 0,split
0,patient00002,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,...,,1.0,,,0,1,Other,2,0,train
1,patient00002,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,...,,1.0,,,0,1,Other,2,1,train
2,patient00003,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,...,,,,,0,0,Other,2,2,train
3,patient00004,Black,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00004/study1/view1_...,Female,20,Frontal,PA,0.0,,...,,,,1.0,2,1,No Finding,0,3,train
4,patient00005,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00005/study1/view1_...,Male,33,Frontal,PA,1.0,0.0,...,,,1.0,1.0,0,0,No Finding,0,4,train


In [24]:
# Race distribution after the split column was attached
df_cxr['race'].value_counts()

race
White    99027
Asian    18830
Black     9261
Name: count, dtype: int64

In [25]:
# Ethnicity distribution of the final table
df_cxr['ethnicity'].value_counts()

ethnicity
Non-Hispanic/Non-Latino    127105
Not Hispanic                   13
Name: count, dtype: int64

In [26]:
# Sex distribution of the final table
df_cxr['sex'].value_counts()

sex
Male      74682
Female    52436
Name: count, dtype: int64

In [27]:
# Partition the rows by their assigned split
split_column = df_cxr['split']
train_df = df_cxr[split_column == "train"]
validation_df = df_cxr[split_column == "validate"]
test_df = df_cxr[split_column == "test"]

In [28]:
# Patient-level leakage check: collect the distinct patient IDs of each split
# and pool them into one array. A patient assigned to more than one split then
# appears more than once in the pool, so a result of False below means that no
# patient_id is shared between the groups.
# (numpy is already imported as np in the first cell; no re-import needed.)
unique_train_id = train_df.patient_id.unique()
unique_validation_id = validation_df.patient_id.unique()
unique_test_id = test_df.patient_id.unique()
all_id = np.concatenate(
    (unique_train_id, unique_validation_id, unique_test_id), axis=None
)

def contains_duplicates(X):
    """Return True when at least one value occurs more than once in ``X``.

    The number of distinct values (via ``np.unique``) is compared with
    ``len(X)``; the two differ exactly when some value is repeated.
    """
    n_total = len(X)
    n_distinct = len(np.unique(X))
    return n_distinct != n_total

# Expect False: a True here means some patient appears in more than one split
contains_duplicates(X=all_id)

False

In [29]:
# Preview the first five rows of the final table
df_cxr.head(n=5)

Unnamed: 0.1,patient_id,race,ethnicity,Path,sex,age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,...,Pleural Other,Fracture,Support Devices,No Finding,race_label,sex_label,disease,disease_label,Unnamed: 0,split
0,patient00002,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,...,,1.0,,,0,1,Other,2,0,train
1,patient00002,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,...,,1.0,,,0,1,Other,2,1,train
2,patient00003,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,...,,,,,0,0,Other,2,2,train
3,patient00004,Black,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00004/study1/view1_...,Female,20,Frontal,PA,0.0,,...,,,,1.0,2,1,No Finding,0,3,train
4,patient00005,White,Non-Hispanic/Non-Latino,CheXpert-v1.0/train/patient00005/study1/view1_...,Male,33,Frontal,PA,1.0,0.0,...,,,1.0,1.0,0,0,No Finding,0,4,train


# Only uncomment the cells below if you are creating the dataset for the first time

In [29]:

# # point to the parent directory that contains the folder 'CheXpert-v1.0'
# img_data_dir = '/vol/aimspace/projects/CheXpert/CheXpert/'

In [30]:
# df_cxr.to_csv(data_dir + 'chexpert.sample.csv')

In [31]:
# from skimage.io import imread
# from skimage.io import imsave
# from skimage.transform import resize

# df_cxr['path_preproc'] = df_cxr['Path']

# preproc_dir = 'preproc_224x224/'
# out_dir = img_data_dir

# if not os.path.exists(out_dir + preproc_dir):
#     os.makedirs(out_dir + preproc_dir)

# for idx, p in enumerate(tqdm(df_cxr['Path'])):

#     split =  p.split("/")
#     preproc_filename = split[2] + '_' + split[3] + '_' + split[4]
#     df_cxr.loc[idx, 'path_preproc'] = preproc_dir + preproc_filename
#     out_path = out_dir + preproc_dir + preproc_filename
    
#     if not os.path.exists(out_path):
#         image = imread(img_data_dir + p)
#         image = resize(image, output_shape=(224, 224), preserve_range=True)
#         imsave(out_path, image.astype(np.uint8))

100%|██████████| 127118/127118 [00:23<00:00, 5517.34it/s]


In [32]:
# df_train = df_cxr[df_cxr.split=="train"]
# df_val = df_cxr[df_cxr.split=="validate"]
# df_test = df_cxr[df_cxr.split=="test"]

In [33]:
# df_train.to_csv(data_dir + 'chexpert.sample.train.csv')
# df_val.to_csv(data_dir + 'chexpert.sample.val.csv')
# df_test.to_csv(data_dir + 'chexpert.sample.test.csv')

In [34]:
# white = 'White'
# asian = 'Asian'
# black = 'Black'

# df_train_white = df_train[df_train['race'] == white]
# df_val_white = df_val[df_val['race'] == white]
# df_train_white.to_csv(data_dir + 'chexpert.sample.train.white.csv')
# df_val_white.to_csv(data_dir + 'chexpert.sample.val.white.csv')

# df_cxr_white = df_cxr[df_cxr['race'] == white]
# df_cxr_asian = df_cxr[df_cxr['race'] == asian]
# df_cxr_black = df_cxr[df_cxr['race'] == black]

# df_test_white = df_test[df_test['race'] == white]
# df_test_asian = df_test[df_test['race'] == asian]
# df_test_black = df_test[df_test['race'] == black]

# df_train_asian = df_train[df_train['race'] == asian]
# df_train_black = df_train[df_train['race'] == black]

# df_val_asian = df_val[df_val['race'] == asian]
# df_val_black = df_val[df_val['race'] == black]

In [35]:
# df_train_male = df_train[df_train['sex'] == 'Male']
# df_val_male = df_val[df_val['sex'] == 'Male']
# df_train_male.to_csv(data_dir + 'chexpert.sample.train.male.csv')
# df_val_male.to_csv(data_dir + 'chexpert.sample.val.male.csv')