In [1]:
# Directory of the CSVs read by this notebook (icustays.csv and population/*.csv).
data_path = '../data/'
# Directory from which patient.csv is read.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
eicu_path = '/data4/tangsp/eicu-2.0/'

import pandas as pd
import numpy as np
import itertools
from collections import Counter

In [2]:
def _load_population_labels(task, T):
    """Read one population CSV and return its label column, indexed by 'ID'."""
    csv_path = data_path + 'population/{}_{}h.csv'.format(task, T)
    label_col = '{}_LABEL'.format(task)
    return pd.read_csv(csv_path).set_index('ID')[[label_col]]


# Partition assignment of every ICU stay, indexed by ICUStayID.
icustays = pd.read_csv(data_path + 'icustays.csv')
partition = icustays.set_index('ICUStayID')[['partition']]

tasks = ['ARF', 'Shock']
Ts = [4, 12]

# Label frames keyed by (task, T): every task/T combination, then mortality at 48.
populations = {}
for task, T in itertools.product(tasks, Ts):
    populations[task, T] = _load_population_labels(task, T)
populations['mortality', 48] = _load_population_labels('mortality', 48)

In [3]:
# One summary row per (task, T): size and label mean, overall and per partition.
df_out = []
for (task, T), labels in populations.items():
    label_col = '{}_LABEL'.format(task)
    df = labels.join(partition)  # attach each ID's partition via the index
    c = Counter(df['partition'])
    frac = df.groupby('partition').mean()[label_col]

    row = [task, T, len(df), df[label_col].mean()]
    for split in ('train', 'val', 'test'):
        row += [c[split], frac[split]]
    df_out.append(row)

In [4]:
# Column labels for the summary rows: overall, then train / val / test (count and label mean).
summary_columns = [
    'task', 'T',
    'TOTAL_N', 'TOTAL_%',
    'train_N', 'train_%',
    'val_N', 'val_%',
    'test_N', 'test_%',
]
df_out = pd.DataFrame(df_out, columns=summary_columns)

In [5]:
# Show the summary ordered by task and T, with a fresh 0..n-1 index.
(
    df_out
    .sort_values(['task', 'T'])
    .reset_index(drop=True)
)

Unnamed: 0,task,T,TOTAL_N,TOTAL_%,train_N,train_%,val_N,val_%,test_N,test_%
0,ARF,4,138840,0.068619,97199,0.069116,20892,0.068591,20749,0.066316
1,ARF,12,122619,0.056614,85845,0.056882,18499,0.057138,18275,0.054829
2,Shock,4,164333,0.075749,115070,0.076762,24616,0.073976,24647,0.072788
3,Shock,12,144725,0.049425,101393,0.050743,21690,0.046381,21642,0.046299
4,mortality,48,77066,0.114953,53842,0.114799,11682,0.120442,11542,0.11012


In [6]:
# Patient table, re-keyed and sorted by ICUStayID.
patients = (
    pd.read_csv(eicu_path + 'patient.csv')
    .rename(columns={'patientunitstayid': 'ICUStayID'})
    .sort_values(by='ICUStayID')
    .set_index('ICUStayID')
)

# The string '> 89' is replaced by the numeric sentinel 300 so the column can be
# cast to float (presumably so these rows rank above every real age in the
# median/quantile computations — confirm).
age_is_top_coded = patients['age'] == '> 89'
patients.loc[age_is_top_coded, 'age'] = 300
patients['age'] = patients['age'].astype(float)

In [7]:
# Plain list of IDs for every population, keyed by the same (task, T) tuples.
IDs_dict = {key: list(label_frame.index) for key, label_frame in populations.items()}

In [8]:
# Sanity check: number of IDs in the ARF / 4 population.
len(IDs_dict[('ARF', 4)])

138840

In [9]:
def table_one(IDs, task, T):
    """Summarise one study population as a flat dict of descriptive statistics.

    Reads two module-level objects: ``patients`` (patient table indexed by
    ICUStayID) and ``populations`` (label frames keyed by ``(task, T)``).

    Parameters
    ----------
    IDs : list
        IDs of the population. Must match the index of
        ``populations[task, T]`` element for element.
    task : str
        Task name; together with ``T`` it selects the label frame.
    T : int
        Second component of the ``populations`` key.

    Returns
    -------
    dict
        Keys, in insertion order: 'Population', 'N', sex counts/fractions,
        age median/Q1/Q3, LOS median/Q1/Q3, one count and one fraction per
        observed unit type, 'Outcome' / 'Outcome%', in-hospital death
        count/fraction, and one count and one fraction per observed ethnicity
        (missing ethnicity is counted as 'Other/Unknown').

    Raises
    ------
    AssertionError
        If ``IDs`` differs from the index of ``populations[task, T]``.
    """
    # Fail fast: the outcome statistics below are computed from
    # populations[task, T], so IDs must describe exactly that population.
    assert (IDs == populations[task, T].index).all()

    info = {'Population': '{}, {}'.format(task, T)}
    N = len(IDs)
    info['N'] = N

    # Rows in the order of IDs; IDs absent from `patients` become all-NaN rows.
    df_pat = patients.reindex(IDs)

    c_sex = Counter(df_pat['gender'])
    info['Sex: M'] = c_sex['Male']
    info['Sex: F'] = c_sex['Female']
    info['Sex: M%'] = c_sex['Male'] / N
    info['Sex: F%'] = c_sex['Female'] / N

    ages = df_pat['age']
    info['Age_median'] = ages.median()
    info['Age_Q1'] = ages.quantile(0.25)
    info['Age_Q3'] = ages.quantile(0.75)

    # Offsets are divided by 60 (presumably minutes -> hours; confirm against
    # the eICU schema).
    los = df_pat['unitdischargeoffset']
    info['LOS_median'] = los.median() / 60
    info['LOS_Q1'] = los.quantile(0.25) / 60
    info['LOS_Q3'] = los.quantile(0.75) / 60

    c_icu = Counter(df_pat['unittype'])
    for icu, n in c_icu.items():
        info['ICU type: {}'.format(icu)] = n
        info['ICU type: {}%'.format(icu)] = n/N

    labels = populations[task, T]
    info['Outcome'] = labels.sum().values[0]
    info['Outcome%'] = labels.mean().values[0]

    # NOTE(review): this counts the *unit* discharge status but is reported as
    # "In-hospital death" — confirm this is the intended column.
    n_expired = (df_pat['unitdischargestatus'] == 'Expired').astype(int).sum()
    info['In-hospital death'] = n_expired
    info['In-hospital death %'] = n_expired / N

    races = df_pat['ethnicity'].fillna('Other/Unknown')
    c_race = Counter(races)
    for r, n in c_race.items():
        info['Race: {}'.format(r)] = n
        info['Race: {} %'.format(r)] = n/N

    return info

In [10]:
# Populations in the column order wanted for the final table.
table_one_keys = [
    ('mortality', 48),
    ('ARF', 4),
    ('ARF', 12),
    ('Shock', 4),
    ('Shock', 12),
]
all_info = [table_one(IDs_dict[key], *key) for key in table_one_keys]

In [15]:
# One row per population first, then transpose so that statistics are rows and
# populations are columns; the result is stored with object dtype.
info_by_population = pd.DataFrame(all_info).set_index('Population')
df_TableOne = info_by_population.transpose().astype(object)

In [16]:
# Rows whose label has no '%' are cast to int; '%' rows are left as they are.
non_percent_rows = [name for name in df_TableOne.index if '%' not in name]
for name in non_percent_rows:
    df_TableOne.loc[name] = df_TableOne.loc[name].astype(int)

In [17]:
# Final row order of the table, assembled section by section.
# 'Sex: M' / 'Sex: M%' are intentionally left out.
demographic_rows = [
    'N',
    'Age_median', 'Age_Q1', 'Age_Q3',
    'Sex: F', 'Sex: F%',
]
race_rows = [
    'Race: African American', 'Race: Asian', 'Race: Caucasian',
    'Race: Hispanic', 'Race: Native American', 'Race: Other/Unknown',
    'Race: African American %', 'Race: Asian %', 'Race: Caucasian %',
    'Race: Hispanic %', 'Race: Native American %', 'Race: Other/Unknown %',
]
icu_rows = [
    'ICU type: CCU-CTICU', 'ICU type: CSICU', 'ICU type: CTICU',
    'ICU type: Cardiac ICU', 'ICU type: MICU',
    'ICU type: Med-Surg ICU', 'ICU type: Neuro ICU', 'ICU type: SICU',
    'ICU type: CCU-CTICU%', 'ICU type: CSICU%', 'ICU type: CTICU%',
    'ICU type: Cardiac ICU%', 'ICU type: MICU%',
    'ICU type: Med-Surg ICU%', 'ICU type: Neuro ICU%', 'ICU type: SICU%',
]
outcome_rows = [
    'Outcome', 'Outcome%',
    'In-hospital death', 'In-hospital death %',
    'LOS_median', 'LOS_Q1', 'LOS_Q3',
]
df_TableOne = df_TableOne.reindex(
    demographic_rows + race_rows + icu_rows + outcome_rows
)

In [18]:
# Render floats as percentages with one decimal, for this display only.
as_percent = '{:.1%}'.format
with pd.option_context('float_format', as_percent):
    display(df_TableOne)

Population,"mortality, 48","ARF, 4","ARF, 12","Shock, 4","Shock, 12"
N,77066,138840,122619,164333,144725
Age_median,66,65,65,65,65
Age_Q1,54,53,53,52,52
Age_Q3,77,77,77,76,76
Sex: F,34920,64950,57434,76229,67193
Sex: F%,45.3%,46.8%,46.8%,46.4%,46.4%
Race: African American,9083,13925,12463,18223,16572
Race: Asian,1270,2317,2045,2546,2262
Race: Caucasian,59191,107957,95416,126621,111190
Race: Hispanic,2853,5527,4919,6127,5440
