In [1]:
import yaml
# Read the project configuration that sits one directory above this notebook.
# NOTE(review): full_load can build more than plain scalars/dicts; if the config
# only holds plain values, yaml.safe_load would be the narrower choice - confirm.
with open('../config.yaml') as config_file:
    config = yaml.full_load(config_file)

# Path prefixes used by later cells; they are concatenated with '+' below, so
# they are presumably expected to end with a path separator - verify in config.
mimic3_path = config['mimic3_path']
data_path = config['data_path']

import pandas as pd
import itertools
from collections import Counter

In [2]:
# Partition assignment for every Metavision ICU stay, keyed by ICUSTAY_ID.
icustays = pd.read_csv(data_path + 'prep/icustays_MV.csv')
partition = icustays.set_index('ICUSTAY_ID')[['partition']]
tasks = ['ARF', 'Shock']
Ts = [4, 12]

# One single-column label frame per (task, T) cohort, indexed by ICUSTAY_ID.
populations = {}
for task, T in itertools.product(tasks, Ts):
    pop = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, T))
    populations[task, T] = pop.set_index('ICUSTAY_ID')[['{}_LABEL'.format(task)]]

# The mortality cohort comes from a fixed file whose key column is 'ID'.
# Its path and label column contain no placeholders, so they are written as
# plain literals instead of going through a no-op str.format call.
populations['mortality', 48] = (
    pd.read_csv(data_path + 'population/pop.mortality_benchmark.csv')
      .set_index('ID')[['mortality_LABEL']]
)

In [3]:
# Build one summary row per cohort: overall size and label rate, followed by
# the size and label rate inside each of the train / val / test partitions.
df_out = []
for (task, T), labels in populations.items():
    label_col = '{}_LABEL'.format(task)
    joined = labels.join(partition)
    split_sizes = Counter(joined['partition'])
    split_rates = joined.groupby('partition').mean()[label_col]

    row = [task, T, len(joined), joined[label_col].mean()]
    for split in ('train', 'val', 'test'):
        row.append(split_sizes[split])
        row.append(split_rates[split])
    df_out.append(row)

In [4]:
# Turn the collected rows into a labelled summary table.
summary_columns = [
    'task', 'T',
    'TOTAL_N', 'TOTAL_%',
    'train_N', 'train_%',
    'val_N', 'val_%',
    'test_N', 'test_%',
]
df_out = pd.DataFrame(df_out, columns=summary_columns)

In [5]:
# Display the summary ordered by task, then by T, with a fresh 0..n-1 index.
(
    df_out
    .sort_values(['task', 'T'])
    .reset_index(drop=True)
)

Unnamed: 0,task,T,TOTAL_N,TOTAL_%,train_N,train_%,val_N,val_%,test_N,test_%
0,ARF,4,15873,0.1827,11147,0.182291,2368,0.180743,2358,0.186599
1,ARF,12,14174,0.096515,9971,0.097282,2110,0.093365,2093,0.096034
2,Shock,4,19342,0.149416,13613,0.148241,2862,0.157582,2867,0.146843
3,Shock,12,17588,0.077553,12381,0.075923,2595,0.084008,2612,0.078867
4,mortality,48,8577,0.120205,6048,0.119544,1272,0.127358,1257,0.11615


In [6]:
# ICU stays (Metavision only), with admission/discharge times parsed as dates.
examples = pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME']).sort_values(by='ICUSTAY_ID') # Only Metavision

# Right-merge so the result has exactly the ICU stays in `examples`,
# each enriched with the matching patient-level columns.
patients = pd.read_csv(mimic3_path + 'PATIENTS.csv', parse_dates=['DOB', 'DOD']) \
             .merge(examples, on=['SUBJECT_ID'], how='right')

# Only DEATHTIME is a timestamp. HOSPITAL_EXPIRE_FLAG is consumed downstream as
# an integer indicator (cast with astype(int) and summed), so it must not be
# listed in parse_dates.
admissions = pd.read_csv(mimic3_path + 'ADMISSIONS.csv', parse_dates=['DEATHTIME']) \
               .merge(examples, on=['SUBJECT_ID', 'HADM_ID'], how='right')

# Key all three frames by ICU stay so cohorts can be selected with reindex().
examples = examples.set_index('ICUSTAY_ID')
patients = patients.set_index('ICUSTAY_ID')
admissions = admissions.set_index('ICUSTAY_ID')

In [7]:
# Age in years at INTIME: elapsed seconds between DOB and INTIME, converted
# seconds -> hours -> days -> years (365.25 days per year).
# NOTE(review): the subtraction is done row by row on scalar timestamps rather
# than on whole columns; presumably this avoids overflow for very large
# INTIME - DOB gaps - confirm before vectorising this line.
patients['AGE'] = patients.apply(lambda x: (x['INTIME'] - x['DOB']).total_seconds(), axis=1) / 3600 / 24 / 365.25

In [8]:
# For each (task, T) cohort, the list of IDs in the order of its label frame.
IDs_dict = {
    cohort_key: list(cohort_labels.index)
    for cohort_key, cohort_labels in populations.items()
}

In [9]:
# Sanity check: number of IDs in the ARF, T=4 cohort.
len(IDs_dict[('ARF', 4)])

15873

In [10]:
def table_one(IDs, task, T):
    """Collect descriptive statistics for one cohort into a flat dict.

    Parameters
    ----------
    IDs : list
        Cohort identifiers; must match ``populations[task, T].index``
        element for element (checked with an assertion).
    task, T
        Key of the cohort in the module-level ``populations`` dict; also
        used to build the 'Population' label.

    Returns
    -------
    dict
        Keys cover cohort size, sex counts/fractions, age and LOS
        median/quartiles, per-ICU-type counts/fractions, outcome
        count/rate, in-hospital deaths and per-race counts/fractions.

    Reads the module-level ``patients``, ``admissions``, ``examples`` and
    ``populations`` frames.
    """
    info = {'Population': '{}, {}'.format(task, T)}
    n_stays = len(IDs)
    info['N'] = n_stays

    # Line the three lookup frames up with the cohort's IDs.
    cohort_patients = patients.reindex(IDs)
    cohort_admissions = admissions.reindex(IDs)
    cohort_stays = examples.reindex(IDs)

    # Sex: counts first, then fractions of the cohort.
    sex_counts = Counter(cohort_patients['GENDER'])
    n_male, n_female = sex_counts['M'], sex_counts['F']
    info['Sex: M'] = n_male
    info['Sex: F'] = n_female
    info['Sex: M%'] = n_male / n_stays
    info['Sex: F%'] = n_female / n_stays

    # Age: median and inter-quartile bounds.
    age = cohort_patients['AGE']
    info['Age_median'], info['Age_Q1'], info['Age_Q3'] = (
        age.median(), age.quantile(0.25), age.quantile(0.75)
    )

    # Length of stay, scaled by 24 (presumably days -> hours; confirm units of LOS).
    stay_length = cohort_stays['LOS']
    info['LOS_median'], info['LOS_Q1'], info['LOS_Q3'] = (
        stay_length.median() * 24,
        stay_length.quantile(0.25) * 24,
        stay_length.quantile(0.75) * 24,
    )

    # First care unit: one count and one fraction per unit observed.
    for unit, unit_count in Counter(cohort_stays['FIRST_CAREUNIT']).items():
        info['ICU type: {}'.format(unit)] = unit_count
        info['ICU type: {}%'.format(unit)] = unit_count / n_stays

    # Outcome label: number of positives and positive rate.
    cohort_labels = populations[task, T]
    assert (IDs == cohort_labels.index).all()
    info['Outcome'] = cohort_labels.sum().values[0]
    info['Outcome%'] = cohort_labels.mean().values[0]

    # In-hospital mortality from the admission-level flag.
    died_in_hospital = cohort_admissions['HOSPITAL_EXPIRE_FLAG'].astype(int)
    n_died = sum(died_in_hospital)
    info['In-hospital death'] = n_died
    info['In-hospital death %'] = n_died / n_stays

    # Coarse race groups, each listing the raw ETHNICITY values it absorbs.
    race_groups = {
        'ASIAN': [
            'ASIAN',
            'ASIAN - ASIAN INDIAN',
            'ASIAN - CAMBODIAN',
            'ASIAN - CHINESE',
            'ASIAN - FILIPINO',
            'ASIAN - JAPANESE',
            'ASIAN - KOREAN',
            'ASIAN - OTHER',
            'ASIAN - THAI',
            'ASIAN - VIETNAMESE',
        ],
        'WHITE': [
            'WHITE',
            'WHITE - BRAZILIAN',
            'WHITE - EASTERN EUROPEAN',
            'WHITE - OTHER EUROPEAN',
            'WHITE - RUSSIAN',
        ],
        'AFRICAN AMERICAN/BLACK': [
            'BLACK/AFRICAN',
            'BLACK/AFRICAN AMERICAN',
            'BLACK/CAPE VERDEAN',
            'BLACK/HAITIAN',
        ],
        'HISPANIC/LATINO': [
            'HISPANIC OR LATINO',
            'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
            'HISPANIC/LATINO - COLOMBIAN',
            'HISPANIC/LATINO - CUBAN',
            'HISPANIC/LATINO - DOMINICAN',
            'HISPANIC/LATINO - GUATEMALAN',
            'HISPANIC/LATINO - HONDURAN',
            'HISPANIC/LATINO - MEXICAN',
            'HISPANIC/LATINO - PUERTO RICAN',
            'HISPANIC/LATINO - SALVADORAN',
        ],
        'other': [
            'MIDDLE EASTERN',
            'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
            'PORTUGUESE',
            'SOUTH AMERICAN',
            'CARIBBEAN ISLAND',
            'AMERICAN INDIAN/ALASKA NATIVE',
            'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE',
            'PATIENT DECLINED TO ANSWER',
            'MULTI RACE ETHNICITY',
            'UNABLE TO OBTAIN',
            'OTHER',
            'UNKNOWN/NOT SPECIFIED',
        ],
    }
    # Invert the grouping into a raw-value -> group lookup for Series.map.
    ethnicity_to_race = {
        ethnicity: race
        for race, ethnicities in race_groups.items()
        for ethnicity in ethnicities
    }
    races = cohort_admissions['ETHNICITY'].map(ethnicity_to_race)
    for race, race_count in Counter(races).items():
        info['Race: {}'.format(race)] = race_count
        info['Race: {} %'.format(race)] = race_count / n_stays

    return info

In [11]:
# Cohorts in the column order wanted for the final table: mortality first,
# then ARF and Shock at each T.
table_one_cohorts = [
    ('mortality', 48),
    ('ARF', 4),
    ('ARF', 12),
    ('Shock', 4),
    ('Shock', 12),
]
all_info = [
    table_one(IDs_dict[cohort_task, cohort_T], cohort_task, cohort_T)
    for cohort_task, cohort_T in table_one_cohorts
]

In [12]:
# One row per cohort -> transpose so each cohort becomes a column and each
# statistic a row; object dtype lets a column hold ints and floats side by side.
table_one_by_cohort = pd.DataFrame(all_info).set_index('Population')
df_TableOne = table_one_by_cohort.T.astype(object)

In [13]:
# Rows whose name has no '%' are cast to int (dropping any fractional part);
# rows with '%' in the name stay as floats for percent formatting.
non_percent_rows = [name for name in df_TableOne.index if '%' not in name]
for name in non_percent_rows:
    df_TableOne.loc[name] = df_TableOne.loc[name].astype(int)

In [14]:
# Final row order of the table, assembled section by section.
demographic_rows = [
    'N',
    'Age_median', 'Age_Q1', 'Age_Q3',
    'Sex: F', 'Sex: F%', 'Sex: M', 'Sex: M%',
]
# Only the count rows are selected for race and ICU type; the matching
# percentage rows ('Race: ... %', 'ICU type: ...%') are not included here.
race_rows = [
    'Race: AFRICAN AMERICAN/BLACK', 'Race: ASIAN', 'Race: HISPANIC/LATINO', 'Race: WHITE', 'Race: other',
]
icu_rows = [
    'ICU type: CCU', 'ICU type: CSRU', 'ICU type: MICU', 'ICU type: SICU', 'ICU type: TSICU',
]
outcome_rows = [
    'Outcome', 'Outcome%',
    'In-hospital death', 'In-hospital death %',
    'LOS_median', 'LOS_Q1', 'LOS_Q3',
]
df_TableOne = df_TableOne.reindex(demographic_rows + race_rows + icu_rows + outcome_rows)

In [15]:
# Render floats as one-decimal percentages for this display only; the option
# is restored when the context manager exits.
percent_formatter = '{:.1%}'.format
with pd.option_context('float_format', percent_formatter):
    display(df_TableOne)

Population,"mortality, 48","ARF, 4","ARF, 12","Shock, 4","Shock, 12"
N,8577,15873,14174,19342,17588
Age_median,66,65,65,64,64
Age_Q1,54,53,52,52,52
Age_Q3,78,78,78,77,77
Sex: F,3915,7202,6545,8603,7934
Sex: F%,45.6%,45.4%,46.2%,44.5%,45.1%
Sex: M,4662,8671,7629,10739,9654
Sex: M%,54.4%,54.6%,53.8%,55.5%,54.9%
Race: AFRICAN AMERICAN/BLACK,877,1895,1760,2205,2083
Race: ASIAN,227,440,388,527,468
