In [1]:
import pandas as pd 
import psycopg2
import getpass

from helpers import *

## Define the mortality cohort

In [2]:
demographics = pd.read_hdf('data.h5', key='demographics') 
labs_vitals = pd.read_hdf('data.h5', key='labs_vitals') 
vaso_mv = pd.read_hdf('data.h5', key='vaso_mv') 
vaso_cv = pd.read_hdf('data.h5', key='vaso_cv') 
mech_vent = pd.read_hdf('data.h5', key='mech_vent') 
colloid_bolus = pd.read_hdf('data.h5', key='colloid_bolus_f') 
crystalloid_bolus = pd.read_hdf('data.h5', key='crystalloid_bolus')
antibiotics = pd.read_hdf('data.h5', key='antibiotics')
diagnoses = pd.read_hdf('data.h5', key='diagnoses') 


In [3]:
labs_vitals

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom
0,4,185777,294638,2191-03-16 08:00:00,646,100,%
1,165,170252,247247,2170-10-03 18:00:00,223761,97.3,?F
2,165,170252,247247,2170-10-03 18:35:00,220277,99,%
3,165,170252,247247,2170-10-03 19:00:00,220277,99,%
4,165,170252,247247,2170-10-03 19:00:00,220339,5,cmH2O
...,...,...,...,...,...,...,...
466924,786,117381,234784,2116-01-05 06:00:00,51249,32.9,%
466925,786,117381,234784,2116-01-05 06:00:00,51250,92,fL
466926,786,117381,234784,2116-01-05 06:00:00,51265,128,K/uL
466927,786,117381,234784,2116-01-05 06:00:00,51279,3.46,m/uL


In [4]:
# include patients only on the 1st icu stay, older than 18 and with a length of stay from 1 to 10 days 
print("Number of patients hospitalized: ", len(set((demographics.subject_id))))


Number of patients hospitalized:  512


In [5]:
print("Number of patients with in hospital mortality: ", demographics.mort_hosp.value_counts())
print("Number of patients with in icu mortality: ", demographics.mort_icu.value_counts())

Number of patients with in hospital mortality:  0    468
1     44
Name: mort_hosp, dtype: int64
Number of patients with in icu mortality:  0    483
1     29
Name: mort_icu, dtype: int64


demographics.hadm_id.value_counts().sort_values()

In [6]:
subject_id_set = set(demographics['subject_id'])
hadm_id_set = set(demographics['hadm_id'])
icustay_id_set = set(demographics['icustay_id'])

## Labs and vitals 

In [7]:

var_map = get_variable_mapping("resources\itemid_to_variable_map.csv")
var_map

Unnamed: 0_level_0,LEVEL2,LEVEL1,LINKSTO
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50861,Alanine aminotransferase,Alanine aminotransferase,labevents
769,Alanine aminotransferase,Alanine aminotransferase,chartevents
220644,Alanine aminotransferase,Alanine aminotransferase,chartevents
50862,Albumin,Albumin,labevents
772,Albumin,Albumin,chartevents
...,...,...,...
40473,Urine output,Urine output,outputevents
40715,Urine output,Urine output,outputevents
43175,Urine output,Urine output,outputevents
226559,Urine output,Urine output (foley),outputevents


In [8]:
chartitems_to_keep = set(var_map.loc[var_map['LINKSTO'] == 'chartevents'].index)
#chartitems_to_keep = set([ str(i) for i in chartitems_to_keep ])

labitems_to_keep = set(var_map.loc[var_map['LINKSTO'] == 'labevents'].index)


### Preprocess labs and vitals 

#### hourly buckets

In [9]:
# the value is str, convert to numeric
labs_vitals['value'] = pd.to_numeric(labs_vitals['value'], 'coerce')
#ID_COLS = ['subject_id', 'hadm_id', 'icustay_id']
#ID_COLS = ['subject_id', 'hadm_id']
#ITEM_COLS = ['itemid', 'label', 'LEVEL1', 'LEVEL2']
#X = X.astype({k: int for k in ID_COLS})


# join and add in labs_vital the icu intime and outime, to separate in hourly buckets
demographics = demographics.set_index("icustay_id")
labs_vitals = labs_vitals.set_index('icustay_id').join(demographics[['intime', 'outtime']])


to_hours = lambda x: max(0, x.days*24 + x.seconds // 3600)

#divide into hourly buckers from intime to chartime 
#labs_vitals['hourly_buckets'] = (labs_vitals['charttime']-labs_vitals['intime'])/pd.Timedelta(minutes=60)
#labs_vitals['hourly_buckets'] = (labs_vitals['charttime']-labs_vitals['intime'])/3600
#labs_vitals['hourly_buckets'] = labs_vitals['hourly_buckets'].apply(lambda x: np.max(x))

labs_vitals['hours_in'] = (labs_vitals['charttime'] - labs_vitals['intime']).apply(to_hours)

In [10]:
labs_vitals.hours_in.min()

0

#### aggregate same item_ids according to the extract_mimic_paper

In [11]:
labs_vitals = labs_vitals.sort_values(by=['subject_id', 'charttime'])

# to filter the itemids table
itemids = set(labs_vitals.itemid.astype(str))

In [12]:
labs_vitals_copy = labs_vitals.copy(deep=True)

In [13]:


connection = psycopg2.connect(
    user = 'postgres',
    database="mimic",
    password=getpass.getpass("Enter postgres password"), 
    host="127.0.0.1", 
    port = "5433",
    options=f'-c search_path=mimiciii')




query_d_items = \
"""
SELECT itemid, label, dbsource, linksto, category, unitname
FROM d_items
WHERE itemid in ({itemids})
;
""".format(itemids=','.join(itemids))
items_ids = pd.read_sql_query(query_d_items, connection).set_index('itemid')


In [14]:
labs_vitals.drop(columns=['charttime', 'intime', 'outtime'], inplace=True)
labs_vitals.set_index('itemid', append=True, inplace=True)

#labs_vitals = labs_vitals.join(var_map).join(items_ids).set_index(['label', 'LEVEL1', 'LEVEL2'], append=True)
#standardize_units(X, name_col='LEVEL1', inplace=True)
#X = apply_variable_limits(X, var_ranges, 'LEVEL2')
labs_vitals = labs_vitals.join(var_map).join(items_ids).set_index(['label', 'LEVEL2'], append=True)

In [15]:
labs_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,subject_id,hadm_id,value,valueuom,hours_in,LEVEL1,LINKSTO,dbsource,linksto,category,unitname
icustay_id,itemid,label,LEVEL2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
211552,50868,,Anion gap,3,145834,17.0,mEq/L,0,Anion gap,labevents,,,,
211552,50882,,Bicarbonate,3,145834,25.0,mEq/L,0,Bicarbonate,labevents,,,,
211552,50893,,Calcium,3,145834,8.2,mg/dL,0,Calcium (total),labevents,,,,
211552,50902,,Chloride,3,145834,99.0,mEq/L,0,Chloride,labevents,,,,
211552,50912,,Creatinine,3,145834,3.2,mg/dL,0,Creatinine,labevents,,,,


In [16]:
# get the names of the aggregated labs and vitals
labs_vitals_names = list(labs_vitals.index.get_level_values('LEVEL2'))
labs_vitals = labs_vitals.groupby(['subject_id', 'hadm_id', 'icustay_id'] + ['LEVEL2'] + ['hours_in']).agg(['mean', 'std', 'count'])

In [17]:
labs_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,count
subject_id,hadm_id,icustay_id,LEVEL2,hours_in,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,145834,211552,Alanine aminotransferase,0,25.0,0.0,2
3,145834,211552,Alanine aminotransferase,32,20.0,0.0,2
3,145834,211552,Albumin,0,1.8,0.0,2
3,145834,211552,Alkaline phosphate,0,73.0,0.0,2
3,145834,211552,Alkaline phosphate,32,89.0,0.0,2


In [18]:
labs_vitals.columns = labs_vitals.columns.droplevel(0)

labs_vitals.columns.names = ['Aggregated']

demographics['max_hours'] = (demographics['outtime'] - demographics['intime']).apply(to_hours)

In [19]:
labs_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Aggregated,mean,std,count
subject_id,hadm_id,icustay_id,LEVEL2,hours_in,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,145834,211552,Alanine aminotransferase,0,25.0,0.0,2
3,145834,211552,Alanine aminotransferase,32,20.0,0.0,2
3,145834,211552,Albumin,0,1.8,0.0,2
3,145834,211552,Alkaline phosphate,0,73.0,0.0,2
3,145834,211552,Alkaline phosphate,32,89.0,0.0,2


In [20]:
demographics.head()

Unnamed: 0_level_0,subject_id,hadm_id,dob,dod,gender,ethnicity,diagnosis,intime,icu_length_of_stay,age,death_after_icu_hours,hospital_expire_flag,outtime,mort_icu,mort_hosp,max_hours
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
211552,3,145834,2025-04-11,2102-06-14,M,WHITE,HYPOTENSION,2101-10-20 19:10:11,6.0,76.734698,,0,2101-10-26 20:43:09,0,0,145
294638,4,185777,2143-05-12,NaT,F,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-16 00:29:31,1.0,47.976345,,0,2191-03-17 16:46:31,0,0,40
228232,6,107064,2109-06-21,NaT,F,WHITE,CHRONIC RENAL FAILURE/SDA,2175-05-30 21:30:54,4.0,66.120876,,0,2175-06-03 13:39:54,0,0,88
220597,9,150750,2108-01-26,2149-11-14,M,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,2149-11-09 13:07:02,5.0,41.90346,5.0,1,2149-11-14 20:52:14,1,1,127
229441,11,194540,2128-02-22,2178-11-14,F,WHITE,BRAIN MASS,2178-04-16 06:19:32,1.0,50.28525,,0,2178-04-17 20:21:05,0,0,38


#### Pivot the table

In [21]:
#missing_hours_fill = range_unnest(demographics, 'max_hours', out_col_name='hours_in', reset_index=True)
#missing_hours_fill['tmp'] = np.NaN


#fill_df = demographics.reset_index()[['subject_id', 'hadm_id', 'icustay_id']].join(missing_hours_fill.set_index('icustay_id'), on='icustay_id')
#fill_df.set_index(['subject_id', 'hadm_id', 'icustay_id'] + ['hours_in'], inplace=True)

# Pivot table droups NaN columns so you lose any uniformly NaN.
labs_vitals = labs_vitals.unstack(level = ['LEVEL2'])
labs_vitals.columns = labs_vitals.columns.reorder_levels(order=['LEVEL2'] + ['Aggregated'])
   
#X = X.reindex(fill_df.index)


labs_vitals = labs_vitals.sort_index(axis=0).sort_index(axis=1)


#### Drop columns few recordings

In [22]:
threshold = 0.9
columns_to_drop = []
for column_name in labs_vitals.columns:
    #print(column_name)
    if column_name[1] == 'mean':
        #get the only the means of the vital_labs
        if labs_vitals[column_name].isnull().mean() > threshold:
            columns_to_drop.append(column_name[0])

labs_vitals_red = labs_vitals.drop(columns=columns_to_drop)

#### Apply variable limits - outliers

## Vasopressors 

In [58]:
# add the remaining patients that did not have vasopressors with label in has_vasopressors = 0 
demographics.reset_index(inplace=True)
patients_missing_vaso = demographics[demographics.subject_id.isin(vaso_cv.subject_id)][['subject_id', 'hadm_id', 'icustay_id']]

patients_missing_vaso.head()

Unnamed: 0,subject_id,hadm_id,icustay_id
0,3,145834,211552
3,9,150750,220597
6,13,143045,263738
7,17,194023,277042
10,20,157681,264490


In [59]:
demographics.head()

Unnamed: 0,icustay_id,subject_id,hadm_id,dob,dod,gender,ethnicity,diagnosis,intime,icu_length_of_stay,age,death_after_icu_hours,hospital_expire_flag,outtime,mort_icu,mort_hosp,max_hours
0,211552,3,145834,2025-04-11,2102-06-14,M,WHITE,HYPOTENSION,2101-10-20 19:10:11,6.0,76.734698,,0,2101-10-26 20:43:09,0,0,145
1,294638,4,185777,2143-05-12,NaT,F,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-16 00:29:31,1.0,47.976345,,0,2191-03-17 16:46:31,0,0,40
2,228232,6,107064,2109-06-21,NaT,F,WHITE,CHRONIC RENAL FAILURE/SDA,2175-05-30 21:30:54,4.0,66.120876,,0,2175-06-03 13:39:54,0,0,88
3,220597,9,150750,2108-01-26,2149-11-14,M,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,2149-11-09 13:07:02,5.0,41.90346,5.0,1,2149-11-14 20:52:14,1,1,127
4,229441,11,194540,2128-02-22,2178-11-14,F,WHITE,BRAIN MASS,2178-04-16 06:19:32,1.0,50.28525,,0,2178-04-17 20:21:05,0,0,38


In [60]:
vaso_cv_wip = vaso_cv.append(patients_missing_vaso)

In [61]:
vaso_cv_wip

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,charttime,rate,rate_std,has_vasopressors
0,603,104325.0,200168,30128.0,2112-09-02 17:00:00,0.3,0.135,1.0
1,603,104325.0,200168,30128.0,2112-09-02 17:15:00,0.0,0.000,1.0
2,603,104325.0,200168,30128.0,2112-09-02 17:30:00,0.0,0.000,1.0
3,603,104325.0,200168,30128.0,2112-09-02 17:45:00,0.0,0.000,1.0
4,603,104325.0,200168,30128.0,2112-09-02 18:00:00,0.0,0.000,1.0
...,...,...,...,...,...,...,...,...
493,752,140333.0,234921,,NaT,,,
503,772,186398.0,252775,,NaT,,,
505,776,102794.0,220142,,NaT,,,
509,782,125662.0,264637,,NaT,,,


In [62]:
vaso_cv_wip = vaso_cv_wip.sort_values(["subject_id", "charttime"])
#vaso_cv_wip.reset_index(drop=True)
vaso_cv_wip

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,charttime,rate,rate_std,has_vasopressors
710,3,145834.0,211552,30043.0,2101-10-20 21:00:00,20.000,0.200,1.0
740,3,145834.0,211552,30128.0,2101-10-20 21:00:00,0.616,0.277,1.0
711,3,145834.0,211552,30043.0,2101-10-20 21:30:00,10.000,0.100,1.0
712,3,145834.0,211552,30043.0,2101-10-20 22:00:00,4.000,0.040,1.0
741,3,145834.0,211552,30128.0,2101-10-20 22:00:00,0.620,0.279,1.0
...,...,...,...,...,...,...,...,...
2915,785,192508.0,228499,30128.0,2144-03-01 06:00:00,0.000,0.000,1.0
2916,785,192508.0,228499,30128.0,2144-03-01 07:00:00,0.500,0.225,1.0
2917,785,192508.0,228499,30128.0,2144-03-01 07:30:00,0.000,0.000,1.0
2918,785,192508.0,228499,30128.0,2144-03-01 08:00:00,0.000,0.000,1.0


In [63]:
vaso_cv_wip.has_vasopressors.isna().value_counts()

False    8421
True      178
Name: has_vasopressors, dtype: int64

In [64]:
vaso_cv_wip['has_vasopressors'] = vaso_cv_wip['has_vasopressors'].fillna(0)
vaso_cv_wip

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,charttime,rate,rate_std,has_vasopressors
710,3,145834.0,211552,30043.0,2101-10-20 21:00:00,20.000,0.200,1.0
740,3,145834.0,211552,30128.0,2101-10-20 21:00:00,0.616,0.277,1.0
711,3,145834.0,211552,30043.0,2101-10-20 21:30:00,10.000,0.100,1.0
712,3,145834.0,211552,30043.0,2101-10-20 22:00:00,4.000,0.040,1.0
741,3,145834.0,211552,30128.0,2101-10-20 22:00:00,0.620,0.279,1.0
...,...,...,...,...,...,...,...,...
2915,785,192508.0,228499,30128.0,2144-03-01 06:00:00,0.000,0.000,1.0
2916,785,192508.0,228499,30128.0,2144-03-01 07:00:00,0.500,0.225,1.0
2917,785,192508.0,228499,30128.0,2144-03-01 07:30:00,0.000,0.000,1.0
2918,785,192508.0,228499,30128.0,2144-03-01 08:00:00,0.000,0.000,1.0


In [65]:
demographics = demographics.set_index('icustay_id')
vaso_cv_wip = vaso_cv_wip.set_index('icustay_id').join(demographics[['intime', 'outtime']])
vaso_cv_wip['hours_in'] = (vaso_cv_wip['charttime'] - vaso_cv_wip['intime']).apply(to_hours)

In [66]:
vaso_cv_wip = vaso_cv_wip.sort_values(["subject_id", "charttime"])
vaso_cv_wip

Unnamed: 0_level_0,subject_id,hadm_id,itemid,charttime,rate,rate_std,has_vasopressors,intime,outtime,hours_in
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
211552,3,145834.0,30043.0,2101-10-20 21:00:00,20.000,0.200,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,1
211552,3,145834.0,30128.0,2101-10-20 21:00:00,0.616,0.277,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,1
211552,3,145834.0,30043.0,2101-10-20 21:30:00,10.000,0.100,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,2
211552,3,145834.0,30043.0,2101-10-20 22:00:00,4.000,0.040,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,2
211552,3,145834.0,30128.0,2101-10-20 22:00:00,0.620,0.279,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,2
...,...,...,...,...,...,...,...,...,...,...
228499,785,192508.0,30128.0,2144-03-01 06:00:00,0.000,0.000,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,115
228499,785,192508.0,30128.0,2144-03-01 07:00:00,0.500,0.225,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,116
228499,785,192508.0,30128.0,2144-03-01 07:30:00,0.000,0.000,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,117
228499,785,192508.0,30128.0,2144-03-01 08:00:00,0.000,0.000,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,117


In [67]:
vaso_cv_wip = vaso_cv_wip.reset_index()
vaso_cv_wip

Unnamed: 0,icustay_id,subject_id,hadm_id,itemid,charttime,rate,rate_std,has_vasopressors,intime,outtime,hours_in
0,211552,3,145834.0,30043.0,2101-10-20 21:00:00,20.000,0.200,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,1
1,211552,3,145834.0,30128.0,2101-10-20 21:00:00,0.616,0.277,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,1
2,211552,3,145834.0,30043.0,2101-10-20 21:30:00,10.000,0.100,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,2
3,211552,3,145834.0,30043.0,2101-10-20 22:00:00,4.000,0.040,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,2
4,211552,3,145834.0,30128.0,2101-10-20 22:00:00,0.620,0.279,1.0,2101-10-20 19:10:11,2101-10-26 20:43:09,2
...,...,...,...,...,...,...,...,...,...,...,...
8594,228499,785,192508.0,30128.0,2144-03-01 06:00:00,0.000,0.000,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,115
8595,228499,785,192508.0,30128.0,2144-03-01 07:00:00,0.500,0.225,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,116
8596,228499,785,192508.0,30128.0,2144-03-01 07:30:00,0.000,0.000,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,117
8597,228499,785,192508.0,30128.0,2144-03-01 08:00:00,0.000,0.000,1.0,2144-02-25 10:09:53,2144-03-04 13:30:27,117


In [68]:
#labs_vitals = labs_vitals.unstack(level = ['LEVEL2'])
#vaso_cv_wip.columns = vaso_cv_wip.columns.reorder_levels()
   
#X = X.reindex(fill_df.index)


#vaso_cv_wip = vaso_cv_wip.sort_index(axis=0).sort_index(axis=1)

vaso_cv_wip.drop('rate', axis=1, inplace=True)
vaso_cv_wip.set_index('subject_id', inplace=True)
has_vasopressors = vaso_cv_wip.has_vasopressors
vaso_cv_wip.reset_index(inplace=True)

vaso_cv_wip.drop('has_vasopressors', axis=1, inplace=True)
# does it make sense to take the mean of a rate?
X = vaso_cv_wip.groupby(['subject_id', 'hadm_id', 'icustay_id'] +  ['itemid'] + ['hours_in']).agg(['mean', 'std', 'count'])


In [96]:
X.index[::]
#Y = X.reset_index()
Y = Y.set_index('subject_id').join(has_vasopressors)

MultiIndex([(  3, 145834.0, 211552,   1),
            (  3, 145834.0, 211552,   2),
            (  3, 145834.0, 211552,   3),
            (  3, 145834.0, 211552,   4),
            (  3, 145834.0, 211552,   5),
            (  3, 145834.0, 211552,   6),
            (  3, 145834.0, 211552,   7),
            (  3, 145834.0, 211552,   8),
            (  3, 145834.0, 211552,   9),
            (  3, 145834.0, 211552,  10),
            ...
            (785, 192508.0, 228499, 108),
            (785, 192508.0, 228499, 109),
            (785, 192508.0, 228499, 110),
            (785, 192508.0, 228499, 111),
            (785, 192508.0, 228499, 112),
            (785, 192508.0, 228499, 113),
            (785, 192508.0, 228499, 114),
            (785, 192508.0, 228499, 115),
            (785, 192508.0, 228499, 116),
            (785, 192508.0, 228499, 117)],
           names=['subject_id', 'hadm_id', 'icustay_id', 'hours_in'], length=4560)

In [70]:
has_vasopressors.value_counts()

1.0    8421
0.0     178
Name: has_vasopressors, dtype: int64

In [71]:
X.columns = X.columns.droplevel(0)

X.columns.names = ['Aggregated']

X = X.unstack(level = ['itemid'])
X.columns = X.columns.reorder_levels(order=['itemid'] + ['Aggregated'])
   
#X = X.reindex(fill_df.index)


X = X.sort_index(axis=0).sort_index(axis=1)
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemid,30043.0,30043.0,30043.0,30047.0,30047.0,30047.0,30051.0,30051.0,30051.0,30119.0,30119.0,30119.0,30120.0,30120.0,30120.0,30127.0,30127.0,30127.0,30128.0,30128.0,30128.0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregated,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834.0,211552,1,1.0,0.20,,,,,,,,,...,,,,,,,,1.0,0.277,
3,145834.0,211552,2,2.0,0.07,0.042426,,,,,,,,...,,,,,,,,1.0,0.279,
3,145834.0,211552,3,,,,,,,,,,,...,,,,,,,,1.0,0.279,
3,145834.0,211552,4,,,,,,,,,,,...,,1.0,0.103,,,,,1.0,0.279,
3,145834.0,211552,5,,,,,,,,,,,...,,1.0,0.103,,,,,1.0,0.279,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,192508.0,228499,113,,,,,,,,,,,...,,,,,,,,1.0,0.113,
785,192508.0,228499,114,,,,,,,,,,,...,,,,,,,,1.0,0.113,
785,192508.0,228499,115,,,,,,,,,,,...,,,,,,,,1.0,0.000,
785,192508.0,228499,116,,,,,,,,,,,...,,,,,,,,1.0,0.225,


## Mechanical Ventilation

## Crystalloid and colloid bolus fluids

## Antiobiotics

## Diagnoses

## Analysis and Plotting

In [None]:
#var_names = list(X.columns.get_level_values('LEVEL2'))
#plot_variable_histograms(var_names, X)
var_names = ['White blood cell count']
#plot_variable_histograms(var_names, X)

#############
# Print the total proportions!
rows, vars = labs_vitals_red.shape
print('')
for l, vals in labs_vitals_red.iteritems():
    ratio = 1.0 * vals.dropna().count() / rows
    print(str(l) + ': ' + str(round(ratio, 3)*100) + '% present')
"""
#############
# Print the per subject proportions!
df = X.groupby(['subject_id']).count()
for k in [1, 2, 3]:
    print('% of subjects had at least ' + str(k) + ' present')
    d = df > k
    d = d.sum(axis=0)
    d = d / len(df)
    d = d.reset_index()
    for index, row in d.iterrows():
        print(str(index) + ': ' + str(round(row[0], 3)*100) + '%')
    print('\n')
"""

print('Done!')