In [None]:
import pandas as pd 
import psycopg2
import getpass
from helpers import *

## Define the mortality cohort

In [None]:
# Load every input table from the same HDF5 file, in a fixed order.
# NOTE(review): the colloid table is read from key 'colloid_bolus_f', unlike
# the other keys, which match their variable names - confirm this is intended.
(demographics, labs_vitals, vaso_mv, vaso_cv, mech_vent,
 colloid_bolus, crystalloid_bolus, antibiotics, diagnoses) = (
    pd.read_hdf('data.h5', key=table_key)
    for table_key in (
        'demographics', 'labs_vitals', 'vaso_mv', 'vaso_cv', 'mech_vent',
        'colloid_bolus_f', 'crystalloid_bolus', 'antibiotics', 'diagnoses',
    )
)


In [None]:
labs_vitals

In [None]:
# Cohort size: number of distinct subject_id values in the demographics table.
# NOTE(review): nothing is filtered in this cell; the inclusion criteria (first
# ICU stay, older than 18, length of stay from 1 to 10 days) are presumably
# already applied to the stored table - confirm.
n_patients = len(set(demographics['subject_id']))
print("Number of patients hospitalized: ", n_patients)


In [None]:
# Frequency of each mortality flag value (one row per distinct value).
hospital_mortality_counts = demographics['mort_hosp'].value_counts()
icu_mortality_counts = demographics['mort_icu'].value_counts()
print("Number of patients with in hospital mortality: ", hospital_mortality_counts)
print("Number of patients with in icu mortality: ", icu_mortality_counts)

In [None]:
demographics.hadm_id.value_counts().sort_values()

In [None]:
# Distinct identifiers present in the cohort, one set per id column.
subject_id_set, hadm_id_set, icustay_id_set = (
    set(demographics[id_column])
    for id_column in ('subject_id', 'hadm_id', 'icustay_id')
)

## Labs and vitals 

In [None]:

# Load the itemid -> variable mapping table.
# The path uses a forward slash so it resolves on every platform; a backslash
# separator only works on Windows and puts an invalid escape sequence ("\i")
# inside a normal string literal.
var_map = get_variable_mapping("resources/itemid_to_variable_map.csv")
var_map

In [None]:
# Split the mapped item ids (the index of var_map) by the table they link to.
links_to_chartevents = var_map['LINKSTO'] == 'chartevents'
links_to_labevents = var_map['LINKSTO'] == 'labevents'

chartitems_to_keep = set(var_map[links_to_chartevents].index)
labitems_to_keep = set(var_map[links_to_labevents].index)


### Preprocess labs and vitals 

#### hourly buckets

In [None]:
def to_hours(x):
    """Return the whole hours contained in the Timedelta ``x``, never below 0.

    NOTE(review): a missing (NaT) difference appears to come out as 0, because
    ``max(0, nan)`` returns 0 - confirm that is the intended treatment.
    """
    whole_hours = x.days*24 + x.seconds // 3600
    return max(0, whole_hours)


# The raw values are strings; anything that is not a number becomes NaN.
labs_vitals['value'] = pd.to_numeric(labs_vitals['value'], errors='coerce')

# Attach each ICU stay's intime/outtime to its measurements so that every
# measurement can be placed in an hourly bucket relative to ICU admission.
demographics = demographics.set_index("icustay_id")
icu_stay_window = demographics[['intime', 'outtime']]
labs_vitals = labs_vitals.set_index('icustay_id').join(icu_stay_window)

# Hours elapsed between ICU intime and the chart time of the measurement.
time_since_intime = labs_vitals['charttime'] - labs_vitals['intime']
labs_vitals['hours_in'] = time_since_intime.apply(to_hours)

In [None]:
labs_vitals.hours_in.min()

#### aggregate same item_ids according to the extract_mimic_paper

In [None]:
# Order the measurements per patient and chronologically.
labs_vitals = labs_vitals.sort_values(['subject_id', 'charttime'])

# Distinct item ids, as strings, used to restrict the d_items lookup.
itemids = set(labs_vitals['itemid'].astype(str))

In [None]:
# Independent snapshot of labs_vitals at this stage (charttime, intime, outtime
# and hours_in are still present as columns here).
labs_vitals_copy = labs_vitals.copy(deep=True)

In [None]:


# Connect to the local MIMIC-III database; the password is prompted for so it
# never ends up in the notebook.
connection = psycopg2.connect(
    user='postgres',
    database="mimic",
    password=getpass.getpass("Enter postgres password"),
    host="127.0.0.1",
    port="5433",
    options='-c search_path=mimiciii')

# The item ids are bound as a query parameter instead of being pasted into the
# SQL text: psycopg2 adapts a Python list to a PostgreSQL array, and
# "= ANY(array)" also stays valid SQL when the list is empty (an empty
# "IN ()" is a syntax error).
query_d_items = \
"""
SELECT itemid, label, dbsource, linksto, category, unitname
FROM d_items
WHERE itemid = ANY(%(itemids)s)
;
"""
# itemids holds the ids as strings; convert them back to integers for binding.
itemid_params = sorted(int(float(item_id)) for item_id in itemids)

try:
    items_ids = pd.read_sql_query(
        query_d_items, connection, params={'itemids': itemid_params}
    ).set_index('itemid')
finally:
    # Release the database connection even if the query fails.
    connection.close()


In [None]:
# Drop the timestamps (hours_in already encodes the time), add itemid to the
# index, attach the variable mapping and the d_items metadata on it, and
# finally move label and LEVEL2 into the index as well.
labs_vitals = (
    labs_vitals
    .drop(columns=['charttime', 'intime', 'outtime'])
    .set_index('itemid', append=True)
    .join(var_map)
    .join(items_ids)
    .set_index(['label', 'LEVEL2'], append=True)
)

In [None]:
labs_vitals.head()

In [None]:
# LEVEL2 name of every row (one entry per row, so names repeat).
labs_vitals_names = list(labs_vitals.index.get_level_values('LEVEL2'))

# Mean, standard deviation and count per stay, aggregated variable and hour.
# NOTE(review): every remaining column is aggregated, not just 'value'; if
# text columns from the joins are still present this may fail or drop them
# depending on the pandas version - confirm.
hourly_variable_keys = ['subject_id', 'hadm_id', 'icustay_id', 'LEVEL2', 'hours_in']
labs_vitals = labs_vitals.groupby(hourly_variable_keys).agg(['mean', 'std', 'count'])

In [None]:
labs_vitals.head()

In [None]:
# Keep only the statistic names as column labels and name that level.
labs_vitals.columns = labs_vitals.columns.droplevel(0).set_names('Aggregated')

# Length of each ICU stay in whole hours.
icu_stay_duration = demographics['outtime'] - demographics['intime']
demographics['max_hours'] = icu_stay_duration.apply(to_hours)

In [None]:
labs_vitals.head()

In [None]:
demographics.head()

#### Pivot the table

In [None]:
# Spread the aggregated variables (LEVEL2) across the columns so that each row
# is one (subject, admission, stay, hour) and each column one
# (variable, statistic) pair, then sort both axes.
# The reindexing onto a dense grid of hours per stay (range_unnest / fill_df)
# is not applied here, so hours without any measurement have no row.
# NOTE(review): the original comment warns that pivoting drops uniformly-NaN
# columns - confirm whether that matters for unstack.
labs_vitals = (
    labs_vitals
    .unstack(level='LEVEL2')
    .reorder_levels(['LEVEL2', 'Aggregated'], axis=1)
    .sort_index(axis=0)
    .sort_index(axis=1)
)


In [None]:
labs_vitals

#### Drop columns with few recordings

In [None]:
# A variable is dropped (with all of its statistics) when the share of missing
# values in its 'mean' column exceeds the threshold.
threshold = 0.9
columns_to_drop = [
    variable
    for variable, statistic in labs_vitals.columns
    if statistic == 'mean'
    and labs_vitals[(variable, statistic)].isnull().mean() > threshold
]

labs_vitals_red = labs_vitals.drop(columns=columns_to_drop)

#### Apply variable limits - outliers

## Vasopressors 

In [None]:
# Patients of the cohort with no row in vaso_cv.
# NOTE(review): the stated goal is to add them with has_vasopressors = 0, but
# in the cells visible here this frame is only displayed - confirm.
demographics = demographics.reset_index()
has_no_vaso_record = ~demographics['subject_id'].isin(vaso_cv['subject_id'])
patients_missing_vaso = demographics.loc[
    has_no_vaso_record, ['subject_id', 'hadm_id', 'icustay_id']
]

patients_missing_vaso.head()

In [None]:
demographics.head()

In [None]:
#vaso_cv_wip = pd.DataFrame([])
#vaso_cv_wip = vaso_cv.append(patients_missing_vaso)

In [None]:
patients_missing_vaso

In [None]:
# Working copy of the vasopressor records, ordered per patient and in time.
vaso_cv_wip = vaso_cv.sort_values(by=["subject_id", "charttime"])
vaso_cv_wip

In [None]:
# Treat a missing vasopressor flag as 0.
vaso_flag_filled = vaso_cv_wip['has_vasopressors'].fillna(0)
vaso_cv_wip['has_vasopressors'] = vaso_flag_filled
vaso_cv_wip

In [None]:
# Attach the ICU stay window to every vasopressor record and derive the whole
# hours elapsed since ICU intime.
demographics = demographics.set_index('icustay_id')
vaso_cv_wip = (
    vaso_cv_wip
    .set_index('icustay_id')
    .join(demographics[['intime', 'outtime']])
)
vaso_time_since_intime = vaso_cv_wip['charttime'] - vaso_cv_wip['intime']
vaso_cv_wip['hours_in'] = vaso_time_since_intime.apply(to_hours)

In [None]:
# Restore the per-patient chronological order after the join.
vaso_cv_wip = vaso_cv_wip.sort_values(by=["subject_id", "charttime"])
vaso_cv_wip

In [None]:
# Turn icustay_id back into an ordinary column.
vaso_cv_wip = vaso_cv_wip.reset_index(drop=False)
vaso_cv_wip

In [None]:
vaso_cv_wip.has_vasopressors.value_counts()

In [None]:
# NOTE(review): 'rate' and 'has_vasopressors' are removed before aggregating,
# so the statistics below are computed on the columns that remain (charttime,
# intime and outtime are still present at this point), not on the infusion
# rate. The open question "does it make sense to take the mean of a rate?"
# suggests the rate was meant to be aggregated - confirm the intent.
vaso_cv_wip = vaso_cv_wip.drop(columns=['rate', 'has_vasopressors'])

# Round trip through the index: leaves a fresh default index with subject_id
# as the first column.
vaso_cv_wip = vaso_cv_wip.set_index('subject_id').reset_index()

# Mean, standard deviation and count per stay, vasopressor item and hour.
vaso_group_keys = ['subject_id', 'hadm_id', 'icustay_id', 'itemid', 'hours_in']
X = vaso_cv_wip.groupby(vaso_group_keys).agg(['mean', 'std', 'count'])


In [None]:
X

In [None]:
# Keep only the statistic names as column labels and name that level.
X.columns = X.columns.droplevel(0).set_names('Aggregated')

# One column per (itemid, statistic) pair, one row per stay and hour; both
# axes sorted.
X = (
    X
    .unstack(level='itemid')
    .reorder_levels(['itemid', 'Aggregated'], axis=1)
    .sort_index(axis=0)
    .sort_index(axis=1)
)
X

In [None]:
# An item is dropped (with all of its statistics) when the share of missing
# values in its 'mean' column exceeds the threshold.
threshold = 0.99
columns_to_drop = [
    item
    for item, statistic in X.columns
    if statistic == 'mean'
    and X[(item, statistic)].isnull().mean() > threshold
]

X_red = X.drop(columns=columns_to_drop)

In [None]:
X_red

In [None]:
#############
# Print the total proportions: for every retained column, the share of rows
# that hold a value.
# DataFrame.items() is used because iteritems() no longer exists in
# pandas >= 2.0, and the column count is not bound to `vars`, which would
# shadow the builtin of that name.
rows = X_red.shape[0]
print('')
for column_label, column_values in X_red.items():
    ratio = 1.0 * column_values.count() / rows
    # Format the percentage directly so no floating point noise is printed
    # (e.g. 56.7 rather than 56.699999999999996).
    print(str(column_label) + ': ' + '{:.1f}'.format(ratio * 100) + '% present')

## Mechanical Ventilation

In [None]:
mech_vent

In [None]:
# Patients of the cohort with no row in mech_vent.
# NOTE(review): in the cells visible here this frame is only displayed, never
# merged back - confirm whether these patients should be added.
demographics = demographics.reset_index()
has_no_mech_vent_record = ~demographics['subject_id'].isin(mech_vent['subject_id'])
patients_missing_mech_vent = demographics.loc[
    has_no_mech_vent_record, ['subject_id', 'hadm_id', 'icustay_id']
]

patients_missing_mech_vent.head()

In [None]:
# Working copy of the ventilation records, ordered per patient and in time.
mech_vent_wip = mech_vent.sort_values(by=["subject_id", "charttime"])
mech_vent_wip.head()

In [None]:
# Attach the ICU stay window to every ventilation record and derive the whole
# hours elapsed since ICU intime.
demographics = demographics.set_index('icustay_id')
mech_vent_wip = (
    mech_vent_wip
    .set_index('icustay_id')
    .join(demographics[['intime', 'outtime']])
)
mech_vent_time_since_intime = mech_vent_wip['charttime'] - mech_vent_wip['intime']
mech_vent_wip['hours_in'] = mech_vent_time_since_intime.apply(to_hours)
mech_vent_wip

In [None]:
# Restore the per-patient chronological order after the join.
mech_vent_wip = mech_vent_wip.sort_values(by=["subject_id", "charttime"])
mech_vent_wip

In [None]:
# Minimum and count of every remaining column per stay and hour.
# NOTE(review): only intime/outtime are dropped, so charttime is aggregated
# along with the ventilation columns - confirm that is wanted.
mech_vent_wip = (
    mech_vent_wip
    .reset_index()
    .drop(columns=['intime', 'outtime'])
    .groupby(['subject_id', 'hadm_id', 'icustay_id', 'hours_in'])
    .agg(['min', 'count'])
)
mech_vent_wip


## Crystalloid and colloid bolus fluids

In [None]:
# here we have listed the amount of a drug or 
# substance administered to the patient either between 
# the STARTTIME and ENDTIME (if both are available) or at the ENDTIME 
colloid_bolus

In [None]:
# Patients of the cohort with no row in colloid_bolus.
# NOTE(review): in the cells visible here this frame is never used afterwards.
demographics = demographics.reset_index()
has_no_colloid_bolus = ~demographics['subject_id'].isin(colloid_bolus['subject_id'])
patients_missing_colloid_bolus = demographics.loc[
    has_no_colloid_bolus, ['subject_id', 'hadm_id', 'icustay_id']
]

# Attach the ICU stay window to every bolus record and derive the whole hours
# elapsed since ICU intime.
demographics = demographics.set_index('icustay_id')
colloid_bolus_wip = (
    colloid_bolus
    .sort_values(["subject_id", "charttime"])
    .set_index('icustay_id')
    .join(demographics[['intime', 'outtime']])
)
colloid_time_since_intime = colloid_bolus_wip['charttime'] - colloid_bolus_wip['intime']
colloid_bolus_wip['hours_in'] = colloid_time_since_intime.apply(to_hours)
colloid_bolus_wip



In [None]:
# Mean, standard deviation and count of every remaining column per stay and
# hour.
# NOTE(review): only intime/outtime are dropped, so charttime is aggregated as
# well - confirm that is wanted.
colloid_bolus_wip = (
    colloid_bolus_wip
    .sort_values(["subject_id", "charttime"])
    .reset_index()
    .drop(columns=['intime', 'outtime'])
    .groupby(['subject_id', 'hadm_id', 'icustay_id', 'hours_in'])
    .agg(['mean', 'std', 'count'])
)
colloid_bolus_wip

In [None]:

# Patients of the cohort with no row in crystalloid_bolus.
# NOTE(review): in the cells visible here this frame is never used afterwards.
demographics = demographics.reset_index()
has_no_crystalloid_bolus = ~demographics['subject_id'].isin(crystalloid_bolus['subject_id'])
patients_missing_crystalloid_bolus = demographics.loc[
    has_no_crystalloid_bolus, ['subject_id', 'hadm_id', 'icustay_id']
]

# Attach the ICU stay window to every bolus record and derive the whole hours
# elapsed since ICU intime.
demographics = demographics.set_index('icustay_id')
crystalloid_bolus_wip = (
    crystalloid_bolus
    .sort_values(["subject_id", "charttime"])
    .set_index('icustay_id')
    .join(demographics[['intime', 'outtime']])
)
crystalloid_time_since_intime = (
    crystalloid_bolus_wip['charttime'] - crystalloid_bolus_wip['intime']
)
crystalloid_bolus_wip['hours_in'] = crystalloid_time_since_intime.apply(to_hours)

# Mean, standard deviation and count of every remaining column per stay and
# hour.
# NOTE(review): only intime/outtime are dropped, so charttime is aggregated as
# well - confirm that is wanted.
crystalloid_bolus_wip = (
    crystalloid_bolus_wip
    .sort_values(["subject_id", "charttime"])
    .reset_index()
    .drop(columns=['intime', 'outtime'])
    .groupby(['subject_id', 'hadm_id', 'icustay_id', 'hours_in'])
    .agg(['mean', 'std', 'count'])
)
crystalloid_bolus_wip

## Antibiotics

In [None]:
antibiotics

In [None]:

# Patients of the cohort with no row in antibiotics.
# NOTE(review): in the cells visible here this frame is never used afterwards.
demographics = demographics.reset_index()
has_no_antibiotics = ~demographics['subject_id'].isin(antibiotics['subject_id'])
patients_missing_antibiotics = demographics.loc[
    has_no_antibiotics, ['subject_id', 'hadm_id', 'icustay_id']
]

# Attach the ICU stay window to every prescription and derive the whole hours
# elapsed between ICU intime and the prescription start date.
demographics = demographics.set_index('icustay_id')
antibiotics_wip = (
    antibiotics
    .sort_values(["subject_id", "startdate"])
    .set_index('icustay_id')
    .join(demographics[['intime', 'outtime']])
)
antibiotics_time_since_intime = antibiotics_wip['startdate'] - antibiotics_wip['intime']
antibiotics_wip['hours_in'] = antibiotics_time_since_intime.apply(to_hours)

# Count the remaining columns per stay, drug and hour.
antibiotics_wip = (
    antibiotics_wip
    .sort_values(["subject_id", "startdate"])
    .reset_index()
    .drop(columns=['intime', 'outtime', 'startdate', 'enddate', 'drug_name_generic', 'route'])
    .groupby(['subject_id', 'hadm_id', 'icustay_id', 'drug', 'hours_in'])
    .agg(['count'])
)
antibiotics_wip

In [None]:
# Keep only the statistic names as column labels and name that level.
antibiotics_wip.columns = antibiotics_wip.columns.droplevel(0).set_names('Aggregated')

# One column per (drug, statistic) pair, one row per stay and hour; both axes
# sorted.
antibiotics_wip = (
    antibiotics_wip
    .unstack(level='drug')
    .reorder_levels(['drug', 'Aggregated'], axis=1)
    .sort_index(axis=0)
    .sort_index(axis=1)
)
antibiotics_wip

In [None]:
# A drug is dropped when the share of missing values in its 'count' column
# exceeds the threshold.
threshold = 0.99
columns_to_drop = [
    drug
    for drug, statistic in antibiotics_wip.columns
    if statistic == 'count'
    and antibiotics_wip[(drug, statistic)].isnull().mean() > threshold
]

antibiotics_wip_red = antibiotics_wip.drop(columns=columns_to_drop)
antibiotics_wip_red

## Diagnoses

In [None]:
diagnoses

In [None]:
# Count the remaining columns per patient, admission and ICD-9 title.
diagnosis_group_keys = ['subject_id', 'hadm_id', 'icd9_title']
diagnoses_wip = diagnoses.groupby(diagnosis_group_keys).agg(['count'])
diagnoses_wip

In [None]:
# Keep only the statistic names as column labels and name that level.
diagnoses_wip.columns = diagnoses_wip.columns.droplevel(0).set_names('Aggregated')

# One column per (icd9_title, statistic) pair, one row per patient and
# admission; both axes sorted.
diagnoses_wip = (
    diagnoses_wip
    .unstack(level='icd9_title')
    .reorder_levels(['icd9_title', 'Aggregated'], axis=1)
    .sort_index(axis=0)
    .sort_index(axis=1)
)
diagnoses_wip

## Analysis and Plotting

In [None]:
#var_names = list(X.columns.get_level_values('LEVEL2'))
#plot_variable_histograms(var_names, X)
var_names = ['White blood cell count']
#plot_variable_histograms(var_names, X)

#############
# Print the total proportions: for every retained labs/vitals column, the share
# of rows that hold a value.
# DataFrame.items() is used because iteritems() no longer exists in
# pandas >= 2.0, and the column count is not bound to `vars`, which would
# shadow the builtin of that name.
rows = labs_vitals_red.shape[0]
print('')
for column_label, column_values in labs_vitals_red.items():
    ratio = 1.0 * column_values.count() / rows
    # Format the percentage directly so no floating point noise is printed
    # (e.g. 56.7 rather than 56.699999999999996).
    print(str(column_label) + ': ' + '{:.1f}'.format(ratio * 100) + '% present')
# The string below is disabled code (per-subject proportions) kept for reference.
"""
#############
# Print the per subject proportions!
df = X.groupby(['subject_id']).count()
for k in [1, 2, 3]:
    print('% of subjects had at least ' + str(k) + ' present')
    d = df > k
    d = d.sum(axis=0)
    d = d / len(df)
    d = d.reset_index()
    for index, row in d.iterrows():
        print(str(index) + ': ' + str(round(row[0], 3)*100) + '%')
    print('\n')
"""

print('Done!')