# Data extraction for the prediction task over time (latest version of the data extraction)

In [2]:
from __future__ import absolute_import
from __future__ import print_function
from pandas import DataFrame, Series
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals.joblib import dump, load
import os
import csv
import sys
import pandas as pd
import numpy as np
import re
import os
import shutil
eicu_path = "the directory that indludes eicu csv files"
root_path = "the directory that includes the main csv data"
data_processed_path = "directory to load the extracted data"
def dataframe_from_csv(path, header=0, index_col=False):
    """Load the CSV at ``path`` into a pandas DataFrame.

    ``header`` and ``index_col`` are forwarded unchanged to
    ``pandas.read_csv``.
    """
    read_options = {'header': header, 'index_col': index_col}
    frame = pd.read_csv(path, **read_options)
    return frame

In [3]:
pats = dataframe_from_csv(os.path.join(eicu_path, 'patient.csv'),index_col=False)

In [None]:
pats.head(5)

In [6]:
###############################
# Non-time series preprocessing
###############################
# Integer codes for the gender column; every unlisted value falls back to the '' entry.
g_map = {'Female': 1, 'Male': 2, '': 0, 'NaN': 0, 'Unknown': 0, 'Other': 0}
def transform_gender(gender_series):
    """Encode a gender Series as integers using ``g_map``.

    Missing values are first replaced by ``''``; values that are not keys of
    ``g_map`` receive the code stored under ``''``.

    Returns:
        dict with the single key ``'gender'`` holding the encoded Series.
    """
    fallback = g_map['']
    encoded = gender_series.fillna('').apply(lambda raw: g_map.get(raw, fallback))
    return {'gender': encoded}
# Integer codes for the ethnicity column; every unlisted value falls back to the '' entry.
e_map = {'Asian': 1, 'African American': 2, 'Caucasian': 3, 'Hispanic': 4, 'Native American': 5, 'NaN': 0, '': 0}
def transform_ethnicity(ethnicity_series):
    """Encode an ethnicity Series as integers using ``e_map``.

    Missing values are first replaced by ``''``; values that are not keys of
    ``e_map`` receive the code stored under ``''``.

    Returns:
        dict with the single key ``'ethnicity'`` holding the encoded Series.
    """
    def encode(raw):
        if raw in e_map:
            return e_map[raw]
        return e_map['']

    filled = ethnicity_series.fillna('')
    return {'ethnicity': filled.apply(encode)}
# Integer codes shared by the hospital and unit discharge status columns.
h_s_map = {'Expired': 0, 'Alive': 1, '': 2, 'NaN': 2}
def transform_hospital_discharge_status(status_series):
    """Encode a hospital discharge status Series as integers using ``h_s_map``.

    Missing values are first replaced by ``''``; values that are not keys of
    ``h_s_map`` receive the code stored under ``''``.

    Returns:
        dict with the single key ``'hospitaldischargestatus'`` holding the
        encoded Series.
    """
    def encode(status):
        return h_s_map[status] if status in h_s_map else h_s_map['']

    cleaned = status_series.fillna('')
    return {'hospitaldischargestatus': cleaned.apply(encode)}
def transform_unit_discharge_status(status_series):
    """Encode a unit discharge status Series as integers using ``h_s_map``.

    Missing values are first replaced by ``''``; values that are not keys of
    ``h_s_map`` receive the code stored under ``''``.

    Returns:
        dict with the single key ``'unitdischargestatus'`` holding the
        encoded Series.
    """
    unknown_code = h_s_map['']
    codes = status_series.fillna('').apply(lambda status: h_s_map.get(status, unknown_code))
    return {'unitdischargestatus': codes}
def transform_dx_into_id(df):
    """Replace ``df['apacheadmissiondx']`` by integer ids, in place.

    Each distinct non-missing diagnosis gets an id 0..k-1 in order of first
    appearance; missing diagnoses stay missing (NaN) after the mapping.

    ``pd.factorize`` drops missing values from its uniques but still returns
    a ``-1`` code for them, so pairing uniques with codes positionally (as the
    previous implementation did) shifts every id after the first missing
    value. The ids are therefore derived from the uniques alone.

    Args:
        df: DataFrame with an ``apacheadmissiondx`` column; modified in place.

    Returns:
        The same DataFrame object, with the column replaced.
    """
    _, dx_values = pd.factorize(df['apacheadmissiondx'].unique())
    dictionary = {dx: dx_id for dx_id, dx in enumerate(dx_values)}
    df['apacheadmissiondx'] = df['apacheadmissiondx'].map(dictionary)
    return df

In [7]:
def read_patients_table(eicu_path, root_path):
    """Load ``patient.csv``, filter and encode it, and return the modelling columns.

    Args:
        eicu_path: directory containing ``patient.csv``.
        root_path: directory where ``all_stays.csv`` is written.

    Returns:
        DataFrame restricted to the columns chosen by
        ``filter_patients_on_columns_model``.

    Side effects:
        Writes ``all_stays.csv`` (the wider column set, already encoded) to
        ``root_path`` before the final column filter is applied.
    """
    pats = dataframe_from_csv(os.path.join(eicu_path, 'patient.csv'),index_col=False)
    # Keep stays whose age lies in [18, 89].
    pats = filter_patients_on_age(pats, min_age=18, max_age=89)
    pats = filter_patients_on_columns(pats)
    # Each transform_* returns {column_name: encoded Series}; update() writes
    # those values into the matching column in place.
    # NOTE(review): DataFrame.update keeps the column's existing dtype where it
    # can, so the encoded codes may not end up as plain ints -- confirm.
    pats.update(transform_gender(pats.gender))
    pats.update(transform_ethnicity(pats.ethnicity))
    pats.update(transform_hospital_discharge_status(pats.hospitaldischargestatus))
    pats.update(transform_unit_discharge_status(pats.unitdischargestatus))
    pats = transform_dx_into_id(pats)
    # Saved before the narrower model column filter below.
    pats.to_csv(os.path.join(root_path, 'all_stays.csv'), index=False)
    pats = filter_patients_on_columns_model(pats)
    return pats

def filter_patients_on_age(patient, min_age=18, max_age=89):
    """Return the rows of ``patient`` whose age lies in ``[min_age, max_age]``.

    The ``age`` column may hold the text ``'> 89'``; it is treated as 90.
    Missing ages become -1, so they are excluded for any non-negative
    ``min_age``. The returned frame has an integer ``age`` column.

    The removed ``DataFrame.ix`` indexer is no longer used, and the input
    frame is left unmodified (the work is done on a copy).

    Args:
        patient: DataFrame with an ``age`` column.
        min_age: inclusive lower bound.
        max_age: inclusive upper bound.

    Raises:
        ValueError: if an age cannot be interpreted as a number.
    """
    patient = patient.copy()
    # Normalise the '> 89' marker as text first so the column can be parsed
    # numerically whatever dtype the CSV reader produced.
    age = pd.to_numeric(patient['age'].replace('> 89', '90'), errors='raise')
    patient['age'] = age.fillna(-1).astype(int)
    in_range = (patient['age'] >= min_age) & (patient['age'] <= max_age)
    return patient.loc[in_range]

def filter_patients_on_columns(patients):
    """Return ``patients`` restricted to the columns kept for ``all_stays.csv``.

    The columns are returned in the order listed below.
    """
    wanted = [
        'patientunitstayid',
        'gender',
        'age',
        'ethnicity',
        'apacheadmissiondx',
        'hospitaladmityear',
        'hospitaldischargeyear',
        'hospitaldischargeoffset',
        'uniquepid',
        'admissionheight',
        'hospitaladmitoffset',
        'admissionweight',
        'hospitaldischargestatus',
        'unitdischargeoffset',
        'unitdischargestatus',
    ]
    subset = patients[wanted]
    return subset
def filter_patients_on_columns_model(patients):
    """Return ``patients`` restricted to the columns used for modelling.

    The columns are returned in the order listed below.
    """
    wanted = [
        'patientunitstayid',
        'gender',
        'age',
        'ethnicity',
        'apacheadmissiondx',
        'uniquepid',
        'admissionheight',
        'hospitaladmitoffset',
        'admissionweight',
        'hospitaldischargestatus',
        'unitdischargeoffset',
        'unitdischargestatus',
    ]
    subset = patients[wanted]
    return subset

In [None]:
from __future__ import absolute_import
from __future__ import print_function
import os
patients = read_patients_table(eicu_path,root_path)
patient_cohort = patients["patientunitstayid"].unique()

In [None]:
patients.head()

In [None]:
patients.shape

In [None]:
patients.groupby(['uniquepid']).head(1).shape

In [None]:
139367 - 133409

# Nurse Charting

In [None]:
nc = dataframe_from_csv(os.path.join(eicu_path, 'nurseCharting.csv'),index_col=False)
ds = nc[nc['nursingchartcelltypevalname']=="Delirium Score"].patientunitstayid.unique()
cam = nc[nc['nursingchartvalue']=="CAM-ICU"].patientunitstayid.unique()
cam_cohort = set(ds).intersection(set(cam))
cam_cohort = np.array(list(cam_cohort.intersection(set(patient_cohort))))
cam_cohort = np.array(cam_cohort)
cam_cohort = cam_cohort.astype(int)


In [None]:
nc.patientunitstayid.nunique(), cam_cohort.shape[0]

In [8]:
nc = nc[nc['patientunitstayid'].isin(cam_cohort)]

In [9]:
nc.to_csv(os.path.join(data_processed_path, 'nc.csv'),index=False)

In [11]:

def filter_nc_on_columns(nc):
    """Return ``nc`` restricted to the five nurse-charting columns used downstream."""
    wanted = [
        'patientunitstayid',
        'nursingchartoffset',
        'nursingchartcelltypevallabel',
        'nursingchartcelltypevalname',
        'nursingchartvalue',
    ]
    subset = nc[wanted]
    return subset
def rename_nc_columns(nc):
    """Rename the nurse-charting columns to the generic ``item*`` names, in place.

    The index labels are converted to strings as part of the same rename
    call. Returns the same DataFrame object.
    """
    generic_names = {
        "nursingchartoffset": "itemoffset",
        "nursingchartcelltypevalname": "itemname",
        "nursingchartcelltypevallabel": "itemlabel",
        "nursingchartvalue": "itemvalue",
    }
    nc.rename(index=str, columns=generic_names, inplace=True)
    return nc
def item_name_selected_from_nc(nc,label,name):
    """Return the rows whose ``itemname`` is in ``name`` and whose ``itemlabel`` is in ``label``.

    NOTE(review): the two membership tests are independent, so any
    (label, name) combination drawn from the two lists is kept, not only the
    pairs that were listed together -- confirm this is intended.
    """
    name_matches = nc.itemname.isin(name)
    label_matches = nc.itemlabel.isin(label)
    return nc[name_matches & label_matches]
def unify_itemname_nc(nc):  #SYNC
    """Normalise ``itemname`` so equivalent measurements share one name, in place.

    Steps, in order:
      1. rows named ``'Value'`` take their ``itemlabel`` as name;
      2. invasive and non-invasive BP names collapse to ``'BP Systolic'`` /
         ``'BP Diastolic'``;
      3. rows selected by ``itemlabel`` are renamed (MAP variants, SpO2,
         bedside glucose), overriding the earlier steps.

    Returns the same DataFrame object.
    """
    generic = nc['itemname'] == 'Value'
    nc.loc[generic, 'itemname'] = nc.itemlabel

    renames_by_name = (
        ('Non-Invasive BP Systolic', 'BP Systolic'),
        ('Non-Invasive BP Diastolic', 'BP Diastolic'),
        ('Invasive BP Systolic', 'BP Systolic'),
        ('Invasive BP Diastolic', 'BP Diastolic'),
    )
    for old_name, unified in renames_by_name:
        nc.loc[nc['itemname'] == old_name, 'itemname'] = unified

    renames_by_label = (
        ('Arterial Line MAP (mmHg)', 'MAP (mmHg)'),
        ('Invasive BP Mean', 'MAP (mmHg)'),
        ('Non-Invasive BP Mean', 'MAP (mmHg)'),
        ('SpO2', 'O2 Saturation'),
        ('Bedside Glucose', 'glucose'),
    )
    for source_label, unified in renames_by_label:
        nc.loc[nc['itemlabel'] == source_label, 'itemname'] = unified
    return nc

In [12]:
def read_nc_table(data_processed_path):    #SYNC
    """Load the cohort's nurse-charting rows and reduce them to the selected vitals.

    Reads ``nc.csv`` from ``data_processed_path``, keeps the charting columns,
    renames them to the generic ``item*`` names, keeps only rows whose label
    and name both appear in the table below, unifies the item names and
    finally drops the ``itemlabel`` column.

    Args:
        data_processed_path: directory containing ``nc.csv``.

    Returns:
        DataFrame with columns ``patientunitstayid``, ``itemoffset``,
        ``itemname`` and ``itemvalue``.
    """
    nc = dataframe_from_csv(os.path.join(data_processed_path, 'nc.csv'),index_col=False)
    nc = filter_nc_on_columns(nc)
    nc = rename_nc_columns(nc)
    # [itemlabel, itemname] entries to keep. The list used to be mis-indented
    # (a syntax error) and listed the 'Verbal' entry twice; the duplicate had
    # no effect on the membership filter and was removed.
    vitals = [['O2 Saturation' ,'O2 Saturation'],
              ['SpO2', 'Value'],
              ['Heart Rate' ,'Heart Rate'],
              ['Temperature' ,'Temperature (C)'],
              ['Bedside Glucose' ,'Bedside Glucose'],
              ['Delirium Scale/Score' ,'Delirium Score'],
              ['Glasgow coma score' ,'Verbal'],
              ['Glasgow coma score' ,'GCS Total'],
              ['Glasgow coma score' ,'Eyes'],
              ['Glasgow coma score' ,'Motor'],
              ['Non-Invasive BP' ,'Non-Invasive BP Systolic'],
              ['Non-Invasive BP' ,'Non-Invasive BP Diastolic'],
              ['Invasive BP' ,'Invasive BP Systolic'],
              ['Invasive BP' ,'Invasive BP Diastolic'],
              ['MAP (mmHg)' ,'Value'],
              ['Sedation Scale/Score/Goal','Sedation Score'],
              ['ICP', 'ICP'],
              ['CI','CI'],
              ['Respiratory Rate','Respiratory Rate']]

    label = [v[0] for v in vitals]
    name = [v[1] for v in vitals]
    nc = item_name_selected_from_nc(nc,label,name)
    nc = unify_itemname_nc(nc)
    del nc['itemlabel']
    return nc


In [14]:
nc = read_nc_table(data_processed_path)

In [15]:
nc_items = nc.itemname.unique()

In [None]:
nc_items

In [17]:
def break_up_stays_by_unit_stay_nc(nursecharting, root_path, stayid=None, verbose=1):
    """Write each stay's nurse-charting rows to ``<root_path>/<stay_id>/nc.csv``.

    Args:
        nursecharting: DataFrame with ``patientunitstayid`` and ``itemoffset``
            columns.
        root_path: directory under which one folder per stay is created.
        stayid: array of stay ids to export (must expose ``.shape``); defaults
            to every id present in ``nursecharting``.
        verbose: when truthy, progress is written to stdout.

    Rows are sorted by ``itemoffset``. A stay id without rows still gets a
    header-only file.

    Raises:
        OSError: if a stay folder cannot be created.
    """
    unit_stays = nursecharting.patientunitstayid.unique() if stayid is None else stayid
    nb_unit_stays = unit_stays.shape[0]
    for i, stay_id in enumerate(unit_stays):
        if verbose:
            sys.stdout.write('\rStayID {0} of {1}...'.format(i+1, nb_unit_stays))
        dn = os.path.join(root_path, str(stay_id))
        try:
            os.makedirs(dn)
        except OSError:
            # An already existing folder is expected on re-runs; anything else
            # (permissions, a file in the way, ...) must not be hidden.
            if not os.path.isdir(dn):
                raise
        # .loc replaces the removed .ix indexer.
        stay_rows = nursecharting.loc[nursecharting.patientunitstayid == stay_id]
        stay_rows.sort_values(by='itemoffset').to_csv(os.path.join(dn, 'nc.csv'), index=False)
    if verbose:
        sys.stdout.write('DONE!\n')

In [18]:
len(cam_cohort)

24628

In [None]:
break_up_stays_by_unit_stay_nc(nc,root_path,stayid=cam_cohort, verbose=1)

# Patients 

In [20]:
def break_up_stays_by_unit_stay(pats, root_path, stayid=None, verbose=1):
    """Write each stay's patient row(s) to ``<root_path>/<stay_id>/pat.csv``.

    Args:
        pats: DataFrame with a ``patientunitstayid`` column.
        root_path: directory under which one folder per stay is created.
        stayid: array of stay ids to export (must expose ``.shape``); defaults
            to every id present in ``pats``.
        verbose: when truthy, progress is written to stdout.

    A stay id without rows still gets a header-only file.

    Raises:
        OSError: if a stay folder cannot be created.
    """
    unit_stays = pats.patientunitstayid.unique() if stayid is None else stayid
    nb_unit_stays = unit_stays.shape[0]
    for i, stay_id in enumerate(unit_stays):
        if verbose:
            sys.stdout.write('\rStayID {0} of {1}...'.format(i+1, nb_unit_stays))
        dn = os.path.join(root_path, str(stay_id))
        try:
            os.makedirs(dn)
        except OSError:
            # An already existing folder is expected on re-runs; anything else
            # (permissions, a file in the way, ...) must not be hidden.
            if not os.path.isdir(dn):
                raise
        pat = pats.loc[pats["patientunitstayid"] == stay_id]
        pat.to_csv(os.path.join(dn, 'pat.csv'), index=False)
    if verbose:
        sys.stdout.write('DONE!\n')

In [None]:
patients.shape

In [None]:
break_up_stays_by_unit_stay(patients, root_path,stayid=cam_cohort, verbose=1)

# Similar Lab items, NC items and InfDrg items need to be unified

### Lab Functions


In [None]:
lab = dataframe_from_csv(os.path.join(eicu_path, 'lab.csv'),index_col=False)

In [None]:
lab.head()

In [25]:
def filter_lab_on_columns(lab):
    """Return ``lab`` restricted to the stay id, offset, name and result columns."""
    wanted = [
        'patientunitstayid',
        'labresultoffset',
        'labname',
        'labresult',
    ]
    subset = lab[wanted]
    return subset

def rename_lab_columns(lab):
    """Rename the lab columns to the generic ``item*`` names, in place.

    The index labels are converted to strings as part of the same rename
    call. Returns the same DataFrame object.
    """
    generic_names = {
        "labresultoffset": "itemoffset",
        "labname": "itemname",
        "labresult": "itemvalue",
    }
    lab.rename(index=str, columns=generic_names, inplace=True)
    return lab

def item_name_selected_from_lab(lab,items):
    """Return the rows of ``lab`` whose ``itemname`` is one of ``items``."""
    wanted_rows = lab['itemname'].isin(items)
    return lab[wanted_rows]

def check(x):
    """Return ``x`` parsed as a float, or NaN when its text is not numeric.

    The value is converted with ``str`` and stripped of surrounding
    whitespace before parsing. Only the parse failure (``ValueError``) is
    turned into NaN; the previous bare ``except`` also swallowed
    ``KeyboardInterrupt`` and ``SystemExit``.
    """
    try:
        return float(str(x).strip())
    except ValueError:
        return np.nan

def check_itemvalue(df):
    """Convert ``df['itemvalue']`` to floats in place via ``check``.

    Values that ``check`` cannot parse become NaN. Returns the same
    DataFrame object.
    """
    parsed = df['itemvalue'].apply(check)
    df['itemvalue'] = parsed.astype(float)
    return df

def read_lab_table(eicu_path):
    """Load ``lab.csv`` and return the selected lab items with numeric values.

    The table is reduced to the stay id, offset, name and result columns,
    renamed to the generic ``item*`` names and filtered to the lab names
    listed below. ``'bedside glucose'`` is renamed to ``'glucose'`` and
    ``'O2 Sat (%)'`` to ``'O2 Saturation'``; ``itemvalue`` is converted to
    float, with unparseable values becoming NaN.
    """
    wanted_labs = ['O2 Sat (%)','WBC x 1000','sodium','BUN',
                   'bedside glucose','glucose',
                   'direct bilirubin',
                   'Hgb','platelets x 1000','potassium','chloride','bicarbonate',
                   'creatinine','ALT (SGPT)','AST (SGOT)','alkaline phos.',
                   'lactate','pH','ammonia','cortisol','TSH','serum osmolality']
    # Lab names that are merged into a differently named item.
    merged_names = (
        ('bedside glucose', 'glucose'),
        ('O2 Sat (%)', 'O2 Saturation'),
    )

    raw = dataframe_from_csv(os.path.join(eicu_path, 'lab.csv'),index_col=False)
    lab = rename_lab_columns(filter_lab_on_columns(raw))
    lab = item_name_selected_from_lab(lab, wanted_labs)
    for old_name, new_name in merged_names:
        lab.loc[lab['itemname'] == old_name, 'itemname'] = new_name
    return check_itemvalue(lab)

In [None]:
lab = read_lab_table(eicu_path) 

In [27]:
def break_up_lab_by_unit_stay(lab, root_path, stayid=None, verbose=1):
    """Write each stay's lab rows to ``<root_path>/<stay_id>/lab.csv``.

    Args:
        lab: DataFrame with ``patientunitstayid`` and ``itemoffset`` columns.
        root_path: directory under which one folder per stay is created.
        stayid: array of stay ids to export (must expose ``.shape``); defaults
            to every id present in ``lab``.
        verbose: when truthy, progress is written to stdout.

    Rows are sorted by ``itemoffset``. A stay id without rows still gets a
    header-only file.

    Raises:
        OSError: if a stay folder cannot be created.
    """
    unit_stays = lab.patientunitstayid.unique() if stayid is None else stayid
    nb_unit_stays = unit_stays.shape[0]
    for i, stay_id in enumerate(unit_stays):
        if verbose:
            sys.stdout.write('\rStayID {0} of {1}...'.format(i+1, nb_unit_stays))
        dn = os.path.join(root_path, str(stay_id))
        try:
            os.makedirs(dn)
        except OSError:
            # An already existing folder is expected on re-runs; anything else
            # (permissions, a file in the way, ...) must not be hidden.
            if not os.path.isdir(dn):
                raise
        # .loc replaces the removed .ix indexer.
        stay_rows = lab.loc[lab.patientunitstayid == stay_id]
        stay_rows.sort_values(by='itemoffset').to_csv(os.path.join(dn, 'lab.csv'), index=False)
    if verbose:
        sys.stdout.write('DONE!\n')

In [None]:
lab.shape

In [29]:
lab_items = lab.itemname.unique()

In [None]:
break_up_lab_by_unit_stay(lab,root_path,stayid = cam_cohort, verbose = 1)

### Read each patient's nc, lab and demographics files and combine them into one CSV file named timeseries.csv inside that patient's folder

In [None]:
def convert_events_to_timeseries(events, variable_column='itemname', variables=()):
    """Pivot long-format events into one row per ``itemoffset``.

    Args:
        events: DataFrame with ``itemoffset``, ``patientunitstayid``,
            ``itemvalue`` and ``variable_column`` columns.
        variable_column: column whose values become the output columns.
        variables: column names that must exist in the result; any that are
            absent are added filled with NaN. The default is now an immutable
            empty tuple instead of a shared mutable list.

    Returns:
        DataFrame sorted by ``itemoffset`` with one column per variable plus
        ``itemoffset`` and ``patientunitstayid``. When one
        (itemoffset, variable) pair occurs several times, the entry that
        sorts last by value is kept.
    """
    metadata = (events[['itemoffset', 'patientunitstayid']]
                .sort_values(by=['itemoffset', 'patientunitstayid'])
                .drop_duplicates(keep='first')
                .set_index('itemoffset'))
    timeseries = (events[['itemoffset', variable_column, 'itemvalue']]
                  .sort_values(by=['itemoffset', variable_column, 'itemvalue'], axis=0)
                  .drop_duplicates(subset=['itemoffset', variable_column], keep='last'))
    timeseries = (timeseries.pivot(index='itemoffset', columns=variable_column, values='itemvalue')
                  .merge(metadata, left_index=True, right_index=True)
                  .sort_index(axis=0)
                  .reset_index())
    for v in variables:
        # ``in`` on a DataFrame tests the column labels.
        if v not in timeseries:
            timeseries[v] = np.nan
    return timeseries

In [None]:
nc_items

In [None]:
len(lab_items), len(nc_items)

In [None]:
var_to_consider = list(lab_items) + list(nc_items)

In [None]:
len(var_to_consider)

In [None]:
var_to_consider

# Binning

In [None]:
def binning(df, x=60):
    """Collapse a time series to one row per bin of ``x`` offset units.

    ``itemoffset`` is divided by ``x`` and truncated to an integer bin index.
    Within each bin, missing values of numeric-dtype columns are filled with
    that column's mean over the bin, and only the last row of each bin is
    kept. Non-numeric columns are not mean-filled.

    The previous implementation used ``groupby.apply`` + ``droplevel(0)`` and
    ``x.mean()`` over all columns. That depends on the pandas version: the
    mean raises on string columns in recent releases, and the index returned
    by ``apply`` does not always carry the group level. It also modified the
    caller's frame. This version works on a copy and uses
    ``groupby.transform`` on numeric columns only.

    Args:
        df: DataFrame with an ``itemoffset`` column.
        x: bin width, in the units of ``itemoffset``.

    Returns:
        New DataFrame ordered by bin index, keeping the original index labels
        of the retained rows.
    """
    df = df.copy()
    df['itemoffset'] = (df['itemoffset']/x).astype(int)
    value_cols = [c for c in df.select_dtypes(include='number').columns if c != 'itemoffset']
    if value_cols:
        bin_means = df.groupby('itemoffset')[value_cols].transform('mean')
        df[value_cols] = df[value_cols].fillna(bin_means)
    # Stable sort keeps the within-bin row order, so keep='last' below picks
    # the same row as iterating the groups in key order did.
    df = df.sort_values('itemoffset', kind='mergesort')
    df = df.drop_duplicates(subset=['itemoffset'], keep='last')
    return df

# Extract Time series

In [None]:
def extract_time_series_from_subject(t_path):
    """Build ``timeseries.csv`` in every stay folder under ``t_path``.

    For each sub-folder whose name parses as an integer, ``pat.csv``,
    ``lab.csv`` and ``nc.csv`` are read, the lab and nurse-charting events are
    pivoted into a time series, joined with the patient row, binned with a
    width of 60 and written to ``timeseries.csv`` in the same folder.

    The run stays best-effort: folders that lack one of the three input files
    are skipped, and a stay whose processing fails is reported on stderr and
    skipped. Previously a bare ``except`` hid every failure, including
    programming errors and Ctrl-C.
    """
    required_files = ('pat.csv', 'lab.csv', 'nc.csv')
    for stay_dir in os.listdir(t_path):
        dn = os.path.join(t_path, stay_dir)
        try:
            stay_id = int(stay_dir)
        except ValueError:
            # Not a stay folder.
            continue
        if not os.path.isdir(dn):
            continue
        if not all(os.path.isfile(os.path.join(dn, fname)) for fname in required_files):
            # Without all three inputs no time series can be assembled.
            continue
        try:
            pat = dataframe_from_csv(os.path.join(dn, 'pat.csv'))
            lab = dataframe_from_csv(os.path.join(dn, 'lab.csv'))
            nc = dataframe_from_csv(os.path.join(dn, 'nc.csv'))
            nclab = pd.concat([nc, lab]).sort_values(by=['itemoffset'])
            timeepisode = convert_events_to_timeseries(nclab, variables=var_to_consider)
            nclabpat = pd.merge(timeepisode, pat, on='patientunitstayid')
            df = binning(nclabpat, 60)
            df.to_csv(os.path.join(dn, 'timeseries.csv'), index=False)
            sys.stdout.write('\rWrite StayID {0}...'.format(stay_id))
        except Exception as err:
            # Keep going, but make the failure visible.
            sys.stderr.write('\nSkipping StayID {0}: {1!r}\n'.format(stay_id, err))
            continue
    print('DONE')

In [None]:
extract_time_series_from_subject(root_path)

### Delete folders without timeseries file

In [40]:
import shutil
def delete_wo_timeseries(t_path):
    """Remove every stay folder under ``t_path`` that has no ``timeseries.csv``.

    Only sub-folders whose name parses as an integer are considered; other
    entries are left untouched. A folder that cannot be removed is reported
    on stderr and skipped (it used to be ignored silently by a bare
    ``except``).
    """
    for stay_dir in os.listdir(t_path):
        dn = os.path.join(t_path, stay_dir)
        try:
            int(stay_dir)
        except ValueError:
            # Not a stay folder.
            continue
        if not os.path.isdir(dn):
            continue
        if os.path.isfile(os.path.join(dn, 'timeseries.csv')):
            continue
        try:
            shutil.rmtree(dn)
        except OSError as err:
            sys.stderr.write('Could not remove {0}: {1!r}\n'.format(dn, err))
    print('DONE')


In [41]:
delete_wo_timeseries(root_path)

DONE


### All the data in one dataframe 

In [42]:
import pandas as pd
import os
import sys

unit_stays  = pd.Series(os.listdir(root_path))
unit_stays = list((filter(str.isdigit, unit_stays)))

all_filenames = []
for stay_id in(unit_stays):
    df_file = os.path.join(root_path, str(stay_id),'timeseries.csv')
    all_filenames.append(df_file)

In [None]:
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv(os.path.join(data_processed_path, 'all_data_delirium_eicu.csv'), index=False)

# Preprocessing

# Read data from here


In [None]:
patients = read_patients_table(eicu_path,root_path)

In [None]:
patients.columns

In [15]:
patients_ = patients[['patientunitstayid','uniquepid']]

In [None]:
patients_.head(1)

In [None]:
all_data_deli = pd.read_csv(os.path.join(data_processed_path, 'all_data_delirium_eicu.csv'))
all_data_deli.head(1)

In [None]:
all_data_deli.shape

In [19]:
all_data_deli = pd.merge(all_data_deli, patients_, how='left', left_on=['patientunitstayid'],right_on=['patientunitstayid'])

In [None]:
all_data_deli.head(1)

In [None]:
all_data_deli["Delirium Score"].unique()

In [24]:
all_data_deli["Delirium Score"] = all_data_deli["Delirium Score"].str.lower()
df = all_data_deli[(all_data_deli["Delirium Score"]=='yes')|(all_data_deli["Delirium Score"]=='no')|(all_data_deli["Delirium Score"].isna())]

In [None]:
df["Delirium Score"].value_counts()

In [None]:
df.rename(index=str, columns={"Hgb": "Hemoglobin",
                                         "platelets x 1000": "Platelets",
                                          "potassium":"Potassium",
                                          "chloride" : "Chloride",
                                          "bicarbonate": "Bicarbonate",
                                          "creatinine": "Creatinine",
                                          "ALT (SGPT)": "ALT",
                                          "AST (SGOT)": "AST",
                                          "alkaline phos.": "Alkaline Phosphate",
                                          "Delirium Score": "CAM"}, inplace=True)

In [None]:
df.groupby("patientunitstayid").count().shape

In [None]:
df.groupby("uniquepid").count().shape

In [None]:
df.head()

# Add sofa score

In [30]:
df_eicu = df.copy()
sofa = pd.read_csv(os.path.join(data_processed_path, 'eicu_pivoted_sofa.csv'))

# Day index of each record, used as the join key with the SOFA table:
# itemoffset in [0, 24) -> day 1, [24, 48) -> day 2, ... Offsets below 0 or
# above 999*24 keep day = NaN. This is the vectorised equivalent of the former
# 999-pass loop (whose overlapping <=/>= bounds let the later day win, and
# whose final pass left offset 999*24 on day 999 -- hence the clip).
df_eicu['day'] = np.nan

max_day = 999
offsets = df_eicu['itemoffset']
in_window = (offsets >= 0) & (offsets <= max_day * 24)
df_eicu.loc[in_window, 'day'] = (offsets[in_window] // 24 + 1).clip(upper=max_day)
    
new_df = pd.merge(df_eicu, sofa, how='left', left_on=['patientunitstayid','day'],right_on=['patientunitstayid','day'])

In [31]:
columns_order = ['patientunitstayid','uniquepid', 'itemoffset', 'gender', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'direct bilirubin',
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate','sofa', 'sofa_wo_gcs','CAM']


In [32]:
new_df = new_df[columns_order]

In [None]:
new_df.head()

In [34]:
# Integer codes for the CAM column; every unlisted value falls back to the '' entry.
d_map = {'no': 0, 'yes': 1,'':2}

def transform_deli(deli_series):
    """Encode a delirium (CAM) Series as integers using ``d_map``.

    Missing values are first replaced by ``''``; values that are not keys of
    ``d_map`` receive the code stored under ``''``.

    Returns:
        dict with the single key ``'CAM'`` holding the encoded Series.
    """
    fallback = d_map['']
    encoded = deli_series.fillna('').apply(lambda answer: d_map.get(answer, fallback))
    return {'CAM': encoded}

new_df.update(transform_deli(new_df['CAM']))
new_df ["CAM"] = new_df["CAM"].astype(int)

## Add other variables

In [36]:
df_eicu = new_df.copy()
df_vent = pd.read_csv(os.path.join(data_processed_path, 'eicu_wes.csv'))

In [37]:
new_df = pd.merge(df_eicu, df_vent, how='left', left_on=['patientunitstayid','itemoffset'],right_on=['patientunitstayid','hr'])

In [38]:
all_deli = new_df.copy()

In [41]:
columns_order = ['patientunitstayid','uniquepid', 'itemoffset', 'gender', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'direct bilirubin',
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate', 'sofa', 'sofa_wo_gcs',
       'vent_flag','rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
       'rate_phenylephrine', 'fluidin', 'fluidout','CAM']

In [42]:
all_deli = all_deli[columns_order]

# Labelling

In [43]:
all_deli = all_deli[all_deli['itemoffset'] > -7]

In [44]:
label_deli = all_deli.copy()
label_deli['labelrec'] = np.nan
label_deli.loc[label_deli['CAM']==1,'labelrec']=1
label_deli.loc[label_deli['CAM']==0,'labelrec']=0

label_deli['labelpt'] = np.nan

pos_cam_coh = label_deli[label_deli['labelrec']==1]['patientunitstayid'].unique()
label_deli.loc[label_deli['patientunitstayid'].isin(pos_cam_coh), 'labelpt']=1
label_deli.loc[~(label_deli['patientunitstayid'].isin(pos_cam_coh)), 'labelpt']=0

In [45]:
columns_order = ['patientunitstayid','uniquepid', 'itemoffset', 'gender', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'direct bilirubin',
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate', 'sofa', 'sofa_wo_gcs',
       'vent_flag','rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
       'rate_phenylephrine', 'fluidin', 'fluidout','CAM','labelrec','labelpt']

In [46]:
label_deli = label_deli[columns_order]

In [47]:
label_deli.columns

Index(['patientunitstayid', 'uniquepid', 'itemoffset', 'gender', 'age',
       'admissionheight', 'admissionweight', 'Heart Rate', 'O2 Saturation',
       'glucose', 'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000',
       'direct bilirubin', 'Hemoglobin', 'Platelets', 'Potassium', 'Chloride',
       'Bicarbonate', 'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate', 'sofa',
       'sofa_wo_gcs', 'vent_flag', 'rate_dopamine', 'rate_epinephrine',
       'rate_norepinephrine', 'rate_phenylephrine', 'fluidin', 'fluidout',
       'CAM', 'labelrec', 'labelpt'],
      dtype='object')

# Missing values

In [49]:
new_df = label_deli.copy()

## Record-wise

In [None]:
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = columns_order 
percent_missing = new_df[columns].isnull().sum() * 100 / len(new_df)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

## Patient-wise


In [None]:
df_g = new_df[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())

for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})

    
    #after Imputation

import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

In [None]:
new_df.head()

# Pearson correlation

## all data

In [None]:
new_df.columns

In [55]:
columns_for_corr = ['Age', 'Height',
       'Weight', 'Heart Rate', 'O2 Saturation', 'Glucose',
       'Temperature', 'Sodium', 'BUN', 'WBC', 
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine','Ventilation','Vasopressor dose','Gender','Sofa', 'Sofa_wo_gcs',  'CAM']

In [57]:
new_df.rename(index=str, columns={"admissionheight": "Height",
                                  "admissionweight":"Weight",
                                  "glucose" : "Glucose",
                                  "sodium" : "Sodium",
                                  "vent_flag" : "Ventilation",
                                  "rate_dopamine" : "Dopamine",
                                  "rate_epinephrine" : "Epinephrine",
                                  "rate_norepinephrine":"Norepinephrine",
                                  "rate_phenylephrine":"Phenylephrine",
                                  "gender":"Gender",
                                  "sofa":"Sofa",
                                  "sofa_wo_gcs":"Sofa_wo_gcs",
                                  "Temperature (C)" : "Temperature",
                                  "WBC x 1000": "WBC",
                                  "age":"Age"}, inplace=True)

In [58]:
# Missing vasopressor rates are replaced by 0. The result is assigned back to
# the frame: calling fillna(inplace=True) on a selected column is chained
# assignment and does not update new_df under pandas copy-on-write.
vasopressor_cols = ['Epinephrine', 'Norepinephrine', 'Phenylephrine', 'Dopamine']
new_df[vasopressor_cols] = new_df[vasopressor_cols].fillna(value=0)

In [59]:
# Combine the four vasopressor rates into a single 'Vasopressor dose' column
# and drop the individual columns. Missing rates count as 0; the fill is
# assigned back to the frame because fillna(inplace=True) on a selected column
# is chained assignment and does not update new_df under pandas copy-on-write.
vasopressor_cols = ['Epinephrine', 'Norepinephrine', 'Phenylephrine', 'Dopamine']
new_df[vasopressor_cols] = new_df[vasopressor_cols].fillna(value=0)
# Phenylephrine is weighted by 1/10 and dopamine by 1/2.
# NOTE(review): presumably a norepinephrine-equivalent conversion -- confirm the factors.
new_df['Vasopressor dose'] = new_df['Epinephrine']+new_df['Norepinephrine'] + new_df['Phenylephrine']/10 + new_df['Dopamine']/2
new_df.drop(columns=vasopressor_cols,inplace=True)

In [60]:
columns_for_corr = ['Age', 'Height',
       'Weight', 'Heart Rate', 'O2 Saturation', 'Glucose',
       'Temperature', 'Sodium', 'BUN', 'WBC', 
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine','Ventilation','Vasopressor dose','Gender','Sofa', 'Sofa_wo_gcs',  'CAM']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

colormap = plt.cm.RdBu

mask = np.zeros(new_df[columns_for_corr].corr().shape, dtype=bool)
mask[np.tril_indices(len(mask))] = True
mask = ~mask


plt.figure(figsize=(10,10))

sns.set(font_scale=1.4)
plt.title('Pearson Correlation of Features', y=1.05, size=15)

sns.heatmap(new_df[columns_for_corr].corr(), mask = mask, linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=False)

plt.savefig('eicu_corr_jama.png',dpi=450, facecolor='white', bbox_inches = 'tight',transparent=True)
plt.show()

# Imputation

In [163]:
def check(x):
    """Return ``x`` parsed as a float, or NaN when its text is not numeric.

    The value is converted with ``str`` and stripped of surrounding
    whitespace before parsing. Only the parse failure (``ValueError``) is
    turned into NaN; the previous bare ``except`` also swallowed
    ``KeyboardInterrupt`` and ``SystemExit``.
    """
    try:
        return float(str(x).strip())
    except ValueError:
        return np.nan

def check_itemvalue(df):
    """Pass every value of every column of ``df`` through ``check``, in place.

    Values that ``check`` cannot parse become NaN. Returns the same
    DataFrame object.
    """
    for column in df.columns:
        converted = df[column].apply(check)
        df[column] = converted
    return df


In [164]:
new_df.reset_index(inplace=True,drop=True)

In [165]:
new_df.columns

Index(['patientunitstayid', 'uniquepid', 'itemoffset', 'Gender', 'Age',
       'Height', 'Weight', 'Heart Rate', 'O2 Saturation', 'Glucose',
       'Temperature', 'Sodium', 'BUN', 'WBC', 'direct bilirubin', 'Hemoglobin',
       'Platelets', 'Potassium', 'Chloride', 'Bicarbonate', 'Creatinine',
       'ALT', 'AST', 'Alkaline Phosphate', 'Sofa', 'Sofa_wo_gcs',
       'Ventilation', 'fluidin', 'fluidout', 'CAM', 'labelrec', 'labelpt',
       'Vasopressor dose'],
      dtype='object')

In [166]:
mean_columns = ['Age', 'Height','Weight']

In [167]:
# mean Imputation of each patient
for i in mean_columns:
    # Fill each missing value with the mean of the same stay. The result is
    # assigned back: fillna(inplace=True) on a selected column is chained
    # assignment and does not update new_df under pandas copy-on-write.
    new_df[i] = new_df[i].fillna(new_df.groupby("patientunitstayid")[i].transform('mean'))

In [168]:
## Impute with mean of whole cohort
for i in mean_columns:
    new_df[i] = new_df[i].fillna(new_df[i].mean())

In [None]:
new_df.columns

# No. of Patients

In [None]:
new_df.shape

In [None]:
new_df.groupby("patientunitstayid").count().shape


In [None]:
new_df.groupby("uniquepid").count().shape


# Save not imputed data

In [173]:
los = pd.read_csv(os.path.join(eicu_path, 'apachePatientResult.csv'))
los = los[['patientunitstayid','actualiculos']]
los['actualiculos'] = los['actualiculos'] * 24
los.rename(columns={"actualiculos": "LOS"},inplace=True)
new_df_los = pd.merge(new_df, los, how='left', left_on=['patientunitstayid'],right_on=['patientunitstayid'])
new_df_los = new_df_los[new_df_los['LOS']>=24]       
new_df_los = new_df_los[new_df_los['itemoffset'] > 0] #CHANGE TO ZERO
new_df_los_nodups = new_df_los.drop_duplicates()

In [174]:
label_deli = new_df_los_nodups.copy()
label_deli['labelrec'] = np.nan
label_deli.loc[label_deli['CAM']==1,'labelrec']=1
label_deli.loc[label_deli['CAM']==0,'labelrec']=0
label_deli['labelpt'] = np.nan
pos_cam_coh = label_deli[label_deli['labelrec']==1]['patientunitstayid'].unique()
label_deli.loc[label_deli['patientunitstayid'].isin(pos_cam_coh), 'labelpt']=1
label_deli.loc[~(label_deli['patientunitstayid'].isin(pos_cam_coh)), 'labelpt']=0
pos_cam_df = label_deli[label_deli['labelpt']==1]
neg_cam_df = label_deli[label_deli['labelpt']==0]
pos_cam_df.reset_index(inplace=True)
pos_cam_df = pos_cam_df.drop(columns=['index'])
neg_cam_df.reset_index(inplace=True)
neg_cam_df = neg_cam_df.drop(columns=['index'])
neg_cam_df['CAM'] = neg_cam_df['labelpt']
pos_cam_df['CAM'] = pos_cam_df['labelpt']
pos_cam_df.to_csv(os.path.join(data_processed_path, 'pos_eicu_notimputed_24los.csv'), index=False)
neg_cam_df.to_csv(os.path.join(data_processed_path, 'neg_eicu_notimputed_24los.csv'), index=False)

In [175]:
new_df_los_nodups.groupby("patientunitstayid").count().shape

(16546, 33)

In [176]:
new_df_los_nodups.groupby("uniquepid").count().shape

(14228, 33)

In [47]:
zero_columns = ['vent_flag', 'rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
       'rate_phenylephrine']

In [48]:
new_df[zero_columns] = new_df[zero_columns].fillna(value=0)

In [None]:

# PATIENT WISE ZERO FILL

df_g = new_df[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())

for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})

    
    #after Imputation

import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

## Ffill

In [None]:
new_df.columns

In [51]:
forward_columns = ['Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'direct bilirubin',
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate', 'sofa', 'sofa_wo_gcs','fluidin', 'fluidout']

In [52]:
for i in forward_columns:
    new_df[i] = new_df.groupby("patientunitstayid")[i].transform(lambda v: v.ffill())

In [None]:

# PATIENT WISE FORWARD FILL

df_g = new_df[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())

for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})

    
    #after Imputation

import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

# BFill

In [55]:
backward_columns = forward_columns

In [56]:
for i in backward_columns:
    new_df[i] = new_df.groupby("patientunitstayid")[i].transform(lambda v: v.bfill())

In [None]:
new_df.patientunitstayid.nunique(), new_df.shape

## LOS at least 24 

In [64]:
los = pd.read_csv(os.path.join(eicu_path, 'apachePatientResult.csv'))
los = los[['patientunitstayid','actualiculos']]
los['actualiculos'] = los['actualiculos'] * 24
los.rename(columns={"actualiculos": "LOS"},inplace=True)
new_df_los = pd.merge(new_df, los, how='left', left_on=['patientunitstayid'],right_on=['patientunitstayid'])
new_df_los = new_df_los[new_df_los['LOS']>=24]
new_df_los = new_df_los[new_df_los['itemoffset'] > 0] #CHANGE TO ZERO
new_df_los_nodups = new_df_los.drop_duplicates()

In [None]:
new_df_los_nodups.shape

In [None]:
new_df_los_nodups.groupby("patientunitstayid").count().shape

In [None]:
# MISSING RATE (patient-wise) after filtering on LOS >= 24
df_g = new_df_los_nodups[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())
for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})
    #after Imputation
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

In [None]:
# MISSING RATE (record-wise) after filtering on LOS >= 24


import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = columns_order 
percent_missing = new_df_los_nodups[columns].isnull().sum() * 100 / len(new_df_los_nodups)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

## Drop columns with high missing rate (ALT,AST,Alk Ph, Dir Bil)

In [None]:
print(new_df_los_nodups.patientunitstayid.nunique(), new_df_los_nodups.shape)

new_df_los_nodups.drop(columns=['ALT', 'AST','Alkaline Phosphate','direct bilirubin','fluidin', 'fluidout'],inplace=True)

print(new_df_los_nodups.patientunitstayid.nunique())

print(new_df_los_nodups.shape)

In [None]:
print(new_df_los_nodups.shape)

### Missing values dropna 

In [81]:
columns_order = ['patientunitstayid', 'itemoffset', 'gender', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'Hemoglobin',
        'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'sofa', 'sofa_wo_gcs',
       'vent_flag','rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
       'rate_phenylephrine','LOS','CAM']

## Drop patients with missing values

In [None]:
new_df_los_nodups.dropna(subset=['Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'Hemoglobin',
       'Platelets', 'Potassium', 'Chloride', 'Bicarbonate', 'Creatinine',
       'sofa', 'sofa_wo_gcs', 'vent_flag', 'rate_dopamine', 'rate_epinephrine',
       'rate_norepinephrine', 'rate_phenylephrine'],inplace=True)

In [87]:
label_deli = new_df_los_nodups.copy()
label_deli['labelrec'] = np.nan
label_deli.loc[label_deli['CAM']==1,'labelrec']=1
label_deli.loc[label_deli['CAM']==0,'labelrec']=0
label_deli['labelpt'] = np.nan


pos_cam_coh = label_deli[label_deli['labelrec']==1]['patientunitstayid'].unique()
label_deli.loc[label_deli['patientunitstayid'].isin(pos_cam_coh), 'labelpt']=1
label_deli.loc[~(label_deli['patientunitstayid'].isin(pos_cam_coh)), 'labelpt']=0


pos_cam_df = label_deli[label_deli['labelpt']==1]
neg_cam_df = label_deli[label_deli['labelpt']==0]
pos_cam_df.reset_index(inplace=True)
pos_cam_df = pos_cam_df.drop(columns=['index'])

neg_cam_df.reset_index(inplace=True)
neg_cam_df = neg_cam_df.drop(columns=['index'])

neg_cam_df['CAM'] = neg_cam_df['labelpt']
pos_cam_df['CAM'] = pos_cam_df['labelpt']

pos_cam_df.to_csv(os.path.join(data_processed_path, 'pos_eicu_imputed_24los.csv'), index=False)
neg_cam_df.to_csv(os.path.join(data_processed_path, 'neg_eicu_imputed_24los.csv'), index=False)


## Save files 

In [89]:
pos_cam_df.to_csv(os.path.join(data_processed_path, 'pos_eicu_imputed_24los.csv'), index=False)
neg_cam_df.to_csv(os.path.join(data_processed_path, 'neg_eicu_imputed_24los.csv'), index=False)


In [None]:
pos_cam_df['patientunitstayid'].nunique()

In [None]:
neg_cam_df['patientunitstayid'].nunique()

In [93]:
eicu_df = pd.concat([neg_cam_df, pos_cam_df],axis=0)