In [1]:
from __future__ import absolute_import
from __future__ import print_function
import pandas as pd
import csv
import sys
import os

import numpy as np
import shutil
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Placeholder locations: replace each string with a real directory before running.
# Directory holding the raw MIMIC-III CSV tables (ICUSTAYS.csv is read from here).
mimic_path = "the directory that indludes mimic-iii csv files"
# NOTE(review): root_path is not referenced by any cell visible in this notebook.
root_path = "the directory that includes the main csv data"
# Directory the extracted CSVs are read from and the pos/neg cohort CSVs are written to.
data_processed_path = "directory to load the extracted data"

In [2]:
all_data_deli = pd.read_csv(os.path.join(data_processed_path, 'all_data_delirium_mimic.csv'))

In [None]:
all_data_deli['GENDER'].value_counts()

In [4]:
# The extract carries a 'CAM-ICU MS change' column next to 'CAM-ICU MS Change'
# (presumably the same item under two spellings -- confirm). Copy every non-null
# lower-case value onto the capitalised column, which is the one selected later.
ms_change_lower = all_data_deli['CAM-ICU MS change']
has_lower_value = ms_change_lower.notnull()
all_data_deli.loc[has_lower_value, 'CAM-ICU MS Change'] = ms_change_lower

In [5]:
col_order = ['ICUSTAY_ID','BIN','HOURS','AGE', 'GENDER', 'Height','Weight','PATIENTWEIGHT',
             'Oxygen Saturation', 'Heart Rate','Temperature C', 'Temperature F','WBC',
             'Sodium','BUN','Glucose','direct bilirubin','Hemoglobin','Platelets',
             'Potassium','Chloride','Bicarbonate','Creatinine','ALT','AST','Alkaline Phosphate',
             'Delirium assessment','CAM-ICU MS Change','CAM-ICU Inattention','CAM-ICU Altered LOC',
             'CAM-ICU Disorganized thinking', 
             'CAM-ICU RASS LOC']
            

In [6]:
all_data_deli = all_data_deli[col_order]

In [None]:
all_data_deli.head(1)

In [None]:
all_data_deli.groupby(['ICUSTAY_ID']).head(1).shape

# CAM Positive selection

In [9]:
#Positive CAM-ICU
# Row-wise boolean masks, one per CAM-ICU feature column.
feature1_pos = all_data_deli['CAM-ICU MS Change']==1
# Inattention counts as present for either code 1 or code 4.
# NOTE(review): the meaning of code 4 is not visible in this notebook -- confirm
# against the item dictionary.
feature2_pos = (all_data_deli['CAM-ICU Inattention']==1) | (all_data_deli['CAM-ICU Inattention']==4)
feature3_pos = all_data_deli['CAM-ICU Altered LOC']==1
feature4_pos = all_data_deli['CAM-ICU Disorganized thinking']==1
# A record is CAM positive when features 1 and 2 both hold together with
# feature 3 or feature 4.
cam_pos = all_data_deli[(feature1_pos&feature2_pos)&(feature3_pos|feature4_pos)]

In [10]:
# NOTE: repeats the cam_pos selection already made in the previous cell; the result is the same frame.
cam_pos = all_data_deli[(feature1_pos&feature2_pos)&(feature3_pos|feature4_pos)]

In [None]:
feature1_pos.sum(),feature2_pos.sum(),feature3_pos.sum(),feature4_pos.sum()

In [None]:
#Statistics of dataset
print("ICU Stays with CAM positive {0}" .format(cam_pos.ICUSTAY_ID.nunique()))
print("Unique ICU Stays {0} \n" .format(all_data_deli.ICUSTAY_ID.nunique()))

print("No. of records with CAM positive {0}" .format(cam_pos.shape[0]))
print("Total No. of records {0}" .format(all_data_deli.shape[0]))

In [None]:
pos_id = cam_pos.ICUSTAY_ID.unique()
cam_pos_df = all_data_deli[all_data_deli['ICUSTAY_ID'].isin(pos_id)]
all_data_deli['CAM'] = np.nan
((feature1_pos&feature2_pos)&(feature3_pos|feature4_pos)).sum()

In [14]:
# Record-level CAM flag: 1 where features 1 and 2 hold together with feature 3 or 4,
# 0 everywhere else.
cam_positive_mask = (feature1_pos & feature2_pos) & (feature3_pos | feature4_pos)
all_data_deli.loc[cam_positive_mask, 'CAM'] = 1
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form acts on a temporary object and is
# not guaranteed to update the frame (it is a no-op under pandas Copy-on-Write).
all_data_deli['CAM'] = all_data_deli['CAM'].fillna(value=0)
all_data_deli['CAM'].value_counts()

0.0    618127
1.0      5303
Name: CAM, dtype: int64

In [15]:
all_data_deli.columns

Index(['ICUSTAY_ID', 'BIN', 'HOURS', 'AGE', 'GENDER', 'Height', 'Weight',
       'PATIENTWEIGHT', 'Oxygen Saturation', 'Heart Rate', 'Temperature C',
       'Temperature F', 'WBC', 'Sodium', 'BUN', 'Glucose', 'direct bilirubin',
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate', 'Delirium assessment',
       'CAM-ICU MS Change', 'CAM-ICU Inattention', 'CAM-ICU Altered LOC',
       'CAM-ICU Disorganized thinking', 'CAM-ICU RASS LOC', 'CAM'],
      dtype='object')

In [16]:
all_data_deli.loc[all_data_deli['Weight'].notnull(),'PATIENTWEIGHT'] = all_data_deli['Weight']

In [17]:
def fahr_to_celsius(temp_fahr):
    """Translate a temperature from degrees Fahrenheit to degrees Celsius.

    The computation is plain arithmetic, so it accepts a scalar as well as any
    object that supports element-wise ``-``, ``*`` and ``/`` (the notebook
    passes a pandas Series).
    """
    degrees_above_freezing = temp_fahr - 32
    return degrees_above_freezing * 5 / 9

In [None]:
all_data_deli["Temperature F"].describe()

In [19]:
# Overwrites the "Temperature F" column with its Celsius equivalent, so from here
# on the column holds Celsius values despite its name.
# NOTE: not idempotent -- re-running this cell converts the already converted
# values a second time.
all_data_deli["Temperature F"] = fahr_to_celsius(all_data_deli["Temperature F"])

In [None]:
all_data_deli["Temperature F"].describe()

In [21]:
all_data_deli.loc[all_data_deli['Temperature F'].notnull(),'Temperature C'] = all_data_deli['Temperature F']

In [None]:
all_data_deli[['Temperature F','Temperature C']].describe()

In [None]:
all_data_deli[['PATIENTWEIGHT','Weight']].describe()

In [None]:
all_data_deli.head()

In [None]:
all_data_deli.BIN.describe()

In [None]:
all_data_deli[all_data_deli.BIN < 0].shape

In [30]:
data_copy  = all_data_deli.copy()

In [None]:
data_copy.head()

In [32]:
# Switch to the column naming scheme used by the rest of the notebook.
# Only columns whose name actually changes are listed: the identity entries
# ("Heart Rate", "BUN") were no-ops, and the "Bilirubin" entry referred to a
# column that does not exist in this frame (the column is already called
# "direct bilirubin"), so rename() silently ignored it.
data_copy = data_copy.rename(columns={
    "ICUSTAY_ID": "patientunitstayid",
    "BIN": "itemoffset",
    "GENDER": "gender",
    "AGE": "age",
    "Height": "admissionheight",
    "PATIENTWEIGHT": "admissionweight",
    "Oxygen Saturation": "O2 Saturation",
    "Glucose": "glucose",
    "Temperature C": "Temperature (C)",
    "Sodium": "sodium",
    "WBC": "WBC x 1000",
})

In [33]:
def check(x):
    """Coerce a raw cell value to ``float``.

    The value is converted to text, surrounding whitespace is stripped and the
    result is parsed as a float.

    Returns:
        The parsed float, or ``np.nan`` when the text is not a valid number
        (for example ``""``, ``"abc"`` or ``None``).

    Only conversion failures are turned into NaN; unrelated exceptions such as
    ``KeyboardInterrupt`` propagate instead of being swallowed.
    """
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        # Non-numeric content is treated as a missing value.
        return np.nan

def check_itemvalue(df):
    """Run ``check`` over every cell of ``df``, one column at a time.

    Each column is replaced by the element-wise result of ``check``. The frame
    is modified in place and the same object is returned.
    """
    for column_name in df.columns:
        df[column_name] = df[column_name].apply(check)
    return df

# labelling

In [None]:
data_copy.columns

In [35]:
order_columns = ['patientunitstayid','itemoffset', 
       'gender', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose', 'Temperature (C)',
       'sodium','BUN', 'WBC x 1000', 'direct bilirubin',
       'Hemoglobin','Platelets','Potassium', 'Chloride', 'Bicarbonate', 'Creatinine',
       'ALT', 'AST', 'Alkaline Phosphate','CAM']

In [36]:
# Keep records whose offset is greater than -7 (presumably hours relative to ICU
# admission -- confirm), then label a private copy.
data_copy = data_copy[data_copy['itemoffset'] > -7]
label_deli = data_copy.copy()

# Record-level label: the CAM value where it is exactly 0 or 1, NaN otherwise.
cam_flag = label_deli['CAM']
label_deli['labelrec'] = cam_flag.where(cam_flag.isin([0, 1])).astype(float)

# Patient-level label: 1 for every record of a stay that has at least one
# positive record, 0 for all other stays.
pos_cam_coh = label_deli.loc[label_deli['labelrec'] == 1, 'patientunitstayid'].unique()
label_deli['labelpt'] = label_deli['patientunitstayid'].isin(pos_cam_coh).astype(float)

In [None]:
label_deli.groupby('patientunitstayid').count().shape

# Add SOFA score to dataframe

In [None]:
# Keep only the modelling columns (this also leaves out labelrec / labelpt).
data_copy = label_deli[order_columns]
df_mimic = data_copy.copy()
sofa = pd.read_csv(os.path.join(data_processed_path, 'mimic_pivoted_sofa.csv'))

# 24-hour window index of every record: offsets in [0, 24) -> day 1,
# [24, 48) -> day 2, [-24, 0) -> day 0, and so on. One vectorised floor division
# replaces a 1007-iteration loop of boolean masks and yields the same value for
# every offset in [-192, 23976); unlike the loop it also labels later offsets
# instead of leaving them NaN. Missing offsets stay NaN.
# NOTE(review): assumes the 'day' column of the SOFA table uses the same
# numbering -- confirm against the SOFA extraction.
df_mimic['day'] = (df_mimic['itemoffset'] // 24 + 1).astype(float)

sofa = sofa.rename(columns={'icustay_id': 'patientunitstayid'})

# Diagnostic only: number of stays present in both tables.
set_sofa = set(sofa.patientunitstayid.unique())
set_mimic = set(df_mimic.patientunitstayid.unique())
inters = list(set_sofa.intersection(set_mimic))
print(len(inters))

# Left join keeps every record; records without a SOFA row for that stay/day
# get NaN in the SOFA columns.
new_df = pd.merge(df_mimic, sofa, how='left', on=['patientunitstayid', 'day'])

In [None]:
new_df.head()

In [None]:
new_df.columns

## Add other variables to dataframe

In [41]:
data_copy = new_df
df_mimic = data_copy.copy()
df_vent = pd.read_csv(os.path.join(data_processed_path, 'mimic_wes.csv'))
df_vent.rename(columns={'icustay_id':'patientunitstayid'},inplace=True)
new_df = pd.merge(df_mimic, df_vent, how='left',left_on=['patientunitstayid','itemoffset'],right_on=['patientunitstayid','hr'])

In [None]:
new_df.columns

In [43]:
columns_order = ['patientunitstayid', 'itemoffset', 'gender', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'direct bilirubin',
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate', 'sofa', 'sofa_wo_gcs',
       'vent_flag','rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
       'rate_phenylephrine', 'fluidin', 'fluidout','CAM']

In [44]:
new_df = new_df[columns_order]

## Imputation patient wise for weight and height

In [45]:
# Within each stay, spread the known height / weight values over the stay's
# other records: carry the last seen value forward first, then fill the
# remaining leading gaps backwards.
for static_col in ['admissionheight', 'admissionweight']:
    new_df[static_col] = (
        new_df.groupby("patientunitstayid")[static_col]
        .transform(lambda values: values.ffill().bfill())
    )

# Missing values

## record-wise

In [None]:
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt

# Percentage of records with a missing value, per column, smallest first.
columns = columns_order
percent_missing = new_df[columns].isnull().sum() * 100 / len(new_df)
missing_value_df = (
    pd.DataFrame({'column_name': columns, 'percent_missing': percent_missing})
    .sort_values('percent_missing')
    .reset_index(drop=True)
)
missing_value_df

## Patient-wise

In [None]:
# Per stay and column: fraction of records that have a value.
df_g = new_df[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())

# A fraction of 0 means the stay has no value at all for that column; turn it
# into NaN so isnull() below counts such stays.
df_g = df_g.replace(0, np.nan)

import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt

# Percentage of stays with no value at all, per column, smallest first.
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = (
    pd.DataFrame({'column_name': columns, 'percent_missing': percent_missing})
    .sort_values('percent_missing')
    .reset_index(drop=True)
)
missing_value_df

# Correlation

In [48]:
new_df.rename(index=str, columns={"admissionheight": "Height",
                                  "admissionweight":"Weight",
                                  "glucose" : "Glucose",
                                  "sodium" : "Sodium",
                                  "vent_flag" : "Ventilation",
                                  "rate_dopamine" : "Dopamine",
                                  "rate_epinephrine" : "Epinephrine",
                                  "rate_norepinephrine":"Norepinephrine",
                                  "rate_phenylephrine":"Phenylephrine",
                                  "gender":"Gender",
                                  "sofa":"Sofa",
                                  "sofa_wo_gcs":"Sofa_wo_gcs",
                                  "Temperature (C)" : "Temperature",
                                  "WBC x 1000": "WBC",
                                  "age":"Age"}, inplace=True)

In [49]:
new_df['Epinephrine'].fillna(value=0,inplace=True)
new_df['Norepinephrine'].fillna(value=0,inplace=True) 
new_df['Phenylephrine'].fillna(value=0,inplace=True)
new_df['Dopamine'].fillna(value=0,inplace=True)


In [50]:
new_df['Vasopressor dose'] = np.nan
new_df['Vasopressor dose'] = new_df['Epinephrine']+new_df['Norepinephrine'] + new_df['Phenylephrine']/10 + new_df['Dopamine']/2
new_df.drop(columns=['Epinephrine', 'Norepinephrine','Phenylephrine','Dopamine'],inplace=True)


In [None]:
new_df['Vasopressor dose'].notnull().sum()

In [52]:
columns_for_corr = ['Age', 'Height',
       'Weight', 'Heart Rate', 'O2 Saturation', 'Glucose',
       'Temperature', 'Sodium', 'BUN', 'WBC', 
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine','Ventilation','Vasopressor dose','Gender','Sofa', 'Sofa_wo_gcs',  'CAM']

In [None]:
new_df['Vasopressor dose'].notnull().sum()

In [None]:
new_df[columns_for_corr].columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

colormap = plt.cm.RdBu

mask = np.zeros(new_df[columns_for_corr].corr().shape, dtype=bool)
mask[np.tril_indices(len(mask))] = True
mask = ~mask

plt.figure(figsize=(10,10))

sns.set(font_scale=1.4)
plt.title('Pearson Correlation of Features', y=1.05, size=15)

sns.heatmap(new_df[columns_for_corr].corr(), mask = mask, linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=False)

plt.savefig('mimic_corr.png',dpi=450, facecolor='white', bbox_inches = 'tight',transparent=True)
plt.show()

# Save not imputed data

In [None]:
# Attach the ICU length of stay to every record and keep the analysis cohort.
los = pd.read_csv(os.path.join(mimic_path, 'ICUSTAYS.csv'))
los = los[['ICUSTAY_ID','LOS']]
# Not the last expression of the cell, so this preview is not displayed.
los.head()
# Scale LOS by 24 (presumably days -> hours, matching the hour thresholds below -- confirm).
los['LOS'] = los['LOS'] * 24
los.rename(columns={"ICUSTAY_ID": "patientunitstayid"},inplace=True)
# Left join: records whose stay is absent from ICUSTAYS get LOS = NaN and are
# removed by the LOS filter below.
new_df_los = pd.merge(new_df, los, how='left', left_on=['patientunitstayid'],right_on=['patientunitstayid'])
print(new_df_los.patientunitstayid.nunique())
# NOTE(review): the trailing notes look like reminders for a 48-hour cohort
# variant; as written the thresholds are LOS >= 24 and itemoffset > 0.
new_df_los = new_df_los[new_df_los['LOS']>=24] #CHANGE TO 48
new_df_los = new_df_los[new_df_los['itemoffset'] > 0] #CHANGE TO ZERO
# Drop rows that are exact duplicates across all columns.
new_df_los_nodups = new_df_los.drop_duplicates()

In [None]:
new_df_los_nodups.groupby("patientunitstayid").count().shape

In [48]:
label_deli = new_df_los_nodups.copy()

# Record-level label: the CAM value where it is exactly 0 or 1, NaN otherwise.
cam_flag = label_deli['CAM']
label_deli['labelrec'] = cam_flag.where(cam_flag.isin([0, 1])).astype(float)

# Patient-level label: 1 for all records of a stay with at least one positive
# record, 0 for the remaining stays.
pos_cam_coh = label_deli.loc[label_deli['labelrec'] == 1, 'patientunitstayid'].unique()
label_deli['labelpt'] = label_deli['patientunitstayid'].isin(pos_cam_coh).astype(float)

# Split by patient-level label and renumber rows from 0 in each part.
pos_cam_df = label_deli[label_deli['labelpt'] == 1].reset_index(drop=True)
neg_cam_df = label_deli[label_deli['labelpt'] == 0].reset_index(drop=True)

# Persist both cohorts before any imputation is applied.
pos_cam_df.to_csv(os.path.join(data_processed_path, 'pos_mimic_notimputed_24los.csv'), index=False)
neg_cam_df.to_csv(os.path.join(data_processed_path, 'neg_mimic_notimputed_24los.csv'), index=False)

# Imputation

In [60]:
new_df = label_deli.copy()

In [61]:
mean_columns = ['age', 'admissionheight','admissionweight']

In [62]:
# mean Imputation of each patient
# Fill gaps with the mean of the same stay's own values.
for i in mean_columns:
    per_stay_mean = new_df.groupby("patientunitstayid")[i].transform('mean')
    # Assign the result back: fillna(inplace=True) on a column selection acts on
    # a temporary object and is not guaranteed to update new_df (it is a no-op
    # under pandas Copy-on-Write).
    new_df[i] = new_df[i].fillna(per_stay_mean)

In [63]:
## Impute with mean of whole cohort
for i in mean_columns:
    new_df[i] = new_df[i].fillna(new_df[i].mean())

In [None]:
new_df.columns

In [65]:
zero_columns = ['vent_flag', 'rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
       'rate_phenylephrine']

In [66]:
new_df[zero_columns] = new_df[zero_columns].fillna(value=0)

In [None]:
# PATIENT WISE
df_g = new_df[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())
for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

# FFill

In [68]:
forward_columns = ['Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'direct bilirubin',
       'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate', 'sofa', 'sofa_wo_gcs','fluidin', 'fluidout']

In [69]:
for i in forward_columns:
    new_df[i] = new_df.groupby("patientunitstayid")[i].transform(lambda v: v.ffill())

In [None]:
# PATIENT WISE

df_g = new_df[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())

for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})

import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

# BFill

In [72]:
back_columns = forward_columns

In [73]:
for i in back_columns:
    new_df[i] = new_df.groupby("patientunitstayid")[i].transform(lambda v: v.bfill())

In [None]:
#After Bfill
df_g = new_df[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())

for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})

import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

# Drop columns with high missing rate (ALT, AST, Alk Ph, Dir Bil) and the fluid in/out columns


In [None]:
new_df.drop(columns=['ALT', 'AST','Alkaline Phosphate','direct bilirubin','fluidin', 'fluidout'],inplace=True)
print(new_df.patientunitstayid.nunique())
print(new_df.shape)

# Length of stay

In [None]:
# Attach the ICU length of stay to the imputed records and keep the analysis cohort.
los = pd.read_csv(os.path.join(mimic_path, 'ICUSTAYS.csv'))
los = los[['ICUSTAY_ID', 'LOS']].rename(columns={"ICUSTAY_ID": "patientunitstayid"})
# Scale LOS by 24 (presumably days -> hours, matching the hour thresholds below -- confirm).
los['LOS'] = los['LOS'] * 24

# new_df descends from the frame that was already merged with LOS in the
# "Save not imputed data" section, so it can already hold a 'LOS' column.
# Merging again would then produce 'LOS_x' / 'LOS_y' and the filter below would
# fail with a KeyError; drop any existing copy first.
records_without_los = new_df.drop(columns=['LOS'], errors='ignore')
new_df_los = pd.merge(records_without_los, los, how='left', on=['patientunitstayid'])
print(new_df_los.patientunitstayid.nunique())

new_df_los = new_df_los[new_df_los['LOS'] >= 24]       # minimum ICU stay; 48 for the 48-hour variant
new_df_los = new_df_los[new_df_los['itemoffset'] > 0]  # records after ICU admission only
# Drop rows that are exact duplicates across all columns.
new_df_los_nodups = new_df_los.drop_duplicates()

## LOS at least 24/48 hours

### Missing values

In [81]:
columns_order = ['patientunitstayid', 'itemoffset', 'gender', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'Hemoglobin',
        'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
       'Creatinine', 'sofa', 'sofa_wo_gcs',
       'vent_flag','rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
       'rate_phenylephrine', 'LOS','CAM']

In [None]:
df_g = new_df_los_nodups[columns_order].groupby("patientunitstayid").apply(lambda x: x.notnull().mean())
for i in df_g.columns:
    df_g[i] = df_g[i].replace({0:np.nan})
#after Imputation
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
columns = df_g.columns
percent_missing = df_g.isnull().sum() * 100 / len(df_g)
missing_value_df = pd.DataFrame({'column_name': columns,'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(inplace=True, drop=True)
missing_value_df

# Drop records with missing values

In [83]:
# A record is kept only if every one of these columns has a value.
required_columns = [
    'Heart Rate', 'O2 Saturation', 'glucose',
    'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'Hemoglobin',
    'Platelets', 'Potassium', 'Chloride', 'Bicarbonate', 'Creatinine',
    'sofa', 'sofa_wo_gcs', 'vent_flag', 'rate_dopamine', 'rate_epinephrine',
    'rate_norepinephrine', 'rate_phenylephrine',
]
new_df_los_nodups = new_df_los_nodups.dropna(subset=required_columns)

In [None]:
new_df_los_nodups.shape

In [None]:
new_df_los_nodups.patientunitstayid.nunique()

### split CAM pos and CAM neg

In [87]:
label_deli = new_df_los_nodups.copy()
label_deli['labelrec'] = np.nan
label_deli.loc[label_deli['CAM']==1,'labelrec']=1
label_deli.loc[label_deli['CAM']==0,'labelrec']=0
label_deli['labelpt'] = np.nan

In [88]:
pos_cam_coh = label_deli[label_deli['labelrec']==1]['patientunitstayid'].unique()
label_deli.loc[label_deli['patientunitstayid'].isin(pos_cam_coh), 'labelpt']=1
label_deli.loc[~(label_deli['patientunitstayid'].isin(pos_cam_coh)), 'labelpt']=0

In [None]:
label_deli.tail(1)

In [90]:
pos_cam_df = label_deli[label_deli['labelpt']==1]
neg_cam_df = label_deli[label_deli['labelpt']==0]
pos_cam_df.reset_index(inplace=True)
pos_cam_df = pos_cam_df.drop(columns=['index'])

neg_cam_df.reset_index(inplace=True)
neg_cam_df = neg_cam_df.drop(columns=['index'])

In [None]:
pos_cam_df['patientunitstayid'].nunique(),neg_cam_df['patientunitstayid'].nunique()

In [92]:
# Replace the record-level CAM flag with the patient-level label, so every row of
# a stay carries the same value: 0 throughout neg_cam_df and 1 throughout
# pos_cam_df. The record-level information remains available in 'labelrec'.
neg_cam_df['CAM'] = neg_cam_df['labelpt']
pos_cam_df['CAM'] = pos_cam_df['labelpt']

In [93]:
pos_cam_df.to_csv(os.path.join(data_processed_path, 'pos_mimic_imputed_24los.csv'), index=False)
neg_cam_df.to_csv(os.path.join(data_processed_path, 'neg_mimic_imputed_24los.csv'), index=False)

In [94]:
mimic_df = pd.concat([neg_cam_df, pos_cam_df],axis=0)

In [None]:
mimic_df.patientunitstayid.nunique()