In [2]:
from __future__ import absolute_import
from __future__ import print_function
import matplotlib.pyplot as plt
import pandas as pd
import csv
import sys
import os
import numpy as np
from scipy import stats
from numpy import array
%matplotlib inline
# Raise pandas' display limits so wide / long frames are shown without elision.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Data locations. Each one can be overridden with an environment variable so the
# notebook can run on another machine without editing code; when the variable
# is unset the value originally hard-coded here is used.
data_processed_path = os.environ.get(
    'DELIRIUM_OUTPUT_DIR',
    '/media/ehealth/HDD/ICU/DataSets/eICU/Delirium/new_prob_def/data/')
# NOTE: the two defaults below are placeholders, not real directories -- set the
# environment variables (or edit the strings) before running the loading cell.
data_processed_path_mimic = os.environ.get(
    'DELIRIUM_MIMIC_DIR', "directory to load mimic preprocessed data")
data_processed_path_eicu = os.environ.get(
    'DELIRIUM_EICU_DIR', "directory to load eicu preprocessed data")

In [3]:
# Read the "pos" and "neg" CSV files of each database from its input directory.
eicu_pos, eicu_neg = [
    pd.read_csv(os.path.join(data_processed_path_eicu, csv_name))
    for csv_name in ("pos_eicu_imputed_24los.csv", "neg_eicu_imputed_24los.csv")
]
mimic_pos, mimic_neg = [
    pd.read_csv(os.path.join(data_processed_path_mimic, csv_name))
    for csv_name in ("pos_mimic_imputed_24los.csv", "neg_mimic_imputed_24los.csv")
]

In [None]:
# Stack the two MIMIC frames row-wise. ignore_index=True gives the result a
# fresh, unique RangeIndex; without it the row labels of the two inputs repeat,
# and any later label-based operation (e.g. DataFrame.drop(index)) would act on
# rows coming from both inputs at once.
mimic_df = pd.concat([mimic_pos, mimic_neg], axis=0, ignore_index=True)
# Number of distinct stays in the combined frame.
mimic_df.patientunitstayid.nunique()

In [None]:
# Stack the two eICU frames row-wise. ignore_index=True gives the result a
# fresh, unique RangeIndex; without it the row labels of the two inputs repeat,
# and any later label-based operation (e.g. DataFrame.drop(index)) would act on
# rows coming from both inputs at once.
eicu_df = pd.concat([eicu_pos, eicu_neg], axis=0, ignore_index=True)
# Number of distinct stays in the combined frame.
eicu_df.patientunitstayid.nunique()

In [None]:
eicu_pos.head()

# Delirium onset distribution

In [None]:
# One row per stay: the first record (in current row order) with labelrec == 1.
tot_df = eicu_df[['patientunitstayid', 'itemoffset', 'CAM', 'labelrec', 'LOS']]
onset = (
    tot_df[tot_df['labelrec'] == 1]
    .groupby('patientunitstayid')
    .first()  # first non-null value of each column within the stay
    .reset_index()
)
# Presumably itemoffset is expressed in hours, so /24 yields days -- TODO confirm.
onset['itemoffset'] = onset['itemoffset'] / 24

# Explicit Figure/Axes instead of the implicit pyplot state; the former
# plt.xlabel('itemoffset') call was overwritten right away and is dropped.
fig, ax = plt.subplots()
onset['itemoffset'].plot.hist(bins=100, ax=ax)
ax.set_xlabel("Day of admission")
ax.set_ylabel("Number of patient")
fig.savefig("eicu_delirium", dpi=400, facecolor='white', bbox_inches='tight', transparent=True)
plt.show()


In [None]:
onset['itemoffset'].describe()

In [None]:
# Keep only stays whose onset offset (after the /24 scaling) is at most 15,
# then redraw the histogram.
onset = onset[onset['itemoffset'] <= 15]

# Explicit Figure/Axes instead of the implicit pyplot state; the former
# plt.xlabel('itemoffset') call was overwritten right away and is dropped.
fig, ax = plt.subplots()
onset['itemoffset'].plot.hist(bins=100, ax=ax)
ax.set_xlabel("Day of admission")
ax.set_ylabel("Number of patient")
fig.savefig("eicu_delirium_max15", dpi=400, facecolor='white', bbox_inches='tight', transparent=True)
plt.show()


In [None]:
onset['itemoffset'].describe()

In [None]:
# Narrow the onset table to offsets of at most 2 and summarise what is left.
onset = onset.loc[onset['itemoffset'].le(2)]
onset['itemoffset'].describe()

## MIMIC

In [None]:
# One row per stay: the first record (in current row order) with labelrec == 1.
tot_df = mimic_df[['patientunitstayid', 'itemoffset', 'CAM', 'labelrec', 'LOS']]
onset = (
    tot_df[tot_df['labelrec'] == 1]
    .groupby('patientunitstayid')
    .first()  # first non-null value of each column within the stay
    .reset_index()
)
# Presumably itemoffset is expressed in hours, so /24 yields days -- TODO confirm.
onset['itemoffset'] = onset['itemoffset'] / 24

# Explicit Figure/Axes instead of the implicit pyplot state; the former
# plt.xlabel('itemoffset') call was overwritten right away and is dropped.
fig, ax = plt.subplots()
onset['itemoffset'].plot.hist(bins=100, ax=ax)
ax.set_xlabel("Day of admission")
ax.set_ylabel("Number of patient")
fig.savefig("mimic_delirium", dpi=400, facecolor='white', bbox_inches='tight', transparent=True)
plt.show()


In [None]:
onset['itemoffset'].describe()

In [None]:
# Keep only stays whose onset offset (after the /24 scaling) is at most 15,
# then redraw the histogram.
onset = onset[onset['itemoffset'] <= 15]

# Explicit Figure/Axes instead of the implicit pyplot state; the former
# plt.xlabel('itemoffset') call was overwritten right away and is dropped.
fig, ax = plt.subplots()
onset['itemoffset'].plot.hist(bins=100, ax=ax)
ax.set_xlabel("Day of admission")
ax.set_ylabel("Number of patient")
fig.savefig("mimic_delirium_max15", dpi=400, facecolor='white', bbox_inches='tight', transparent=True)
plt.show()


In [None]:
onset['itemoffset'].describe()

In [None]:
# Narrow the onset table to offsets of at most 2 and summarise what is left.
onset = onset.loc[onset['itemoffset'].le(2)]
onset['itemoffset'].describe()

In [None]:
mimic_pos.groupby('patientunitstayid').first().shape

# Selection

In [9]:
# Candidate feature / label column names, grouped by kind.
# NOTE(review): `features` is not referenced by any later cell visible here.
features = [
    # demographics
    'gender', 'age', 'admissionheight', 'admissionweight',
    # vitals and labs
    'Heart Rate', 'O2 Saturation', 'glucose', 'Temperature (C)',
    'sodium', 'BUN', 'WBC x 1000', 'direct bilirubin',
    'Hemoglobin', 'Platelets', 'Potassium', 'Chloride', 'Bicarbonate',
    'Creatinine', 'ALT', 'AST', 'Alkaline Phosphate',
    # scores, ventilation flag and infusion rates
    'sofa', 'sofa_wo_gcs', 'vent_flag',
    'rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
    'rate_phenylephrine',
    # CAM and label columns
    'CAM', 'labelrec', 'labelpt',
]

In [10]:
def check(x):
    """Coerce a single value to ``float``; return ``np.nan`` if that fails.

    The value is converted to ``str`` and stripped of surrounding whitespace
    before parsing, so numeric strings such as ``" 3.5 "`` are accepted.

    Parameters
    ----------
    x : any
        Cell value to convert.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the text is not a valid float.
    """
    try:
        return float(str(x).strip())
    except (ValueError, TypeError):
        # Only conversion failures become NaN. A bare ``except`` would also
        # swallow KeyboardInterrupt / SystemExit while a long apply is running.
        return np.nan


def check_itemvalue(df):
    """Convert every column of ``df`` to floats via :func:`check`.

    The columns of ``df`` are replaced in place; the same object is returned so
    the call can be used on the right-hand side of an assignment.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with unparsable cells set to NaN.
    """
    for c in df.columns:
        df[c] = df[c].apply(check)
    return df


In [11]:
# Coerce every cell of both frames to float (unparsable cells become NaN).
# check_itemvalue modifies its argument and returns it, so the names still
# refer to the same frame objects afterwards.
eicu_df = check_itemvalue(eicu_df)
mimic_df = check_itemvalue(mimic_df)


In [None]:
eicu_df.shape,mimic_df.shape

## No of Patients

In [None]:
# Drop eICU rows whose gender code is 0, then count the remaining distinct stays.
# NOTE(review): presumably 0 encodes an unknown/other gender -- confirm. No
# equivalent filter is applied to mimic_df in the cells visible here.
eicu_df = eicu_df[eicu_df['gender']!=0]
eicu_df.patientunitstayid.nunique()

In [None]:
mimic_df.patientunitstayid.nunique()

In [None]:
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

# For every column of the eICU frame: a 100-bin histogram, then its summary
# statistics printed underneath.
for col in eicu_df.columns:
    series = eicu_df[col]
    plt.figure()
    plt.xlabel(col)
    series.plot.hist(bins=100)
    plt.show()
    print(series.describe())

In [None]:
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

# For every column of the MIMIC frame: a 100-bin histogram, then its summary
# statistics printed underneath.
for col in mimic_df.columns:
    series = mimic_df[col]
    plt.figure()
    plt.xlabel(col)
    series.plot.hist(bins=100)
    plt.show()
    print(series.describe())

In [15]:
def detect_outlier(data, columns, threshold=3.5):
    """Remove rows whose value in any of ``columns`` is a z-score outlier.

    Columns are processed in the given order. For each column the mean and the
    population standard deviation (``np.std``, ddof=0) are computed on the rows
    that survived the previous columns, and every row with
    ``abs(value - mean) / std > threshold`` is removed.

    Rows are removed by position (boolean mask), not by index label, so frames
    with repeated index labels (e.g. the result of ``pd.concat`` without
    ``ignore_index``) lose only the offending rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the numeric columns to screen.
    threshold : float, default 3.5
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when nothing was removed, otherwise a new frame
        without the outlier rows.
    """
    for col in list(columns):
        ncol = data[col]
        mean = np.mean(ncol)
        std = np.std(ncol)

        # A zero or NaN spread (constant, empty or all-NaN column) gives no
        # meaningful z-score; skip instead of dividing by zero.
        if not std > 0:
            continue

        # NaN values compare False here and are therefore never dropped.
        is_outlier = (np.abs((ncol - mean) / std) > threshold).to_numpy()
        if is_outlier.any():
            # .copy() detaches the result so later column assignments on it
            # do not raise SettingWithCopyWarning.
            data = data[~is_outlier].copy()

    return data

In [16]:
# Columns of the MIMIC frame that are screened for z-score outliers.
columns_for_outlier_mimic = [
    'admissionheight',
    'admissionweight',
    'Heart Rate',
    'O2 Saturation',
    'glucose',
    'Temperature (C)',
    'sodium',
    'BUN',
    'WBC x 1000',
    'Hemoglobin',
    'Platelets',
    'Potassium',
    'Chloride',
    'Bicarbonate',
    'Creatinine',
]



In [None]:
# MIMIC frame with the rows flagged by detect_outlier (on the columns listed in
# columns_for_outlier_mimic) removed.
mimic_outlier = detect_outlier(mimic_df,columns_for_outlier_mimic)

In [None]:
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

# For every column of the outlier-filtered MIMIC frame: a 100-bin histogram,
# then its summary statistics printed underneath.
for col in mimic_outlier.columns:
    series = mimic_outlier[col]
    plt.figure()
    plt.xlabel(col)
    series.plot.hist(bins=100)
    plt.show()
    print(series.describe())

In [None]:
# Columns of the eICU frame that are screened for z-score outliers.
columns_for_outlier_eicu = [
    'admissionheight',
    'admissionweight',
    'Heart Rate',
    'O2 Saturation',
    'glucose',
    'Temperature (C)',
    'sodium',
    'BUN',
    'WBC x 1000',
    'Hemoglobin',
    'Platelets',
    'Potassium',
    'Chloride',
    'Bicarbonate',
    'Creatinine',
]
# eICU frame with the rows flagged by detect_outlier removed.
eicu_outlier = detect_outlier(eicu_df, columns_for_outlier_eicu)

In [None]:
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

# For every column of the outlier-filtered eICU frame: a 100-bin histogram,
# then its summary statistics printed underneath.
for col in eicu_outlier.columns:
    series = eicu_outlier[col]
    plt.figure()
    plt.xlabel(col)
    series.plot.hist(bins=100)
    plt.show()
    print(series.describe())

In [None]:
eicu_outlier.patientunitstayid.nunique(), mimic_outlier.patientunitstayid.nunique()

## Clipping

In [None]:
def check_in_range(df, lower=22, upper=45):
    """Clamp ``df['Temperature (C)']`` to ``[lower, upper]`` and return ``df``.

    The column is re-assigned on ``df`` itself, so the caller's frame is
    modified. Plain assignment is used instead of
    ``df[col].clip(..., inplace=True)``: that form mutates a temporary Series
    and leaves ``df`` unchanged once pandas Copy-on-Write is active.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Temperature (C)'`` column.
    lower, upper : number, default 22 and 45
        Inclusive bounds; values outside are replaced by the nearest bound.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df['Temperature (C)'] = df['Temperature (C)'].clip(lower, upper)
    return df

In [None]:
# Clip the temperature column of both outlier-filtered frames.
# check_in_range returns the frame it was given, so after this cell eicu_df /
# mimic_df are the same objects as eicu_outlier / mimic_outlier.
eicu_df = check_in_range(eicu_outlier)
mimic_df = check_in_range(mimic_outlier)

# Data preparation before normalizing

In [None]:
eicu_df.columns

### Normalizer for Embedding

In [None]:
categ_col = ['gender','sofa', 'sofa_wo_gcs']


In [None]:
# Columns kept for modelling, in the order they appear in the selected frames.
model_columns = [
    # identifiers / time
    'patientunitstayid', 'itemoffset',
    # demographics
    'gender', 'age', 'admissionheight', 'admissionweight',
    # vitals and labs
    'Heart Rate', 'O2 Saturation', 'glucose', 'Temperature (C)',
    'sodium', 'BUN', 'WBC x 1000', 'Hemoglobin', 'Platelets',
    'Potassium', 'Chloride', 'Bicarbonate', 'Creatinine',
    # scores, ventilation flag and infusion rates
    'sofa', 'sofa_wo_gcs', 'vent_flag',
    'rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
    'rate_phenylephrine',
    # CAM and label columns
    'CAM', 'labelrec', 'labelpt',
]

In [None]:
# Keep only the modelling columns. The explicit .copy() makes each result an
# independent frame: later cells assign to its columns, which on a bare slice
# triggers SettingWithCopyWarning and may not reliably update the frame.
eicu_df = eicu_outlier[model_columns].copy()
mimic_df = mimic_outlier[model_columns].copy()

In [None]:
emb = categ_col

In [None]:
eicu_df.shape,mimic_df.shape

In [None]:
# Cast the stay id and the categorical codes to int in one astype call per
# frame. This replaces attribute-style column assignment (`df.col = ...`) on a
# sliced frame, which emits SettingWithCopyWarning, with a rebind to a new frame.
# As before, astype(int) raises if any of these columns still contains NaN.
int_cols = ['patientunitstayid', 'sofa', 'sofa_wo_gcs', 'gender']

eicu_df = eicu_df.astype(dict.fromkeys(int_cols, int))
mimic_df = mimic_df.astype(dict.fromkeys(int_cols, int))

In [None]:
# Keep only rows with sofa <= 20 and sofa_wo_gcs <= 17 in both frames.
mimic_df = mimic_df[(mimic_df['sofa'] <= 20) & (mimic_df['sofa_wo_gcs'] <= 17)]
eicu_df = eicu_df[(eicu_df['sofa'] <= 20) & (eicu_df['sofa_wo_gcs'] <= 17)]

In [None]:
mimic_df[emb].describe().loc[['min','max']]

In [None]:
eicu_df[emb].describe().loc[['min','max']]

In [None]:
mimic_df.patientunitstayid.nunique()

In [None]:
# Shift the sofa and sofa_wo_gcs codes upwards so that (assuming non-negative
# scores) gender, sofa and sofa_wo_gcs fall into consecutive, non-overlapping
# integer ranges -- presumably indices into one shared embedding table; confirm.
# Statement order matters: sofamax must be read before 'sofa' is shifted.
# NOTE(review): the offsets come from this frame's own maxima and the MIMIC
# cell computes its own; if the maxima differ, the same raw score maps to
# different codes in the two datasets -- confirm this is intended.
# NOTE(review): not idempotent -- running the cell twice shifts the codes twice.
gemax = eicu_df.gender.max() 
sofamax = eicu_df['sofa'].max()+1
# sofawogcsmax is not read by any later line visible in this notebook.
sofawogcsmax = eicu_df['sofa_wo_gcs'].max()+2
# Self-assignment: gender codes are left unchanged.
eicu_df.gender = eicu_df.gender 
eicu_df['sofa'] = eicu_df['sofa'] +gemax+1
eicu_df['sofa_wo_gcs'] = eicu_df['sofa_wo_gcs'] +gemax+sofamax+1
In [None]:
# Same code shift as in the eICU cell, using the MIMIC frame's own maxima.
# Statement order matters: sofamax must be read before 'sofa' is shifted.
# NOTE(review): gemax / sofamax are recomputed from mimic_df here, so the
# offsets can differ from the eICU ones when the maxima differ -- confirm.
# NOTE(review): not idempotent -- running the cell twice shifts the codes twice.
gemax = mimic_df.gender.max() 
sofamax = mimic_df['sofa'].max()+1
# sofawogcsmax is not read by any later line visible in this notebook.
sofawogcsmax = mimic_df['sofa_wo_gcs'].max()+2
# Self-assignment: gender codes are left unchanged.
mimic_df.gender = mimic_df.gender 
mimic_df['sofa'] = mimic_df['sofa'] +gemax+1
mimic_df['sofa_wo_gcs'] = mimic_df['sofa_wo_gcs'] +gemax+sofamax+1

In [None]:
eicu_df[emb].describe().loc[['min','max']]

In [None]:
mimic_df[emb].describe().loc[['min','max']]

In [None]:
# eICU is used as the training set and MIMIC as the test set.
# These are plain name bindings (no copy): changes made through train_df /
# test_df are also visible through eicu_df / mimic_df.
train_df = eicu_df
test_df = mimic_df

In [None]:
train_df[train_df['CAM']==1]['patientunitstayid'].nunique(),test_df[test_df['CAM']==1]['patientunitstayid'].nunique()

### Normalizer

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Continuous columns that are rescaled to [0, 1].
norm_col = [
    'age', 'admissionheight', 'admissionweight',
    'Heart Rate', 'O2 Saturation', 'glucose', 'Temperature (C)',
    'sodium', 'BUN', 'WBC x 1000', 'Hemoglobin', 'Platelets',
    'Potassium', 'Chloride', 'Bicarbonate', 'Creatinine',
    'rate_dopamine', 'rate_epinephrine', 'rate_norepinephrine',
    'rate_phenylephrine',
]

# Learn the per-column min / max on the training frame only ...
scaler_minmax = MinMaxScaler(feature_range=(0, 1), copy=True)
scaler_minmax.fit(train_df[norm_col].values)

# ... then write the scaled values back into the training frame.
feat_train_minmax = scaler_minmax.transform(train_df[norm_col].values)
train_df[norm_col] = feat_train_minmax

In [None]:
scaler_minmax.data_max_

In [None]:
# Clamp every test column to the [min, max] range seen by the scaler during
# fitting, so the transformed test values stay within [0, 1].
# The clipped Series is assigned back explicitly: the previous
# `test_df[col].clip(a, b, inplace=True)` mutated a temporary Series, which
# leaves test_df unchanged once pandas Copy-on-Write is active.
for col, a, b in zip(norm_col, scaler_minmax.data_min_, scaler_minmax.data_max_):
    test_df[col] = test_df[col].clip(a, b)

# Apply the scaler fitted on the training frame to the test frame.
feat_test_minmax = scaler_minmax.transform(test_df[norm_col].values)
test_df[norm_col] = feat_test_minmax

In [None]:
test_df.columns

In [None]:
test_df.head()

In [None]:
train_df[train_df['patientunitstayid']==1064154]

## Save eICU and MIMIC data

In [51]:
# Replace the row labels of both frames with a fresh 0..n-1 index, discarding
# the old labels. Done in place, so any other name bound to the same frame
# object sees the new index as well.
train_df.reset_index(drop=True,inplace=True)
test_df.reset_index(drop=True,inplace=True)

In [52]:
train_df.to_csv("eicu_df_all_24los_normed.csv", index=False)
test_df.to_csv("mimic_df_all_24los_normed.csv", index=False)

In [None]:
test_df[test_df['CAM']==1]['patientunitstayid'].nunique()

In [None]:
train_df[train_df['CAM']==1]['patientunitstayid'].nunique()

In [None]:
test_df['patientunitstayid'].nunique()

In [None]:
train_df['patientunitstayid'].nunique()

In [74]:
# Split each frame into one sub-frame per stay, keeping the stay ids in the
# same order as the sub-frames.
tsg = test_df.groupby('patientunitstayid')
trg = train_df.groupby('patientunitstayid')

test_groups = list(tsg)
idts = [stay_id for stay_id, _ in test_groups]
test_np = [stay_frame for _, stay_frame in test_groups]

train_groups = list(trg)
idtr = [stay_id for stay_id, _ in train_groups]
train_np = [stay_frame for _, stay_frame in train_groups]

In [75]:
import sys
import numpy as np
# Column order expected by reader_deli: id, time, the three categorical codes,
# the remaining features, and CAM last.
columns_ord = ['patientunitstayid', 'itemoffset', 'gender','sofa', 'sofa_wo_gcs', 'age', 'admissionheight',
       'admissionweight', 'Heart Rate', 'O2 Saturation', 'glucose',
       'Temperature (C)', 'sodium', 'BUN', 'WBC x 1000', 'Hemoglobin',
       'Platelets', 'Potassium', 'Chloride', 'Bicarbonate', 'Creatinine',
        'vent_flag', 'rate_dopamine', 'rate_epinephrine',
       'rate_norepinephrine', 'rate_phenylephrine', 'CAM']


def _stack_or_object(items):
    """Return ``np.array(items)``, or a 1-D object array when ``items`` is ragged."""
    try:
        return np.array(items)
    except ValueError:
        # NumPy >= 1.24 refuses to build an array from sequences of different
        # lengths; store each element as-is in an object array instead.
        out = np.empty(len(items), dtype=object)
        for i, item in enumerate(items):
            out[i] = item
        return out


def reader_deli(df_list,verbose=1):
    """Turn a list of per-stay frames into arrays.

    Each frame is reordered to ``columns_ord`` and converted to a 2-D array,
    from which the following are taken:

    * stay id     -- column 0 of the first row
    * time        -- column 1 (``itemoffset``), all rows
    * categorical -- columns 2..4 (``gender``, ``sofa``, ``sofa_wo_gcs``)
    * continuous  -- columns 5..-2 (everything between the categorical columns
      and ``CAM``)
    * label       -- last column (``CAM``) of the first row

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        One non-empty frame per stay, each containing every column named in
        ``columns_ord`` (extra columns are ignored).
    verbose : int or bool, default 1
        When truthy, a progress message is written to stdout for each stay.

    Returns
    -------
    tuple
        ``(PID, X_cat, X_noncat, ts, nrows, deli)``. ``nrows`` is a list of
        row counts. ``X_cat``, ``X_noncat`` and ``ts`` are regular arrays when
        all stays have the same number of rows, otherwise 1-D object arrays
        holding one array per stay.
    """
    X_noncat, X_cat, deli, nrows, ts, PID = [], [], [], [], [], []
    nb_unit_stays = len(df_list)
    for i, df in enumerate(df_list):
        if verbose:
            sys.stdout.write('\rFeed StayID {0} of {1}...'.format(i+1, nb_unit_stays))
        # Select and order the columns in one step.
        narr = np.array(df[columns_ord])
        PID.append(narr[0, 0])
        ts.append(narr[:, 1])
        X_cat.append(narr[:, 2:5])
        X_noncat.append(narr[:, 5:-1])
        deli.append(narr[0, -1])
        nrows.append(narr.shape[0])
    PID = np.array(PID)
    deli = np.array(deli)
    # Stays may have different numbers of rows, so these can be ragged.
    X_cat = _stack_or_object(X_cat)
    X_noncat = _stack_or_object(X_noncat)
    ts = _stack_or_object(ts)
    return PID,X_cat,X_noncat,ts,nrows,deli

In [76]:
deli_path = data_processed_path 

In [None]:
PID,X_cat_ts_all,X_noncat_ts_all,ts_ts_all,nrows_ts_all,y_ts_all = reader_deli(test_np)

In [79]:
def pad(arr, max_len=24):
    """Zero-pad a 2-D array along axis 0 to exactly ``max_len`` rows.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of shape ``(n_rows, n_cols)`` with ``n_rows <= max_len``.
    max_len : int, default 24
        Number of rows of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(max_len, n_cols)``: the rows of ``arr``
        followed by rows of zeros.

    Raises
    ------
    ValueError
        If ``arr`` has more than ``max_len`` rows (previously surfaced as an
        opaque NumPy broadcasting error of the same type).
    """
    n_rows, n_cols = arr.shape[0], arr.shape[1]
    if n_rows > max_len:
        raise ValueError(
            'pad: input has {0} rows but max_len is {1}'.format(n_rows, max_len))
    tmp = np.zeros((max_len, n_cols))
    tmp[:n_rows, :] = arr
    return tmp

# Pad every MIMIC stay with pad() and stack the results into two arrays.
# The previous version iterated over X_noncat_ts / X_cat_ts and appended to
# tsnc / tsc, none of which is defined anywhere in this notebook (it only ran
# on leftover kernel state); the names below are the ones produced by the
# reader_deli(test_np) cell and by this cell itself.
xtsnc = np.array([pad(xn_ts) for xn_ts in X_noncat_ts_all])
xtsc = np.array([pad(xc_ts) for xc_ts in X_cat_ts_all])

# Persist ids, padded features, true row counts and labels in one archive.
np.savez(os.path.join(deli_path, 'mimic_pred_24los.npz'),
         PID=PID, X_cat=xtsc, X_noncat=xtsnc, nrows=nrows_ts_all, deli=y_ts_all)

In [None]:
test_file = np.load(os.path.join(deli_path, 'mimic_pred_24los.npz'))
test_file['PID'].shape,test_file['X_cat'].shape, test_file['X_noncat'].shape, test_file['deli'].shape,test_file['nrows'].shape

In [None]:
# Convert the per-stay eICU frames into arrays.
# Note: this rebinds PID, replacing the MIMIC ids returned by the earlier
# reader_deli(test_np) call.
PID,X_cat_tr_all,X_noncat_tr_all,ts_tr_all,nrows_tr_all,y_tr_all = reader_deli(train_np)

In [82]:
def pad(narr, max_len=24):
    """Zero-pad a 2-D array along axis 0 to exactly ``max_len`` rows.

    Parameters
    ----------
    narr : numpy.ndarray
        Array of shape ``(n_rows, n_cols)`` with ``n_rows <= max_len``.
    max_len : int, default 24
        Number of rows of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(max_len, n_cols)``: the rows of ``narr``
        followed by rows of zeros.

    Raises
    ------
    ValueError
        If ``narr`` has more than ``max_len`` rows (previously surfaced as an
        opaque NumPy broadcasting error of the same type).
    """
    n_rows, n_cols = narr.shape[0], narr.shape[1]
    if n_rows > max_len:
        raise ValueError(
            'pad: input has {0} rows but max_len is {1}'.format(n_rows, max_len))
    tmp = np.zeros((max_len, n_cols))
    tmp[:n_rows, :] = narr
    return tmp
# Pad every eICU stay with pad() and stack the results into two arrays.
trnc = np.array([pad(xn_tr) for xn_tr in X_noncat_tr_all])
trc = np.array([pad(xc_tr) for xc_tr in X_cat_tr_all])

# Persist ids, padded features, true row counts and labels in one archive.
np.savez(
    os.path.join(deli_path, 'eicu_pred_24los.npz'),
    PID=PID,
    X_cat=trc,
    X_noncat=trnc,
    nrows=nrows_tr_all,
    deli=y_tr_all,
)

In [None]:
# Reload the saved eICU archive and show the shape of every stored array.
# The PID shape is now read from train_file (it was read from test_file, a
# copy-paste slip), and the repeated 'deli' entry is dropped so the tuple
# mirrors the one displayed for the MIMIC archive.
train_file = np.load(os.path.join(deli_path, 'eicu_pred_24los.npz'))
train_file['PID'].shape, train_file['X_cat'].shape, train_file['X_noncat'].shape, train_file['deli'].shape, train_file['nrows'].shape