In [None]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings — consider narrowing the filter.
warnings.simplefilter('ignore')

# Show complete frames/series in cell output (no row or column truncation).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
#Load the one-hot-encoded dataframe; the cells below merge the values coming from the CV and MV databases

df = pd.read_csv(filepath_or_buffer='MIMIC_FINAL_WITH_ONE_HOT_ENCODING.csv')

In [None]:
#Calculate Oxygen Saturation
# Each merged column is the row-wise mean of its source columns
# (DataFrame.mean skips NaN by default, so a row with a single populated source keeps that value).

oxygen_sources = {
    'Oxygen_Saturation': ['O2 saturation pulseoxymetry', 'SpO2'],
    'Oxygen_Saturation_Alarm_High': ['O2 Saturation Pulseoxymetry Alarm - High', 'SpO2 Alarm [High]'],
    'Oxygen_Saturation_Alarm_Low': ['O2 Saturation Pulseoxymetry Alarm - Low', 'SpO2 Alarm [Low]'],
}
for merged_name, source_cols in oxygen_sources.items():
    df[merged_name] = df[source_cols].mean(axis=1)

In [None]:
#Calculate Blood Pressure
# Each merged column is the row-wise mean of its three source columns
# (DataFrame.mean skips NaN by default, so partially populated rows still get a value).

blood_pressure_sources = {
    'Arterial_Blood_Pressure_Systolic': [
        'Arterial BP [Systolic]', 'Arterial BP #2 [Systolic]', 'Arterial Blood Pressure systolic'],
    'Arterial_Blood_Pressure_Diastolic': [
        'Arterial BP [Diastolic]', 'Arterial BP #2 [Diastolic]', 'Arterial Blood Pressure diastolic'],
    'Arterial_Blood_Pressure_Mean': [
        'Arterial BP Mean', 'Arterial BP Mean #2', 'Arterial Blood Pressure mean'],
}
for merged_name, source_cols in blood_pressure_sources.items():
    df[merged_name] = df[source_cols].mean(axis=1)

In [None]:
#Drop redundant columns
# The columns are removed in three batches; a missing column name raises KeyError.

arterial_columns = [
    'ART Blood Pressure Alarm - High', 'ART Blood Pressure Alarm - Low',
    'Arterial BP #2 [Diastolic]', 'Arterial BP #2 [Systolic]',
    'Arterial BP Mean', 'Arterial BP Mean #2',
    'Arterial BP [Diastolic]', 'Arterial BP [Systolic]',
    'Arterial Blood Pressure Alarm - High', 'Arterial Blood Pressure Alarm - Low',
    'Arterial Blood Pressure diastolic', 'Arterial Blood Pressure mean',
    'Arterial Blood Pressure systolic', 'Arterial O2 Saturation', 'Arterial O2 pressure',
]

manual_bp_and_oxygen_columns = [
    'Manual BP Mean(calc)', 'Manual BP [Diastolic]', 'Manual BP [Systolic]',
    'Manual Blood Pressure Diastolic Left', 'Manual Blood Pressure Diastolic Right',
    'Manual Blood Pressure Systolic Left', 'Manual Blood Pressure Systolic Right',
    'O2 Saturation Pulseoxymetry Alarm - High', 'O2 Saturation Pulseoxymetry Alarm - Low',
    'O2 saturation pulseoxymetry', 'PAO2', 'SpO2', 'SpO2 Alarm [High]', 'SpO2 Alarm [Low]',
    'Temporary Pacemaker Rate',
]

admission_weight_columns = ['Admission Weight (Kg)', 'Admit Wt']

for columns in (arterial_columns, manual_bp_and_oxygen_columns, admission_weight_columns):
    df = df.drop(columns=columns)

In [None]:
#Drop redundant columns
# Remove the 'Weight Change' column from the working frame.

df = df.drop(columns=['Weight Change'])

In [None]:
#Have a numerical visualization for missing values in the initial dataframe
# Percentage of null entries per column.

df.isna().mean().mul(100)


In [None]:
#Start with flagging of height and daily weight respectively

def _first_per_subject_with_flag(frame, value_col, flag_col):
    """Return one row per SUBJECT_ID with a 0/1 flag telling whether `value_col` was ever recorded.

    GroupBy.first() yields the first non-null entry of each column within a group,
    so `value_col` is null in the result only when the subject has no recorded
    value at all. `flag_col` is 1 when a value exists and 0 when it is missing.

    The flag is computed directly with notnull() rather than by building the
    inverse flag and swapping 0/1 with DataFrame.replace over the whole frame:
    that swap also rewrote any 0/1 found in SUBJECT_ID, HADM_ID, GROUP_ID and the
    measurement column, which could corrupt the key used for the merge below.
    """
    per_subject = (
        frame[['SUBJECT_ID', 'HADM_ID', 'GROUP_ID', value_col]]
        .groupby('SUBJECT_ID', as_index=False)
        .first()
    )
    per_subject[flag_col] = per_subject[value_col].notnull() * 1
    return per_subject


# Per-subject height flag, broadcast back onto every row of that subject
df2 = _first_per_subject_with_flag(df, 'Height', 'Flag_Height')
df3 = pd.merge(df, df2[['SUBJECT_ID', 'Flag_Height']], on='SUBJECT_ID', how='left')

# Per-subject daily weight flag, broadcast back onto every row of that subject
df4 = _first_per_subject_with_flag(df, 'Daily Weight', 'Flag_Daily_Weight')
df5 = pd.merge(df3, df4[['SUBJECT_ID', 'Flag_Daily_Weight']], on='SUBJECT_ID', how='left')


#Create dataframe that contains these flagged values for future reference
df5.to_csv('MIMIC_FLAGGED_HEIGHT_WEIGHT.csv', index=False)

In [None]:
#Read the flagged dataframe for further analysis

df = pd.read_csv('MIMIC_FLAGGED_HEIGHT_WEIGHT.csv')

#Drop patients where no values are recorded for daily height and weight
# A boolean row mask is used instead of combining two Index objects with `|`:
# on recent pandas versions `|` between Index objects is an element-wise logical
# operation, not a set union, so the old expression no longer selects the right rows.
no_height_or_weight = (df['Flag_Height'] == 0) | (df['Flag_Daily_Weight'] == 0)

# .copy() gives df_new its own data, so later column assignments on df_new
# neither touch df nor trigger chained-assignment warnings.
df_new = df.loc[~no_height_or_weight].copy()

In [None]:
#Visualization after removing patients without daily weight and height
# Report on df_new (the filtered frame); df still holds the unfiltered rows.

df_new.isnull().mean() * 100

In [None]:
#Rename columns properly for machine learning process
# All four renames are applied in one call; names absent from the frame are ignored by rename().

df_new = df_new.rename(columns={
    'Daily Weight': 'Daily_Weight',
    'Heart Rate': 'Heart_Rate',
    'Heart Rate Alarm - Low': 'Heart_Rate_Alarm_Low',
    'Heart rate Alarm - High': 'Heart_Rate_Alarm_High',
})

In [None]:
#Fill height and daily weight (height has been filled both back and front but daily weight has been filled only in front)
# Fills never cross SUBJECT_ID boundaries. GroupBy.ffill / transform return a result
# indexed like df_new, so the assignment aligns row-for-row regardless of how
# groupby.apply labels its output in the installed pandas version.
# NOTE(review): forward/backward fill assumes rows are in chronological order within each subject — TODO confirm.

subject_groups = df_new.groupby('SUBJECT_ID')

df_new['Daily_Weight'] = subject_groups['Daily_Weight'].ffill()
# Height is taken from df_new (the filtered frame), not from the unfiltered df
df_new['Height'] = subject_groups['Height'].transform(lambda s: s.ffill().bfill())

In [None]:
#Visualization after filling techniques have been implemented
# Percentage of null entries per column of df_new.

df_new.isna().mean().mul(100)

In [None]:
#Forward fill remaining values based on Subject_ID
# GroupBy.ffill keeps df_new's index, so each filled column aligns row-for-row on
# assignment (groupby.apply with a lambda can return a group-keyed index instead).
# NOTE(review): assumes rows are in chronological order within each subject — TODO confirm.

blood_pressure_columns = [
    'Arterial_Blood_Pressure_Systolic',
    'Arterial_Blood_Pressure_Diastolic',
    'Arterial_Blood_Pressure_Mean',
]
for bp_col in blood_pressure_columns:
    df_new[bp_col] = df_new.groupby('SUBJECT_ID')[bp_col].ffill()

In [None]:
#Visualization after filling techniques have been implemented
# Share of null entries per column of df_new, expressed in percent.

100 * df_new.isna().mean()

In [None]:
#Create Flags for Oxygen Saturation Alarms
# Flag is 1 where the alarm value is missing and 0 where it is recorded.
# NOTE(review): the height, daily weight and heart rate flags in this notebook end up
# as 1 = recorded; these two keep the opposite polarity — confirm this is intended.

for alarm_col in ('Oxygen_Saturation_Alarm_High', 'Oxygen_Saturation_Alarm_Low'):
    df_new['Flag_' + alarm_col] = df_new[alarm_col].isna().astype('int64')

In [None]:
#Create Flags for Heart Rate Alarms
# At this point the flag is 1 where the alarm value is missing and 0 where it is recorded.

for alarm_col in ('Heart_Rate_Alarm_Low', 'Heart_Rate_Alarm_High'):
    df_new['Flag_' + alarm_col] = df_new[alarm_col].isna().astype('int64')

In [None]:
#Make the heart rate alarm flags 1 when an alarm value is recorded and 0 when it is missing
# The flags are derived directly from the alarm columns rather than by swapping 0/1
# in the existing flag columns, so re-running this cell cannot flip them back.
df_new['Flag_Heart_Rate_Alarm_Low'] = df_new['Heart_Rate_Alarm_Low'].notnull()*1
df_new['Flag_Heart_Rate_Alarm_High'] = df_new['Heart_Rate_Alarm_High'].notnull()*1

In [None]:
#Create BMI column with daily weight and height
# BMI = weight / (height in metres)^2.
# NOTE(review): assumes Daily_Weight is in kg and Height is in inches — TODO confirm
# against the source item definitions.

# Exact inch-to-centimetre factor; the previous 2.5 overstated every BMI by about 3%.
INCHES_TO_CM = 2.54

height_m = df_new['Height'] * INCHES_TO_CM / 100
df_new['BMI'] = df_new['Daily_Weight'] / height_m ** 2

In [None]:
#Convert column names to upper case for better readability

df_new.columns = [column_name.upper() for column_name in df_new.columns]

In [None]:
#Final visualization after filling techniques have been implemented
# Per-column percentage of null entries in df_new.

df_new.isna().mean().mul(100)

In [None]:
#Arrange the order of columns
# Only the columns listed here are kept; a missing name raises KeyError.

final_column_order = [
    # identifiers and timing
    'SUBJECT_ID', 'HADM_ID', 'GROUP_ID', 'REL_DAY',
    # demographics
    'AGE', 'BMI', 'GENDER_M',
    'MARITAL_STATUS_DIVORCED', 'MARITAL_STATUS_LIFE PARTNER',
    'MARITAL_STATUS_MARRIED', 'MARITAL_STATUS_SEPARATED',
    'MARITAL_STATUS_SINGLE', 'MARITAL_STATUS_UNKNOWN (DEFAULT)',
    'MARITAL_STATUS_WIDOWED',
    # heart rate
    'HEART_RATE', 'FLAG_HEART_RATE_ALARM_LOW', 'FLAG_HEART_RATE_ALARM_HIGH',
    # oxygen saturation
    'OXYGEN_SATURATION', 'FLAG_OXYGEN_SATURATION_ALARM_HIGH',
    'FLAG_OXYGEN_SATURATION_ALARM_LOW',
    # arterial blood pressure
    'ARTERIAL_BLOOD_PRESSURE_SYSTOLIC', 'ARTERIAL_BLOOD_PRESSURE_DIASTOLIC',
    'ARTERIAL_BLOOD_PRESSURE_MEAN',
    # label
    'DOD_LABEL',
]
df_new = df_new.loc[:, final_column_order]

In [None]:
#Save dataframe after missing values have been filled with filling techniques
# The row index is not written to the file.

df_new.to_csv(path_or_buf='MIMIC_FINAL_WITH_MISSING_VALUES.csv', index=False)