In [None]:
# Final EDA

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Display every row and every column when a frame is rendered (no truncation).
pd.set_option("display.max_rows", None, "display.max_columns", None)
from scipy import stats
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation notices.
warnings.filterwarnings('ignore')
from numpy import percentile
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Read the dataset that still contains missing values; it is cleaned in the cells below
df = pd.read_csv(filepath_or_buffer='MIMIC_FINAL_WITH_MISSING_VALUES.csv')

In [None]:
# Percentage of missing values per column before imputation, to check that
# missing entries stay below 30% of the values in each column
missing_fraction = df.isna().mean()
missing_fraction * 100

In [None]:
# Treat +/- infinity as missing so the imputer fills those cells as well
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [None]:
#Iterative Imputation

# SUBJECT_ID / HADM_ID are join keys (they are used further down to merge with
# ADMISSIONS.csv), not measurements. Feeding arbitrary identifiers to a
# regression-based imputer as predictors only adds noise, so leave them out.
# NOTE(review): if an outcome column (e.g. DOD_LABEL, plotted later) lives in
# this frame it is still imputed and used as a predictor - confirm that is intended.
id_columns = ['SUBJECT_ID', 'HADM_ID']
feature_columns = [col for col in df.columns if col not in id_columns]

# random_state only matters for the stochastic options of IterativeImputer
# (posterior sampling, random order, nearest-feature subsampling); it is pinned
# so the cell stays reproducible if any of those are switched on later.
imputer = IterativeImputer(max_iter=10, random_state=0)

# fit_transform fits and imputes in one pass; .loc writes the values back into
# the existing columns so column order and the index are untouched.
df.loc[:, feature_columns] = imputer.fit_transform(df[feature_columns])

In [None]:
# Per-column percentage of missing values after imputation
remaining_missing = df.isna().mean()
remaining_missing * 100

In [None]:
# Remove outliers from BMI
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

Q1, Q3 = df['BMI'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR

df = df[df['BMI'].between(lower_limit, upper_limit, inclusive='neither')]

In [None]:
# BMI distribution of the rows kept by the outlier filter
sns.histplot(df, x='BMI')

In [None]:
# Visualize heart rate

sns.histplot(df, x='HEART_RATE')

In [None]:
# Remove outliers from Heart Rate
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

Q1, Q3 = df['HEART_RATE'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR

df = df[df['HEART_RATE'].between(lower_limit, upper_limit, inclusive='neither')]

In [None]:
# Heart rate distribution of the rows kept by the outlier filter
sns.histplot(df, x='HEART_RATE')

In [None]:
# Visualize Oxygen Saturation

sns.histplot(df, x='OXYGEN_SATURATION')

In [None]:
# Remove outliers from Oxygen Saturation
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

Q1, Q3 = df['OXYGEN_SATURATION'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR

df = df[df['OXYGEN_SATURATION'].between(lower_limit, upper_limit, inclusive='neither')]

In [None]:
# Oxygen saturation distribution of the rows kept by the outlier filter
sns.histplot(df, x='OXYGEN_SATURATION')

In [None]:
# Visualize systolic arterial blood pressure

sns.histplot(df, x='ARTERIAL_BLOOD_PRESSURE_SYSTOLIC')

In [None]:
# Remove outliers from Arterial Blood Pressure Systolic
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

Q1, Q3 = df['ARTERIAL_BLOOD_PRESSURE_SYSTOLIC'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR

systolic = df['ARTERIAL_BLOOD_PRESSURE_SYSTOLIC']
df = df[systolic.between(lower_limit, upper_limit, inclusive='neither')]

In [None]:
# Visualize Arterial Blood Pressure Systolic (this cell only plots; it removes nothing)

sns.histplot(data=df, x=df['ARTERIAL_BLOOD_PRESSURE_SYSTOLIC'])

In [None]:
# Remove outliers from Arterial Blood Pressure Diastolic
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

Q1, Q3 = df['ARTERIAL_BLOOD_PRESSURE_DIASTOLIC'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR

diastolic = df['ARTERIAL_BLOOD_PRESSURE_DIASTOLIC']
df = df[diastolic.between(lower_limit, upper_limit, inclusive='neither')]

In [None]:
# Diastolic pressure distribution of the rows kept by the outlier filter
sns.histplot(df, x='ARTERIAL_BLOOD_PRESSURE_DIASTOLIC')

In [None]:
# Visualize Age

sns.histplot(df, x='AGE')

In [None]:
# Visualize marital status

# Attach the ADMISSIONS.csv columns to each row, matching on patient id and admission id
df2 = pd.read_csv('ADMISSIONS.csv')
new_df = df.merge(df2, how='left', on=['SUBJECT_ID', 'HADM_ID'])

plt.figure(figsize=(15,6))
sns.histplot(new_df, x="MARITAL_STATUS")
plt.xlabel("Marital Status")
plt.ylabel("Count")

In [None]:
# Visualize gender: attach the PATIENTS.csv columns, matching on patient id
df3 = pd.read_csv('PATIENTS.csv')
new_df = new_df.merge(df3, how='left', on=['SUBJECT_ID'])
sns.histplot(new_df, x='GENDER')

In [None]:
# Visualize death label

sns.histplot(new_df, x='DOD_LABEL')

In [None]:
# Check correlation matrix

corr = df.corr()
# Mask the upper triangle (diagonal included) so each pair of columns is drawn once
mask = np.triu(np.ones_like(corr, dtype=bool))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(10, 10))
    ax = sns.heatmap(corr, mask=mask, vmax=1, square=True, ax=ax)

In [None]:
# Write the imputed, outlier-filtered frame to disk (row index is not written)
df.to_csv(path_or_buf='MIMIC_WITHOUT_OUTLIERS_ITERATIVE_IMPUTATION.csv', index=False)