In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> ## 1. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import missingno

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
import PIL
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
import keras_tuner as kt

> ## 2.Exploratory Data Analysis

In [None]:
# Read the dataset CSV from the Kaggle input directory into `df`,
# then display its first five rows.
df=pd.read_csv('/kaggle/input/psp-dataset/Patient Survival Detection/Dataset.csv')
df.head(5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count; verbose=True lists every column.
df.info(verbose=True,show_counts=True)

In [None]:
# Summary statistics for the non-object columns, transposed so that each
# column of `df` becomes one row of the summary.
df.describe(exclude=object).T

 #### Handling Missing values

In [None]:
def missing_values(df, threshold=50):
    """Return the columns of ``df`` whose share of missing values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 50
        Percentage (0-100) of null entries a column must strictly exceed
        to be reported.

    Returns
    -------
    pandas.DataFrame
        One row per reported column (the index holds the column names) with a
        single ``'% Missing'`` column giving the percentage of null entries.
    """
    missing_df=pd.DataFrame(100*df.isnull().sum()/df.shape[0],columns=['% Missing'])
    return missing_df[missing_df['% Missing']>threshold]
# Compute the high-missing columns once and report how many there are and
# what they are called.
high_missing_cols = missing_values(df).index
print(f"{len(high_missing_cols)} are the columns having missng % greater than 50")
print()
print("THESE BELOW ARE THE COLUMNS NAMES WHICH CAN BE DROPPED")
print(f"{list(high_missing_cols)}")

<div class="alert alert-danger" style="font-size:14px; font-family:verdana; line-height:  1.7em;"> 

#### Dropping the columns that have more than 50% missing values, together with the columns listed below, as they do not affect the model
##### 'encounter_id', 'hospital_admit_source','icu_admit_source','icu_id','icu_stay_type', 'patient_id', 'hospital_id', 'readmission_status'.

In [None]:
# Columns that are dropped regardless of how complete they are.
irrevlant_cols_to_drop=['encounter_id','hospital_admit_source','icu_admit_source','icu_id','icu_stay_type','patient_id',
            'hospital_id', 'readmission_status']
# Also drop every column reported by missing_values (more than 50% missing).
# errors='ignore' keeps the cell re-runnable: on a second execution the
# listed columns are already gone and a plain drop would raise KeyError.
df=df.drop(columns=irrevlant_cols_to_drop+list(missing_values(df).index),errors='ignore')

#### Body mass index (BMI) is a person’s weight in kilograms divided by the square of height in meters.
* In our data features bmi,height & weight are inter-related.
* So rows with a missing value in any of these inter-related columns are removed.

In [None]:
# Keep only the rows in which bmi, weight and height are all present.
df = df.dropna(subset=['bmi', 'weight', 'height'])

> #### Target/Label Column

In [None]:
def plot_count(df, col,title_name):
    """Plot the value counts of ``df[col]`` as a two-ring donut and a horizontal bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col``.
    col : str
        Column whose value counts are plotted.
    title_name : str
        Title shown above both panels.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    s1 = df[col].value_counts()
    N = len(s1)

    outer_sizes = s1
    inner_sizes = s1/N

    outer_colors = ['#9E3F00', '#eb5e00', '#ff781f']
    inner_colors = ['#ff6905', '#ff8838', '#ffa66b']

    # rc_context applies the background colour only while this figure is
    # built and restores the caller's rcParams afterwards, even if a
    # plotting call raises.
    with plt.rc_context({'figure.facecolor': '#FFFAF0'}):
        f, ax = plt.subplots(1, 2, figsize=(12, 6))
        plt.subplots_adjust(wspace=0.1)

        # Outer ring: raw counts, labelled with the category values.
        ax[0].pie(
            outer_sizes,colors=outer_colors,
            labels=s1.index.tolist(),
            startangle=90, frame=True, radius=1.4,
            explode=([0.05]*(N-1) + [.3]),
            wedgeprops={'linewidth' : 1, 'edgecolor' : 'white'},
            textprops={'fontsize': 12, 'weight': 'bold'}
        )

        textprops = {
            'size': 13,
            'weight': 'bold',
            'color': 'white'
        }

        # Inner ring: same proportions, annotated with percentages.
        ax[0].pie(
            inner_sizes, colors=inner_colors,
            radius=1, startangle=90,
            autopct='%1.f%%', explode=([.1]*(N-1) + [.3]),
            pctdistance=0.8, textprops=textprops
        )

        # White disc over the centre turns the pies into a donut.
        center_circle = plt.Circle((0,0), .68, color='black', fc='white', linewidth=0)
        ax[0].add_artist(center_circle)

        x = s1
        y = s1.index.tolist()
        sns.barplot(
            x=x, y=y, ax=ax[1],
            palette='YlOrBr_r', orient='horizontal'
        )

        ax[1].spines['top'].set_visible(False)
        ax[1].spines['right'].set_visible(False)
        ax[1].tick_params(
            axis='x',
            which='both',
            bottom=False,
            labelbottom=False
        )

        # Write each count next to the end of its bar.
        for i, v in enumerate(s1):
            ax[1].text(v, i+0.1, str(v), color='black', fontweight='bold', fontsize=12)

        plt.setp(ax[1].get_yticklabels(), fontweight="bold")
        plt.setp(ax[1].get_xticklabels(), fontweight="bold")
        # The bars are horizontal: counts run along x, categories along y.
        ax[1].set_xlabel('count', fontweight="bold", color='black')
        ax[1].set_ylabel(col, fontweight="bold", color='black')

        f.suptitle(f'{title_name}', fontsize=18, fontweight='bold')
        plt.tight_layout()
        plt.show()

In [None]:
# Distribution of the target column.
plot_count(df, col='hospital_death', title_name='Hospital_death Distribution')

#### Types of ICUs

In [None]:
icu_type = df['icu_type'].value_counts()

# Donut chart of the share of each ICU type.
fig, ax = plt.subplots(figsize=(20, 10))
ax.pie(icu_type.values, labels=icu_type.index,
       startangle=50, autopct='%1.1f%%')
# White disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.7, fc='white')
ax.add_artist(centre_circle)
ax.set_title('TYPES OF ICUs', fontdict={
             'fontname': 'Monospace', 'fontsize': 30, 'fontweight': 'bold'})
ax.axis('equal')
ax.legend(prop={'size': 15}, loc='upper left')
plt.show()

#### Hospital Death By Gender

In [None]:
# Row counts per (gender, hospital_death) combination. The variable name
# mentions ethnicity, but the grouping is by gender.
Hospital_death_by_ethinicity = df[['gender', 'hospital_death']].value_counts()

fig, ax = plt.subplots(figsize=(20, 10))
ax.pie(Hospital_death_by_ethinicity.values, labels=Hospital_death_by_ethinicity.index,
       startangle=50, autopct='%1.1f%%')
# White disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.7, fc='white')
ax.add_artist(centre_circle)
ax.set_title('Death By Gender', fontdict={
             'fontname': 'Monospace', 'fontsize': 30, 'fontweight': 'bold'})
ax.axis('equal')
ax.legend(prop={'size': 15}, loc='upper left')
plt.show()

In [None]:
# Number of rows for each distinct (bmi, age, gender) combination.
df[['bmi','age','gender']].value_counts()

#### Hospital_Death by Ethnicity 

In [None]:
# Relabel the target: 0 -> 'Survived', anything else -> 'Not Survived'.
survival_label = {True: 'Survived', False: 'Not Survived'}
new = df['hospital_death'].eq(0).map(survival_label)
sns.countplot(x=new, hue=df['ethnicity'])
plt.show()

#### Death Rate in different type of ICUs

In [None]:
ICU_type = df[['icu_type', 'age', 'hospital_death']]

# For every (icu_type, age) group: mean of hospital_death and the number of
# rows behind that mean, computed in a single aggregation.
ICU_df = (
    ICU_type
    .groupby(['icu_type', 'age'])['hospital_death']
    .agg(hospital_death='mean', count='count')
    .reset_index()
)
sns.scatterplot(data=ICU_df, x="age", y="hospital_death", size="count", hue="icu_type")
plt.show()

#### Density Distribution plot of numerical columns

In [None]:
# Columns summarised by describe(); the first one is kept as the id column
# and the rest are stacked into long (variable, value) form.
described_cols = df.describe().columns
unpivot = pd.melt(df, id_vars=described_cols[0], value_vars=described_cols[1:])
# One KDE panel per variable, three panels per row, independent axes.
g = sns.FacetGrid(unpivot, col="variable", col_wrap=3, sharex=False, sharey=False)
g.map(sns.kdeplot, "value")
plt.show()

<div style="border-radius:10px; border:#DEB887 solid; padding: 15px; background-color:#FFFAF0; font-size:100%; text-align:left">
<h3 align="left"><font color='#DEB887'>💡 Observations:</font></h3>
    
* Data is highly Imbalanced- 91%(survived) and 9% (Not survived).
* Deaths by gender - about 4% of all patients are females who did not survive and about 4.5% are males who did not survive.
* The Med-Surg ICU accounts for more than 50% of the dataset.
* Caucasian patients are the majority among both the survivors and the non-survivors.
* Some of the ICU wards have higher death probability pertaining to being surgical wards.
* Most of the numerical columns are normally distributed.  

> ## 3. Data Preprocessing

In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
num_columns = list(df.select_dtypes(exclude=object).columns)
print("Number of numerical columns",len(num_columns))
cat_columns = list(df.select_dtypes(include=object).columns)
print("Number of categorical Columns",len(cat_columns))

#### Imputing missing values



<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height:  1.7em;">

 Null values in every column, categorical and numerical alike, are replaced by that column's most frequent value (**Mode**).

In [None]:
# Number of missing values per column before imputation.
df.isna().sum()

In [None]:
# A single imputer with strategy='most_frequent' is fitted on the whole
# frame, so every column is filled with its own most frequent value.
# NOTE(review): the imputer is fitted before the train/test split that
# happens further down — confirm this is acceptable for the evaluation.
imputer=SimpleImputer(strategy='most_frequent')
# Slice-assignment writes the imputed values back into the existing frame.
# NOTE(review): the resulting column dtypes depend on the pandas version —
# verify with df.dtypes if later steps rely on them.
df.iloc[:,:]=imputer.fit_transform(df)

# Missing values per column after imputation.
df.isna().sum()

In [None]:
# First ten rows of the frame after imputation.
df.head(10)

### One hot encoding 

In [None]:
# Sub-frame holding only the categorical (object-dtype) columns, previewed
# before encoding.
cat_df=df[cat_columns]
cat_df.head()

In [None]:
# One-hot encode the categorical columns. Each dummy is named
# 'isIN_<source column>_<category>': with one shared 'isIN' prefix, two
# columns that contain the same category label would produce duplicate
# column names. dtype=int emits 0/1 integers directly.
df_encoded=pd.get_dummies(df,columns=cat_columns,drop_first=False,
                          prefix={col:f'isIN_{col}' for col in cat_columns},
                          prefix_sep='_',dtype=int)
df_encoded=df_encoded.reset_index(drop=True)
# Convert any column that still has object dtype (e.g. after the
# whole-frame imputation) back to a numeric dtype. This replaces the
# elementwise applymap, which did the same conversion implicitly with one
# Python call per cell and is deprecated in recent pandas.
df_encoded=df_encoded.apply(pd.to_numeric)

In [None]:
# First rows of the one-hot encoded frame.
df_encoded.head()

In [None]:
# Predictors: every column except the target. Target: hospital_death.
X = df_encoded.drop(columns=['hospital_death'])
y = df_encoded.loc[:, 'hospital_death']

### Splitting the data and min-max scaling the train and test sets

In [None]:
# Hold out 20% of the rows. The target is heavily imbalanced, so the split
# is stratified on y to give both parts the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=99, stratify=y)
print("Shape of training",X_train.shape)
print("Shape of test",X_test.shape)


# Fit the min-max scaler on the training part only, then apply the same
# scaling to the test part.
scaler = MinMaxScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames. The original row labels are
# kept so the frames stay aligned with y_train / y_test.
X_train_new = pd.DataFrame(X_train_std, columns= X_train.columns, index=X_train.index)
X_test_new = pd.DataFrame(X_test_std, columns= X_test.columns, index=X_test.index)

In [None]:
# First rows of the scaled test features.
X_test_new.head()

In [None]:
# Number of feature columns; used as the input width of the networks below.
X_train_new.shape[-1]

> ## 4.Model Building

### Model-1:Creating the Basic Deep Learning Model 

In [None]:
# Baseline network: three ReLU hidden layers (64 -> 32 -> 16 units) and a
# single sigmoid output unit for the binary target.
model = Sequential([
    Dense(64, input_dim=X_train_new.shape[-1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# printing the model summary just to get an idea about how many paramters are required for the current model
model.summary()

In [None]:
# Metrics tracked during training. The AUC metric is named explicitly:
# left unnamed, Keras auto-numbers it (auc, auc_1, ...) according to how
# many AUC objects were created earlier in the session, so the history key
# read by the plotting cell would change whenever cells are re-run.
metrics = [
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(curve='ROC', name='auc')
]

# Compile with the Adam optimiser (learning rate 0.01) and binary
# cross-entropy loss.
model.compile(optimizer=keras.optimizers.Adam(0.01),
              loss='binary_crossentropy',
              metrics=metrics)

In [None]:
# Train the baseline model; the held-out split is used as validation data.
epochs, batch_size = 50, 32
history = model.fit(
    x=X_train_new.values,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_new.values, y_test),
)

In [None]:
# Find the AUC series by prefix instead of a fixed key: unnamed Keras
# metrics are auto-numbered (auc, auc_1, ...), so the exact key depends on
# how many AUC objects were created earlier in the session.
auc_key = next(key for key in history.history if key.startswith('auc'))
acc = history.history[auc_key]
val_acc = history.history['val_' + auc_key]

loss = history.history['loss']
val_loss = history.history['val_loss']

# Use the number of recorded epochs rather than the `epochs` global, so the
# x axis always matches the history length.
epochs_range = range(len(loss))

plt.figure(figsize=(15,6))
# Left panel: AUC per epoch.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training AUC Score')
plt.plot(epochs_range, val_acc, label='Validation AUC Score')
plt.legend(loc='lower right')
plt.title('Training and Validation AUC Score')

# Right panel: loss per epoch.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

#### Conclusion from Model 1:
- There is a large fluctuation in the validation loss using this model.
- The recall evaluation metric is also not giving very good results

### Model 2: Dealing with Class Imbalance
- Class Imbalance is a major issue when dealing with classification problem since one class dominates the other class with a large number.
- Keras can be used to deal with class imbalance problem by specifying the weights to the classes.
- Class with lesser number of samples can be given more weightage as compared to dominant class.

In [None]:
# Same architecture as the baseline: 64 -> 32 -> 16 ReLU units followed by
# one sigmoid output unit.
model = Sequential([
    Dense(64, input_dim=X_train_new.shape[-1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# The AUC metric is named 'auc_1' explicitly because that is the history
# key the plotting cell for this model reads. Left unnamed, the key depends
# on how many AUC objects were created earlier in the session.
metrics = [
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(curve='ROC', name='auc_1')
]

# Compile with the Adam optimiser (default learning rate) and binary
# cross-entropy loss.
model.compile(optimizer=keras.optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=metrics)

In [None]:
# Per-class loss weights: class 1 gets a weight of 5 and class 0 a weight
# of 0.5, i.e. class 1 is weighted ten times as heavily as class 0.
class_weight = {0: 0.5, 1: 5}

In [None]:
# Train the model with the class weights defined above. Without the
# class_weight argument this run would be identical in setup to an
# unweighted fit and the imbalance handling would have no effect.
epochs = 50
batch_size = 32
history = model.fit(X_train_new.values,y_train,
                    validation_data = (X_test_new.values, y_test),
                    epochs=epochs,
                    batch_size=batch_size,
                    class_weight=class_weight)


In [None]:
# Find the AUC series by prefix instead of a fixed key: unnamed Keras
# metrics are auto-numbered (auc, auc_1, ...), so the exact key depends on
# how many AUC objects were created earlier in the session.
auc_key = next(key for key in history.history if key.startswith('auc'))
acc = history.history[auc_key]
val_acc = history.history['val_' + auc_key]

loss = history.history['loss']
val_loss = history.history['val_loss']

# Use the number of recorded epochs rather than the `epochs` global, so the
# x axis always matches the history length.
epochs_range = range(len(loss))

plt.figure(figsize=(15,6))
# Left panel: AUC per epoch.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training AUC Score')
plt.plot(epochs_range, val_acc, label='Validation AUC Score')
plt.legend(loc='lower right')
plt.title('Training and Validation AUC Score')

# Right panel: loss per epoch.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

#### Conclusion from model 2: 
- The recall score has improved compared to the previous model.
- The model seems more stable with acceptable AUC score.  

### Model 3: Using Keras Tuner
- With Keras Tuner, hyperparameter tuning can be done for deep learning models as well.
- In this case it is always advisable to focus on loss values of both the training and validation data since a model is considered more stable when the loss decreases with each epoch

In [None]:
def model_builder(hp):
    """Build and compile one candidate network for the Keras Tuner search.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Hyperparameter container supplied by the tuner. Two values are
        sampled from it: ``'units'`` and ``'learning_rate'``.

    Returns
    -------
    keras.Model
        Compiled sequential model: two ReLU hidden layers that share the
        sampled width, followed by a single sigmoid output unit.
    """
    # Take the input width from the training matrix instead of a
    # hard-coded feature count, so the builder keeps working when the
    # preprocessing changes the number of columns.
    n_features = X_train_new.shape[-1]

    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=[n_features]))

    # Tune the number of units, 32 to 512 in steps of 32. Both hidden
    # layers use the same sampled value.
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # Tune the learning rate for the optimizer: 0.01, 0.001 or 0.0001.
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    metrics = [
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
        keras.metrics.AUC(curve='ROC')]

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                    loss=keras.losses.BinaryCrossentropy(),
                    metrics=metrics)

    return model

In [None]:
# Random search over the hyperparameters declared in model_builder:
# at most 5 trials, compared on validation loss.
# NOTE(review): no directory, project_name, overwrite or seed is passed, so
# the KerasTuner defaults apply — confirm that results from an earlier run
# are not silently reused when this cell is re-executed.
tuner = kt.RandomSearch(
    model_builder,
    objective='val_loss',
    max_trials=5)

In [None]:
# Stop a trial once validation recall has not improved for 5 epochs.
# mode='max' is set explicitly because recall is a higher-is-better metric;
# with the default mode='auto', tf.keras 2.x treats any monitor name that
# does not end in 'acc', 'accuracy' or 'auc' as a quantity to minimise.
stop_early = keras.callbacks.EarlyStopping(monitor='val_recall', mode='max', patience=5)

In [None]:
# Run the search: each trial trains for up to 30 epochs with the
# early-stopping callback.
# NOTE(review): the held-out test split is passed as validation data here,
# so it also drives the hyperparameter choice — confirm this is intended.
tuner.search(X_train_new.values, y_train, epochs=30, validation_data = (X_test_new.values, y_test), 
             callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the selected number of units and learning rate.
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

### Final Model after using Keras Tuner with optimal learning rate and number of units

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height:  1.7em;">
    
#### The number of epochs is also reduced to 15 to avoid overfitting

In [None]:
# Build the final model from the hyperparameters the tuner selected rather
# than from values copied by hand out of an earlier run, so this cell stays
# consistent with whatever the search above returned.
best_units = best_hps.get('units')
best_learning_rate = best_hps.get('learning_rate')

model = Sequential()
model.add(Dense(best_units, input_dim=X_train_new.shape[-1], activation='relu'))
model.add(Dense(best_units, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# The AUC metric is named 'auc_2' explicitly because that is the history
# key the plotting cell for this model reads. Left unnamed, the key depends
# on how many AUC objects were created earlier in the session.
metrics = [
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(curve='ROC', name='auc_2')]

# Compile with the Adam optimiser at the tuned learning rate and binary
# cross-entropy loss.
model.compile(optimizer=keras.optimizers.Adam(best_learning_rate),
              loss='binary_crossentropy',
              metrics=metrics)
# Train for 15 epochs; the held-out split is used as validation data.
epochs = 15
batch_size = 32
history = model.fit(X_train_new.values,y_train,
                    validation_data = (X_test_new.values, y_test),
                    epochs=epochs,
                    batch_size=batch_size)

In [None]:
# Find the AUC series by prefix instead of a fixed key: unnamed Keras
# metrics are auto-numbered (auc, auc_1, ...), and the models built during
# the tuner search also create AUC objects, so a hard-coded key is not
# reliable here.
auc_key = next(key for key in history.history if key.startswith('auc'))
acc = history.history[auc_key]
val_acc = history.history['val_' + auc_key]

loss = history.history['loss']
val_loss = history.history['val_loss']

# Use the number of recorded epochs rather than the `epochs` global, so the
# x axis always matches the history length.
epochs_range = range(len(loss))

plt.figure(figsize=(15,6))
# Left panel: AUC per epoch.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training AUC Score')
plt.plot(epochs_range, val_acc, label='Validation AUC Score')
plt.legend(loc='lower right')
plt.title('Training and Validation AUC Score')

# Right panel: loss per epoch.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

<div style="border-radius:10px; border:#DEB887 solid; padding: 15px; background-color:#FFFAF0; font-size:100%; text-align:left">
<h3 align="left"><font color='#DEB887'>💡 Conclusion from the Final Model:</font></h3>
    
- The validation loss has decreased significantly.
- The precision and recall metrics are improved to some extent.
- The model is more generalized
- The learning rate and the number of units were chosen by the Keras Tuner search rather than picked arbitrarily.