### Module Importation and DataFrame Setup

In [1]:
# Reset the model for re-run
from tensorflow.keras import backend as K
K.clear_session()

# Import dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import keras_tuner as kt
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Read the CSV file from the Dataset folder into a Pandas DataFrame
dfSurvivals = pd.read_csv(
    Path("../CSV_OUTPUT/Base_Cleaned_DS_CSV.csv")
)

In [3]:
# Review the DataFrame
dfSurvivals.head()

Unnamed: 0,Passenger Id,Title,First Name,Last Name,Sex,Age,Sibling/Spouse Aboard,Parent/Children Aboard,Passenger Class,Fare,Embarkation Port,Survival Boat,Body Number,Survived,age_group,family_size
0,1,Miss,Elisabeth Walton,Allen,female,29.0,0,0,1,211.3375,S,2,S,1,Adults,0
1,2,Master,Hudson Trevor,Allison,male,0.9167,1,2,1,151.55,S,11,S,1,Children,3
2,3,Miss,Helen Loraine,Allison,female,2.0,1,2,1,151.55,S,DNS,BNR,0,Children,3
3,4,Mr,Hudson Joshua Creighton,Allison,male,30.0,1,2,1,151.55,S,DNS,135,0,Adults,3
4,5,Mrs,Hudson J C (Bessie Waldo Daniels),Allison,female,25.0,1,2,1,151.55,S,DNS,BNR,0,Adults,3


In [4]:
# Print DataFrame information to ensure no null values and check data types
dfSurvivals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308 entries, 0 to 1307
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Passenger Id            1308 non-null   int64  
 1   Title                   1308 non-null   object 
 2   First Name              1308 non-null   object 
 3   Last Name               1308 non-null   object 
 4   Sex                     1308 non-null   object 
 5   Age                     1308 non-null   float64
 6   Sibling/Spouse Aboard   1308 non-null   int64  
 7   Parent/Children Aboard  1308 non-null   int64  
 8   Passenger Class         1308 non-null   int64  
 9   Fare                    1308 non-null   float64
 10  Embarkation Port        1308 non-null   object 
 11  Survival Boat           1308 non-null   object 
 12  Body Number             1308 non-null   object 
 13  Survived                1308 non-null   int64  
 14  age_group               1308 non-null   

In [5]:
# Preprocessing for features which are currently objects
dfSurvivals = pd.get_dummies(dfSurvivals, columns=["Title", "Sex", "Embarkation Port", "age_group", "Last Name"])

In [7]:
# Drop columns
dfSurvivals = dfSurvivals.drop(['Passenger Id', 'First Name', 'Survival Boat', 'Body Number'], axis=1)
dfSurvivals.head()

Unnamed: 0,Age,Sibling/Spouse Aboard,Parent/Children Aboard,Passenger Class,Fare,Survived,family_size,Title_ Capt,Title_ Col,Title_ Don,...,Last Name_Zabour,Last Name_Zakarian,Last Name_Zimmerman,Last Name_de Brito,Last Name_de Messemaeker,Last Name_de Mulder,Last Name_de Pelsmaeker,Last Name_del Carlo,Last Name_van Billiard,Last Name_van Melkebeke
0,29.0,0,0,1,211.3375,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.9167,1,2,1,151.55,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,1,2,1,151.55,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,30.0,1,2,1,151.55,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,25.0,1,2,1,151.55,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Ensure all columns are numerical
dfSurvivals.dtypes

Age                        float64
Sibling/Spouse Aboard        int64
Parent/Children Aboard       int64
Passenger Class              int64
Fare                       float64
                            ...   
Last Name_de Mulder          uint8
Last Name_de Pelsmaeker      uint8
Last Name_del Carlo          uint8
Last Name_van Billiard       uint8
Last Name_van Melkebeke      uint8
Length: 908, dtype: object

### Oversample to Correct Imbalance in Dataset
#### Oversampling code example used from https://www.section.io/engineering-education/how-to-handle-imbalanced-data-in-deep-neural-networks/#building-the-deep-neural-network-model-using-an-imbalanced-dataset

In [12]:
# Count the number of data samples in each class
count_majority_class, count_minority_class = dfSurvivals.Survived.value_counts()

In [13]:
# Save majority and minority classes in new variables
# In Survived column 0=No and 1=Yes 

df_majority_class = dfSurvivals[dfSurvivals['Survived'] == 0]
df_minority_class = dfSurvivals[dfSurvivals['Survived'] == 1]

In [14]:
# Oversample the minority data
df_class_oversample = df_minority_class.sample(count_majority_class, replace=True)

In [15]:
# Concatenate the oversampled minority class with the majority class
df_balanced_os = pd.concat([df_class_oversample, df_majority_class], axis=0)

In [16]:
# Count number of data samples in new balanced dataset
print('Number of data samples after over-sampling:')
print(df_balanced_os.Survived.value_counts())

Number of data samples after over-sampling:
1    808
0    808
Name: Survived, dtype: int64


In [17]:
# Review the DataFrame
df_balanced_os.head()

Unnamed: 0,Age,Sibling/Spouse Aboard,Parent/Children Aboard,Passenger Class,Fare,Survived,family_size,Title_ Capt,Title_ Col,Title_ Don,...,Last Name_Zabour,Last Name_Zakarian,Last Name_Zimmerman,Last Name_de Brito,Last Name_de Messemaeker,Last Name_de Mulder,Last Name_de Pelsmaeker,Last Name_del Carlo,Last Name_van Billiard,Last Name_van Melkebeke
92,17.0,1,0,1,57.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
258,30.0,0,0,1,31.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50,58.0,0,1,1,512.3292,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
373,28.0,1,0,2,26.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489,42.0,1,0,2,26.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Split the data into X and y and then into testing and training sets

In [18]:
# Split the data into X (features) and y (target)

# Set the y variable, which is the target
y = df_balanced_os['Survived']

# Set the X variable, which includes all features escept the target
X = df_balanced_os.drop(columns=['Survived'])

In [19]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [20]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Compile, Train, and Evaluate the Model

In [21]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

input_dim = input_dim = X_train.shape[1]

def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=input_dim))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 2)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
   
    # Add final layer with sigmoid activation function
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy","Precision","Recall"])

    return nn_model

In [22]:
# Set up the kerastuner then allow it to search for best hyperparameters
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    overwrite=True)

# Initialize EarlyStopping callback to monitor the validation loss
# Training will stop if the validation loss doesn't improve for 3 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Perform hyperparameter search
tuner.search(X_train_scaled, y_train, epochs=20, validation_data=(X_test_scaled, y_test), callbacks=[early_stopping])

Trial 60 Complete [00h 00m 04s]
val_accuracy: 0.8613861203193665

Best val_accuracy So Far: 0.8836633563041687
Total elapsed time: 00h 02m 31s
INFO:tensorflow:Oracle triggered exit


In [23]:
# Get the hyperparameters of the best model
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 1,
 'num_layers': 2,
 'units_0': 3,
 'units_1': 1,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 1,
 'tuner/round': 1,
 'tuner/trial_id': '0019'}

In [24]:
# Get the best model
best_model = tuner.get_best_models(1)[0]
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 908       
                                                                 
 dense_1 (Dense)             (None, 3)                 6         
                                                                 
 dense_2 (Dense)             (None, 1)                 4         
                                                                 
 dense_3 (Dense)             (None, 1)                 2         
                                                                 
Total params: 920 (3.59 KB)
Trainable params: 920 (3.59 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# Train the model
fit_model = best_model.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [26]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_precision, model_recall = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Precision: {model_precision}, Recall: {model_recall}")

13/13 - 0s - loss: 0.4827 - accuracy: 0.8589 - precision: 0.8380 - recall: 0.8916 - 244ms/epoch - 19ms/step
Loss: 0.48270294070243835, Accuracy: 0.8589109182357788, Precision: 0.8379629850387573, Recall: 0.8916256427764893


In [27]:
# Use the model to make predictions on test data
y_pred_prob = best_model.predict(X_test_scaled).flatten()
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_prob]

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[166  35]
 [ 22 181]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       201
           1       0.84      0.89      0.86       203

    accuracy                           0.86       404
   macro avg       0.86      0.86      0.86       404
weighted avg       0.86      0.86      0.86       404



In [29]:
# Export our model to keras file
best_model.save('OUTPUT/Titanic_survival_mod2_reprocessed_opt3.keras')