In [1]:
# Importing dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import pymongo

In [2]:
# Building a MongoDB client for the instance at localhost:27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [3]:
# Loading every document of the `train` collection (database `titanic`) into a dataframe
train_df = pd.DataFrame(list(client['titanic']['train'].find()))

# Summarising columns / dtypes, then displaying the frame itself
train_df.info()
train_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   _id          889 non-null    object
 1   PassengerId  889 non-null    object
 2   Survived     889 non-null    object
 3   Pclass       889 non-null    object
 4   last_name    889 non-null    object
 5   first_name   889 non-null    object
 6   Sex          889 non-null    object
 7   Age          889 non-null    object
 8   SibSp        889 non-null    object
 9   Parch        889 non-null    object
 10  Ticket       889 non-null    object
 11  Fare         889 non-null    object
 12  Embarked     889 non-null    object
 13  Deck         889 non-null    object
dtypes: object(14)
memory usage: 97.4+ KB


Unnamed: 0,_id,PassengerId,Survived,Pclass,last_name,first_name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
0,6503d99734057b3622c141fd,1,0,3,Braund,Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,S,"D,E,F,G"
1,6503d99734057b3622c141fe,2,1,1,Cumings,Mrs John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C,"A,B,C,D,E"
2,6503d99734057b3622c141ff,3,1,3,Heikkinen,Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,S,"D,E,F,G"
3,6503d99734057b3622c14200,4,1,1,Futrelle,Mrs Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,S,"A,B,C,D,E"
4,6503d99734057b3622c14201,5,0,3,Allen,Mr William Henry,male,35.0,0,0,373450,8.05,S,"D,E,F,G"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,6503d99734057b3622c14571,887,0,2,Montvila,Rev Juozas,male,27.0,0,0,211536,13.0,S,"D,E,F,G"
885,6503d99734057b3622c14572,888,1,1,Graham,Miss Margaret Edith,female,19.0,0,0,112053,30.0,S,"A,B,C,D,E"
886,6503d99734057b3622c14573,889,0,3,Johnston,Miss Catherine Helen Carrie,female,7.0,1,2,W./C. 6607,23.45,S,"D,E,F,G"
887,6503d99734057b3622c14574,890,1,1,Behr,Mr Karl Howell,male,26.0,0,0,111369,30.0,C,"A,B,C,D,E"


In [4]:
# Dropping identifier-like columns that are not used as model features
train_df_dropped = train_df.drop(columns=['_id', 'PassengerId', 'last_name', 'first_name', 'Ticket'])
# Casting the numeric columns (all read in as `object`) to float
train_df_clean = train_df_dropped.astype(
    dict.fromkeys(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], float)
)
train_df_clean

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0.0,3.0,male,22.0,1.0,0.0,7.2500,S,"D,E,F,G"
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,"A,B,C,D,E"
2,1.0,3.0,female,26.0,0.0,0.0,7.9250,S,"D,E,F,G"
3,1.0,1.0,female,35.0,1.0,0.0,53.1000,S,"A,B,C,D,E"
4,0.0,3.0,male,35.0,0.0,0.0,8.0500,S,"D,E,F,G"
...,...,...,...,...,...,...,...,...,...
884,0.0,2.0,male,27.0,0.0,0.0,13.0000,S,"D,E,F,G"
885,1.0,1.0,female,19.0,0.0,0.0,30.0000,S,"A,B,C,D,E"
886,0.0,3.0,female,7.0,1.0,2.0,23.4500,S,"D,E,F,G"
887,1.0,1.0,male,26.0,0.0,0.0,30.0000,C,"A,B,C,D,E"


In [5]:
# Looking at the number of unique values in each remaining column of train_df_dropped
# (the frame left after the ID-like columns were dropped, before the float cast)
train_df_dropped.nunique()

Survived      2
Pclass        3
Sex           2
Age          89
SibSp         7
Parch         7
Fare        248
Embarked      3
Deck          2
dtype: int64

In [7]:
# One-hot encoding the remaining string columns (Sex, Embarked, Deck) with `pd.get_dummies`;
# the float columns are carried over unchanged
train_dummies_df = pd.get_dummies(data=train_df_clean)
train_dummies_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,"Deck_A,B,C,D,E","Deck_D,E,F,G"
0,0.0,3.0,22.0,1.0,0.0,7.2500,False,True,False,False,True,False,True
1,1.0,1.0,38.0,1.0,0.0,71.2833,True,False,True,False,False,True,False
2,1.0,3.0,26.0,0.0,0.0,7.9250,True,False,False,False,True,False,True
3,1.0,1.0,35.0,1.0,0.0,53.1000,True,False,False,False,True,True,False
4,0.0,3.0,35.0,0.0,0.0,8.0500,False,True,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0.0,2.0,27.0,0.0,0.0,13.0000,False,True,False,False,True,False,True
885,1.0,1.0,19.0,0.0,0.0,30.0000,True,False,False,False,True,True,False
886,0.0,3.0,7.0,1.0,2.0,23.4500,True,False,False,False,True,False,True
887,1.0,1.0,26.0,0.0,0.0,30.0000,False,True,True,False,False,True,False


In [9]:
# Separating the preprocessed data into the target (y) and the feature matrix (X)
y = train_dummies_df['Survived']
X = train_dummies_df.drop(columns='Survived')

# Holding out a test set (default split proportions, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [10]:
# Creating the StandardScaler and fitting it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Applying the fitted scaling to both splits
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

In [11]:
# Seeding Python, NumPy and TensorFlow so weight initialisation and training shuffling
# are repeatable from run to run (7 matches the random_state used for the train/test split)
tf.keras.utils.set_random_seed(7)

# Defining the model - a small feed-forward neural net.
# The input width is the number of scaled feature columns; the two hidden layers
# have 2 and 6 nodes respectively.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer_1 = 2
hidden_nodes_layer_2 = 6

nn_model = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer_1, input_dim=number_input_features, activation='tanh'))
# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer_2, activation='tanh'))
# Output layer - a single sigmoid unit for the binary `Survived` target
nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
# Checking the structure of the model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2)                 26        
                                                                 
 dense_1 (Dense)             (None, 6)                 18        
                                                                 
 dense_2 (Dense)             (None, 1)                 7         
                                                                 
Total params: 51 (204.00 Byte)
Trainable params: 51 (204.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Configuring training: Adam optimizer, binary cross-entropy loss, accuracy reported as a metric
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
# Fitting the network on the scaled training features for 500 epochs
nn_model.fit(x=X_train_scaled, y=y_train, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x207f4fc6ad0>

In [14]:
# Scoring the trained model on the held-out (scaled) test split
model_loss, model_accuracy = nn_model.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

7/7 - 0s - loss: 0.4793 - accuracy: 0.8072 - 99ms/epoch - 14ms/step
Loss: 0.4792957603931427, Accuracy: 0.8071748614311218


In [15]:
# Exporting the trained model to an HDF5 (.h5) file under ../Models
# (the path is relative to the notebook's working directory).
# NOTE(review): the recorded cell output looks like the tail of a Keras saving warning,
# presumably about the legacy HDF5 format - confirm before switching formats.
nn_model.save('../Models/titanic_predictions_optimization.h5')

  saving_api.save_model(
