In [1]:
# John Carther V. Lao
# import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense

<font size="5">***Machine Learning Step: Gathering Data***</font>

In [2]:
#read dataset
dataset = pd.read_csv('C:\\Users\\Carther\\Desktop\\COE_prelim\\titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<font size="5">***Machine Learning Step: Preparing Data***</font>

In [3]:
#drop empty column
dataset = dataset.drop(['PassengerId','Name','Ticket','Cabin','Fare','Embarked'], axis=1)
dataset['Sex'] = dataset['Sex'].map({'female':0,'male':1}) 
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,1,22.0,1,0
1,1,1,0,38.0,1,0
2,1,3,0,26.0,0,0
3,1,1,0,35.0,1,0
4,0,3,1,35.0,0,0


In [4]:
#Check for NaN values
display(dataset.isnull().sum())

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
dtype: int64

In [5]:
#Input data in missing values
#take the median of age to avoid skewing parameters
dataset['Age'] = dataset.groupby(['Pclass','Sex'])['Age'].apply(lambda x:x.fillna(x.median()))
#check again for empty values
display(dataset.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
dtype: int64

In [6]:
dataset = dataset[['Pclass','Sex','Age','SibSp','Parch','Survived']]
dataset.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived
0,3,1,22.0,1,0,0
1,1,0,38.0,1,0,1
2,3,0,26.0,0,0,1
3,1,0,35.0,1,0,1
4,3,1,35.0,0,0,0


In [7]:
#separate input data from output
X, y = dataset.loc[:, dataset.columns != 'Survived'], dataset['Survived']

In [8]:
#split data into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((712, 5), (712,), (179, 5), (179,))

In [9]:
#scale data to make it easier for model to understand
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

<font size="5">***Machine Learning Step: Choose Model***</font>

***1. Random Forest Model \
2. Logistic Regression \
3. Sequential Neural Network***

<font size="5">***This Section is for Machine Learning Algorithm -- ___Random Forest Model___***</font>

In [10]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Classifier
clf=RandomForestClassifier(criterion = 'gini',
                            n_estimators = 1750,
                            max_depth = 7,
                            random_state = 42,
                            n_jobs=-1,
                            verbose = 0)

#Fit and Train the model using the training sets
clf.fit(X_train,y_train)
#start predictions
y_pred=clf.predict(X_test)

In [11]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# print model accuracy 
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100)

Accuracy: 82.68156424581005


<font size="3">***Random Forest Model Accuracy: 82.68%***</font>

<font size="5">***This Section is for Machine Learning Algorithm -- ___Logistic Regression Model___***</font>

In [21]:
#import Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# all parameters not specified are set to their defaults
logisticRegr = LogisticRegression()

#fit and train model
logisticRegr.fit(X_train, y_train)

#start prediction of data
predictions = logisticRegr.predict(X_test)

In [13]:
# get the accuracy
print(metrics.accuracy_score(y_test, predictions)*100)

81.56424581005587


<font size="3">***Logistics Regression Model Accuracy: 81.56%***</font>

<font size="5">***This Section is for Neural Network Model***</font>

In [14]:
#build model
Survivor_model = Sequential()

#Input layer with 5 inputs neurons
Survivor_model.add(Dense(3, kernel_initializer="uniform", activation = 'relu', input_dim = 5))

#Hidden layer
Survivor_model.add(Dense(32, kernel_initializer="uniform", activation = 'relu',))
Survivor_model.add(Dense(16, kernel_initializer="uniform", activation = 'relu'))
Survivor_model.add(Dense(8, kernel_initializer="uniform", activation = 'tanh'))

#output layer with 1 output neuron which will predict 1 or 0
Survivor_model.add(Dense(1, kernel_initializer="uniform", activation = 'sigmoid'))

In [15]:
#compile the model
Survivor_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [16]:
#display model to check layers
Survivor_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3)                 18        
                                                                 
 dense_1 (Dense)             (None, 32)                128       
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 8)                 136       
                                                                 
 dense_4 (Dense)             (None, 1)                 9         
                                                                 
Total params: 819
Trainable params: 819
Non-trainable params: 0
_________________________________________________________________


In [17]:
#fit model and start predictions
Survivor_model_pred = Survivor_model.fit(X_train, y_train, epochs= 100, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [22]:
#display accuracy
_, accuracy = Survivor_model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 83.24


<font size="3">***Neural Network Model Accuracy: 83.24%***</font>

***Results:*** \
***1. Random Forest Model: 82.68% \
2. Logistic Regression: 81.56% \
3. Sequential Neural Network: 83.24%***

<font size="5">***Comparison, Analysis, Conclusion Section***</font>

The project begins by gathering data, here a common dataset was used as the basis of understanding. Rationalizing the problem can be made very easily, deducing that items such as the ticket number, cabin number, and passenger id, played little to no importance in the survival of the passengers. Thus, another data pre-processing step can be made, checking for empty data points in our dataset, this is then filled in with the median to ensure minimal biasing. The data is then split up into training and testing datasets to help in our model testing, this places us in the comparison of 2 machine learning algorithms against a neural network.

In the machine learning models, a Random Forest Classifier was utilized. Although an article [1] indicated that a Decision Tree algorithm would be better suited for this dataset, they indicated that hyperparameter tuning and better data pre-processing could change the accuracy. With this in mind, a variation of a desision tree was used. Here, the parameters used were identified to help optimize the classifier better than the default parameters. It is also the most flexible and easy to use algorithm, evident in the short lines of code used. In comparison to logistic regression, this algorithm does not suffer from overfitting, for it takes the average of all the predictions, which cancels out the biases. In the other algorithm, logistic regression was used, according to another article [2], this model would produce an accuracy of 80.67%. This was slightly increased to 81.56%, a minimal step up, but an increase nevertheless through a different data pre-processing procedure. In the neural network model, 5 layers were used having 2 layers for input an output whilst utilizing the rest for hidden layer. Here, an article [3] was written indicating an accuracy of only 78%, this is improved upon by the neural network designed in this project without using GridSearch a time consuming process.

In conclusion, the 3 artificial intelligence that were designed matched, and on some exceeded the general accuracy that others have reached. This has been the culmination of valuable concepts that will be useful in the following coding activities. In terms of the algorithms and models, improvement could be made by better optimizing the parameters, all achievable with enough time and understanding,as well as adding more features, which might be farfetched in this dataset.

References:

[1]  E. Goel, “Applying 7 classification algorithms on the titanic dataset,” Medium, 01-Jul-2021. [Online]. Available:\
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;https://medium.com/geekculture/applying-7-classification-algorithms-on-the-titanic-dataset-278ef222b53c. [Accessed: 07-Oct-2022].

[2] G. Bektaş “Your guide for logistic regression with titanic dataset,” Medium, 17-Nov-2020. [Online]. Available: \
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;https://medium.com/analytics-vidhya/your-guide-for-logistic-regression-with-titanic-dataset-784943523994. [Accessed: 07-Oct-2022].

[3] Stefanbergstein, “Keras Deep Learning on Titanic Data,” Kaggle, 19-Dec-2017. [Online]. Available: \
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;https://www.kaggle.com/code/stefanbergstein/keras-deep-learning-on-titanic-data. [Accessed: 07-Oct-2022]. 

In [1]:
# Lao, John Carther V. - 1911910