# Classification of passengers that survived the Titanic catastrophe
Using 3 ML techniques:
 - SVM
 - Decision Trees
 - Random Forest


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree



In [4]:
# Load the passenger records from the CSV next to the notebook and preview
# the first rows.
csv_path = r'TitanicSurvival.csv'
titanic = pd.read_csv(csv_path)
titanic.head()

Unnamed: 0.1,Unnamed: 0,survived,sex,age,passengerClass
0,"Allen, Miss. Elisabeth Walton",yes,female,29.0,1st
1,"Allison, Master. Hudson Trevor",yes,male,0.9167,1st
2,"Allison, Miss. Helen Loraine",no,female,2.0,1st
3,"Allison, Mr. Hudson Joshua Crei",no,male,30.0,1st
4,"Allison, Mrs. Hudson J C (Bessi",no,female,25.0,1st


In [5]:
# Replace the CSV's original headers with short, capitalised column names.
column_names = ["Name", "Survived", "Sex", "Age", "Class"]
titanic.columns = column_names

In [6]:
# Text label -> integer code, per column.
encodings = {
    "Sex": {"male": 0, "female": 1},
    "Survived": {"yes": 1, "no": 0},
    "Class": {"1st": 1, "2nd": 2, "3rd": 3},
}

# Overwrite each text label with its code. Rows that already hold a code
# match none of the labels, so re-running this cell changes nothing.
for column, codes in encodings.items():
    for label, code in codes.items():
        titanic.loc[titanic[column] == label, column] = code

In [7]:
# Class and Sex stay as object columns.
titanic[["Class", "Sex"]] = titanic[["Class", "Sex"]].astype(object)
# Replace the whole column instead of writing through .loc[:, ...]: on
# pandas >= 2.0 the .loc form sets the values in place and keeps the existing
# object dtype, which leaves the target non-numeric and makes the
# scikit-learn classifiers reject it as an unknown label type.
titanic["Survived"] = titanic["Survived"].astype(int)

In [8]:
# Check the column dtypes after the casts above.
titanic.dtypes

Name         object
Survived      int32
Sex          object
Age         float64
Class        object
dtype: object

In [9]:
# Preview the first rows to confirm the labels were replaced by integer codes.
titanic.head()

Unnamed: 0,Name,Survived,Sex,Age,Class
0,"Allen, Miss. Elisabeth Walton",1,1,29.0,1
1,"Allison, Master. Hudson Trevor",1,0,0.9167,1
2,"Allison, Miss. Helen Loraine",0,1,2.0,1
3,"Allison, Mr. Hudson Joshua Crei",0,0,30.0,1
4,"Allison, Mrs. Hudson J C (Bessi",0,1,25.0,1


In [10]:
# Show every passenger whose age is missing.
missing_age = titanic["Age"].isna()
titanic.loc[missing_age]

Unnamed: 0,Name,Survived,Sex,Age,Class
15,"Baumann, Mr. John D",0,0,,1
37,"Bradley, Mr. George (George Ar",1,0,,1
40,"Brewe, Dr. Arthur Jackson",0,0,,1
46,"Cairns, Mr. Alexander",0,0,,1
59,"Cassebeer, Mrs. Henry Arthur Jr",1,1,,1
...,...,...,...,...,...
1293,"Williams, Mr. Howard Hugh Harr",0,0,,3
1297,"Wiseman, Mr. Phillippe",0,0,,3
1302,"Yousif, Mr. Wazli",0,0,,3
1303,"Yousseff, Mr. Gerious",0,0,,3


The rows above have no recorded age. Drop every row that contains a missing value before training.

In [11]:
# Discard every row that has at least one missing value.
titanic = titanic.dropna(axis="index", how="any")

In [13]:
# Target: the encoded survival flag.
y = titanic["Survived"]
# Features: everything except the target and the passenger name, which is
# not a valid feature for the models.
X = titanic.drop(columns=["Survived", "Name"])

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### SVM (Support Vector Machines)

In [18]:
# Fit a linear-kernel SVM on the training split.
model_SVM = svm.SVC(kernel='linear')
model_SVM.fit(X_train, y_train)
y_pred = model_SVM.predict(X_test)
# Score the predictions already computed above instead of calling .score(),
# which would run a second prediction pass over X_test. The resulting mean
# accuracy is the same.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the SVM model: {accuracy*100:.2f}%')

Accuracy of the SVM model: 72.38%


Testing different SVM kernels on the same train/test split:

In [30]:
# Compare the SVC kernels on the same train/test split.
# NOTE(review): nothing in this notebook standardises the features, and Age is
# on a different scale from Sex/Class -- consider scaling before drawing
# conclusions about the non-linear kernels.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
accuracy = {}  # kernel name -> test-set accuracy

for kernel in kernels:
    # fit() returns the fitted estimator, so construction and training chain.
    model_SVM = svm.SVC(kernel=kernel).fit(X_train, y_train)
    accuracy[kernel] = model_SVM.score(X_test, y_test)
    print(f'Accuracy of the SVM model for {kernel} kernel: {accuracy[kernel]*100:.2f}%')

Accuracy of the SVM model for linear kernel: 72.38%
Accuracy of the SVM model for poly kernel: 63.33%
Accuracy of the SVM model for rbf kernel: 57.14%
Accuracy of the SVM model for sigmoid kernel: 51.43%


The SVM reaches its highest accuracy (72.38%) with the linear kernel.

### Decision Trees

In [31]:
# Train a single decision tree (seeded so it is grown the same way each run)
# and report its accuracy on the held-out split.
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
accuracy = model_dt.score(X_test, y_test)
print(f'Accuracy of the Decision Tree model: {accuracy*100:.2f}%')

Accuracy of the Decision Tree model: 70.95%


### Random Forest

In [32]:
# Train a 100-tree random forest. The seed fixes the bootstrap samples and
# the feature subsets considered at each split, so the reported accuracy is
# the same on every run (the split and the decision tree are already seeded
# with 42).
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
accuracy = model_rf.score(X_test, y_test)
print(f'Accuracy of the Random Forest model: {accuracy*100:.2f}%')

Accuracy of the Random Forest model: 72.38%


The best accuracy (72.38%) is obtained by both the SVM with a linear kernel and the Random Forest; the Decision Tree follows at 70.95%.