In [1]:
import pandas as pd
import os, sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
#import titanic dataset and check the shape
training_set = pd.read_csv('titanic_train.csv', index_col=0)
training_set.shape

(891, 11)

In [3]:
#Get information on Data Quality
training_set.info()
training_set.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Preprocessing
# drop nulls and bin ages into young, and old
training_set = training_set.dropna(axis=0)
bins = (0.0, 40.0, 100.0)
group_names = ['young', 'old']
training_set.loc[:,'Bins'] = pd.cut(training_set.loc[:,'Age'], bins = bins, labels = group_names)
#Bin fares into low, medium, and high based on $ ammount
binsfare = (0.0, 30.0, 100.0, 513.0)
fare_names = ['low', 'medium', 'high']
training_set.loc[:,'Fare'] = pd.cut(training_set.loc[:,'Fare'], bins = binsfare, labels = fare_names)
training_set

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Bins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,medium,C85,C,young
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,medium,C123,S,young
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,medium,E46,S,old
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,low,G6,S,young
12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,low,C103,S,old
...,...,...,...,...,...,...,...,...,...,...,...,...
872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,medium,D35,S,old
873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,low,B51 B53 B55,S,young
880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,medium,C50,C,old
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,low,B42,S,young


In [5]:
#Remove rows which have NULLS in the training_set['bin'] or training_set['Embarked']
training_set.dropna(subset = ['Bins', 'Embarked'], inplace = True)

In [6]:
#encode bins from 'old' and 'young' into 0 and 1
label_quality = LabelEncoder()
training_set.loc[:,'Bins'] = label_quality.fit_transform(training_set.loc[:,'Bins'])
#encode male and female to 1 and 0
training_set.loc[:,'Sex'] = label_quality.fit_transform(training_set.loc[:,'Sex'])
#categorically encode location of embarkment
X = pd.DataFrame(training_set.loc[:,'Embarked'])
X.loc[:,'Embarked'] = label_quality.fit_transform(X.loc[:,'Embarked'])
#One hot encode location of embarkment in separate dataframe
enc = OneHotEncoder()
enc.fit(X)
onehotlabels = pd.DataFrame(enc.transform(X).toarray())
#1 index onehotlabels
onehotlabels.index +=1
#reset and 1 index training set so join is successful
training_set.reset_index(drop=True, inplace=True)
training_set.index += 1
#Join training set and onehotlabels to get one hot ecoded columns on main dataframe
training_set = training_set.join(onehotlabels)
training_set

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Bins,0,1,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,medium,C85,C,1,1.0,0.0,0.0
2,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,medium,C123,S,1,0.0,0.0,1.0
3,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,medium,E46,S,0,0.0,0.0,1.0
4,1,3,"Sandstrom, Miss. Marguerite Rut",0,4.0,1,1,PP 9549,low,G6,S,1,0.0,0.0,1.0
5,1,1,"Bonnell, Miss. Elizabeth",0,58.0,0,0,113783,low,C103,S,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",0,47.0,1,1,11751,medium,D35,S,0,0.0,0.0,1.0
180,0,1,"Carlsson, Mr. Frans Olof",1,33.0,0,0,695,low,B51 B53 B55,S,1,0.0,0.0,1.0
181,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",0,56.0,0,1,11767,medium,C50,C,0,1.0,0.0,0.0
182,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,low,B42,S,1,0.0,0.0,1.0


In [7]:
#Drop the Cabin, Name, Ticket, and Embarked columns. Name is a string where little predictive value could be derived and Cabin is 75% NULL
training_set.drop(['Cabin', 'Name', 'Ticket', 'Embarked'], axis=1, inplace = True)
#Rename Bins and one hot encoded columns
training_set.rename(columns = {'Bins':'AgeGroup', 0:'Embark_C', 1:'Embark_Q', 2:'Embark_S'}, inplace = True)
training_set
# #categorically encode location of embarkment
X = pd.DataFrame(training_set.loc[:,'Fare'])
X.loc[:,'Fare'] = X.loc[:,'Fare'].astype(str)
X.loc[:,'Fare'] = label_quality.fit_transform(X.loc[:,'Fare'])
#One hot encode location of embarkment in separate dataframe
enc.fit(X)
onehotlabels = pd.DataFrame(enc.transform(X).toarray())
#1 index onehotlabels
onehotlabels.index +=1
#reset and 1 index training set so join is successful
training_set.reset_index(drop=True, inplace=True)
training_set.index += 1
onehotlabels
#Join training set and onehotlabels to get one hot ecoded columns on main dataframe
training_set = training_set.join(onehotlabels)
#Drop null column and rename newly one hot encoded fare level columns 1, 2, 3
training_set.drop(['Fare'], axis=1, inplace = True)
training_set.rename(columns = {1:'Low', 2:'Medium', 3:'High'}, inplace = True)


In [8]:
training_set

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,AgeGroup,Embark_C,Embark_Q,Embark_S,0,Low,Medium,High
1,1,1,0,38.0,1,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,1,0,35.0,1,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,1,1,54.0,0,0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,3,0,4.0,1,1,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,1,1,0,58.0,0,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,1,1,0,47.0,1,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
180,0,1,1,33.0,0,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
181,1,1,0,56.0,0,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
182,1,1,0,19.0,0,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [22]:
training_set
training_set.head(50)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,AgeGroup,Embark_C,Embark_Q,Embark_S,0,Low,Medium,High
1,1,1,0,38.0,1,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,1,0,35.0,1,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,1,1,54.0,0,0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,3,0,4.0,1,1,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,1,1,0,58.0,0,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6,1,2,1,34.0,0,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7,1,1,1,28.0,0,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,0,1,1,19.0,3,2,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0
9,1,1,0,49.0,1,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
10,0,1,1,65.0,0,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
#Separate dataset as a target variable and feature variables into two dataframes
X = training_set.drop('Survived', axis = 1)
y = training_set['Survived']

In [16]:
#Train and Test splitting of Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Random Forest Classifier




In [17]:
rfc = RandomForestClassifier(n_estimators = 200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [18]:
print(confusion_matrix(y_test, pred_rfc))

[[ 6  8]
 [ 3 20]]


In [19]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.67      0.43      0.52        14
           1       0.71      0.87      0.78        23

    accuracy                           0.70        37
   macro avg       0.69      0.65      0.65        37
weighted avg       0.70      0.70      0.68        37

