In [1]:
import pandas as pd
import os, sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
#import titanic dataset and check the shape
training_set = pd.read_csv('titanic_train.csv', index_col=0)
training_set.shape

(891, 11)

In [3]:
#Get information on Data Quality
training_set.info()
training_set.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
#Preprocessing
#drop nulls and bin ages into young, and old
#training_set = training_set.dropna(axis=0)
bins = (0.0, 40.0, 100.0)
group_names = ['young', 'old']
training_set['Bins'] = pd.cut(training_set['Age'], bins = bins, labels = group_names)
#Bin fares into low, medium, and high based on $ ammount
binsfare = (0.0, 30.0, 100.0, 513.0)
fare_names = ['low', 'medium', 'high']
training_set['Fare'] = pd.cut(training_set['Fare'], bins = binsfare, labels = fare_names)

In [5]:
#Remove rows which have NULLS in the training_set['bin'] or training_set['Embarked']
training_set.dropna(subset = ['Bins', 'Embarked'], inplace = True)

In [6]:
#encode bins from 'old' and 'young' into 0 and 1
label_quality = LabelEncoder()
training_set['Bins'] = label_quality.fit_transform(training_set['Bins'])
#encode male and female to 1 and 0
training_set['Sex'] = label_quality.fit_transform(training_set['Sex'])
#categorically encode location of embarkment
X = pd.DataFrame(training_set['Embarked'])
X['Embarked'] = label_quality.fit_transform(X['Embarked'])
#One hot encode location of embarkment in separate dataframe
enc = OneHotEncoder()
enc.fit(X)
onehotlabels = pd.DataFrame(enc.transform(X).toarray())
#1 index onehotlabels
onehotlabels.index +=1
#reset and 1 index training set so join is successful
training_set.reset_index(drop=True, inplace=True)
training_set.index += 1
#Join training set and onehotlabels to get one hot ecoded columns on main dataframe
training_set = training_set.join(onehotlabels)
training_set

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Bins,0,1,2
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,low,,S,1,0.0,0.0,1.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,medium,C85,C,1,1.0,0.0,0.0
3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,low,,S,1,0.0,0.0,1.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,medium,C123,S,1,0.0,0.0,1.0
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,low,,S,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708,0,3,"Rice, Mrs. William (Margaret Norton)",0,39.0,0,5,382652,low,,Q,1,0.0,1.0,0.0
709,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,low,,S,1,0.0,0.0,1.0
710,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,low,B42,S,1,0.0,0.0,1.0
711,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,low,C148,C,1,1.0,0.0,0.0


In [7]:
#Drop the Cabin, Name, Ticket, and Embarked columns. Name is a string where little predictive value could be derived and Cabin is 75% NULL
training_set.drop(['Cabin', 'Name', 'Ticket', 'Embarked'], axis=1, inplace = True)

In [8]:
#Rename Bins and one hot encoded columns
training_set.rename(columns = {'Bins':'AgeGroup', 0:'Embark_C', 1:'Embark_Q', 2:'Embark_S'}, inplace = True)
training_set

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,AgeGroup,Embark_C,Embark_Q,Embark_S
1,0,3,1,22.0,1,0,low,1,0.0,0.0,1.0
2,1,1,0,38.0,1,0,medium,1,1.0,0.0,0.0
3,1,3,0,26.0,0,0,low,1,0.0,0.0,1.0
4,1,1,0,35.0,1,0,medium,1,0.0,0.0,1.0
5,0,3,1,35.0,0,0,low,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
708,0,3,0,39.0,0,5,low,1,0.0,1.0,0.0
709,0,2,1,27.0,0,0,low,1,0.0,0.0,1.0
710,1,1,0,19.0,0,0,low,1,0.0,0.0,1.0
711,1,1,1,26.0,0,0,low,1,1.0,0.0,0.0


In [9]:
#Separate dataset as a target variable and feature variables into two dataframes
X = training_set.drop('Survived', axis = 1)
y = training_set['Survived']

In [10]:
#Train and Test splitting of Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [11]:
training_set

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,AgeGroup,Embark_C,Embark_Q,Embark_S
1,0,3,1,22.0,1,0,low,1,0.0,0.0,1.0
2,1,1,0,38.0,1,0,medium,1,1.0,0.0,0.0
3,1,3,0,26.0,0,0,low,1,0.0,0.0,1.0
4,1,1,0,35.0,1,0,medium,1,0.0,0.0,1.0
5,0,3,1,35.0,0,0,low,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
708,0,3,0,39.0,0,5,low,1,0.0,1.0,0.0
709,0,2,1,27.0,0,0,low,1,0.0,0.0,1.0
710,1,1,0,19.0,0,0,low,1,0.0,0.0,1.0
711,1,1,1,26.0,0,0,low,1,1.0,0.0,0.0


# Random Forest Classifier




In [None]:
rfc = RandomForestClassifier(n_estimators = 200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)