In [75]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [76]:
# Load dataset
df = pd.read_csv('titanic.csv')

In [77]:
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [78]:
# Sets null values equal to the median of that column
def handle_null_median(df):
    # Need to set inplace=True, so it doesn't create a copy of the dataframe. Tried without and this led to null-values not being removed
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['Age'].fillna(df['Age'].median(), inplace=True)

    df['Embarked'].fillna('S', inplace=True)

    return  df

df = handle_null_median(df)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,28.0,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [79]:
df['Sex'] = df['Sex'].replace(['female', 'male'], [0,1])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.9250,S
3,1,1,0,35.0,1,0,53.1000,S
4,0,3,1,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,S
887,1,1,0,19.0,0,0,30.0000,S
888,0,3,0,28.0,1,2,23.4500,S
889,1,1,1,26.0,0,0,30.0000,C


In [80]:
def bins(df): # This is kind of feature engineering as well
    df['AgeGroup'] = 0
    df.loc[df['Age'] < 16.336, 'AgeGroup' ] = 0
    df.loc[(df['Age'] >= 16.336) & (df['Age'] < 32.252), 'AgeGroup'] = 1
    df.loc[(df['Age'] >= 32.252) & (df['Age'] < 48.168), 'AgeGroup'] = 2
    df.loc[(df['Age'] >= 48.168) & (df['Age'] < 64.084), 'AgeGroup'] = 3
    df.loc[df['Age'] >= 64.084, 'AgeGroup'] = 4

    # Could also create bins for fare, but not sure Fare is needed.

    return df

df = bins(df)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeGroup
0,0,3,1,22.0,1,0,7.2500,S,1
1,1,1,0,38.0,1,0,71.2833,C,2
2,1,3,0,26.0,0,0,7.9250,S,1
3,1,1,0,35.0,1,0,53.1000,S,2
4,0,3,1,35.0,0,0,8.0500,S,2
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,S,1
887,1,1,0,19.0,0,0,30.0000,S,1
888,0,3,0,28.0,1,2,23.4500,S,1
889,1,1,1,26.0,0,0,30.0000,C,1


In [81]:
df['FamilyMembersCount'] = df['Parch'] + df['SibSp'] + 1

In [82]:
df['Pclass'] =  df['Pclass'].astype('category')
df['Embarked'] =  df['Embarked'].astype('category')
df['AgeGroup'] =  df['AgeGroup'].astype('category')

In [123]:
df_XGB = df.copy()
df_RF = df.copy()
df_GNB = df.copy()

In [124]:
df_XGB = pd.get_dummies(df)
df_RF = pd.get_dummies(df)
df_GNB = pd.get_dummies(df)
df_RF

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FamilyMembersCount,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,AgeGroup_0,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4
0,0,1,22.0,1,0,7.2500,2,0,0,1,0,0,1,0,1,0,0,0
1,1,0,38.0,1,0,71.2833,2,1,0,0,1,0,0,0,0,1,0,0
2,1,0,26.0,0,0,7.9250,1,0,0,1,0,0,1,0,1,0,0,0
3,1,0,35.0,1,0,53.1000,2,1,0,0,0,0,1,0,0,1,0,0
4,0,1,35.0,0,0,8.0500,1,0,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,27.0,0,0,13.0000,1,0,1,0,0,0,1,0,1,0,0,0
887,1,0,19.0,0,0,30.0000,1,1,0,0,0,0,1,0,1,0,0,0
888,0,0,28.0,1,2,23.4500,4,0,0,1,0,0,1,0,1,0,0,0
889,1,1,26.0,0,0,30.0000,1,1,0,0,1,0,0,0,1,0,0,0


In [101]:
from sklearn.model_selection import train_test_split

# XGBoost

In [170]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

train_X, test_X, train_y, test_y = train_test_split(df_XGB.drop('Survived', axis=1), df_XGB['Survived'], random_state=321)
X, y = df_XGB.iloc[:,:-1],df_XGB.iloc[:,:1]

model = XGBClassifier()
model.fit(train_X, train_y)

pred_y = model.predict(test_X)
preds = [round(value) for value in pred_y]

# evaluate predictions
accuracy = accuracy_score(test_y, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 85.20%


In [182]:
pred = pd.DataFrame(preds)
pred.merge(test_X, right_index=True, left_index=True)

Unnamed: 0,0,Sex,Age,SibSp,Parch,Fare,FamilyMembersCount,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,AgeGroup_0,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4
4,0,1,35.0,0,0,8.05,1,0,0,1,0,0,1,0,0,1,0,0
6,1,1,54.0,0,0,51.8625,1,1,0,0,0,0,1,0,0,0,1,0
7,0,1,2.0,3,1,21.075,5,0,0,1,0,0,1,1,0,0,0,0
13,0,1,39.0,1,5,31.275,7,0,0,1,0,0,1,0,0,1,0,0
23,0,1,28.0,0,0,35.5,1,1,0,0,0,0,1,0,1,0,0,0
24,1,0,8.0,3,1,21.075,5,0,0,1,0,0,1,1,0,0,0,0
25,1,0,38.0,1,5,31.3875,7,0,0,1,0,0,1,0,0,1,0,0
26,0,1,28.0,0,0,7.225,1,0,0,1,1,0,0,0,1,0,0,0
28,0,0,28.0,0,0,7.8792,1,0,0,1,0,1,0,0,1,0,0,0
30,1,1,40.0,0,0,27.7208,1,1,0,0,1,0,0,0,0,1,0,0


# Random Forrest Classifier

In [157]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

train_X, test_X, train_y, test_y = train_test_split(df_RF.drop('Survived', axis=1), df_RF['Survived'], random_state=123)

# Train and test random forest classifier with default parameters
rf_scores = cross_val_score(RandomForestClassifier(), train_X, train_y)
(rf_scores.mean(), rf_scores.std())

(0.7919537650095387, 0.03679628088164086)

# GaussianNB

In [156]:
from sklearn.naive_bayes import GaussianNB

train_X, test_X, train_y, test_y = train_test_split(df_RF.drop('Survived', axis=1), df_RF['Survived'], random_state=175)

# Train and test random forest classifier with default parameters
GNB_scores = cross_val_score(GaussianNB(), train_X, train_y)
(GNB_scores.mean(), GNB_scores.std())

(0.7544158904724497, 0.0624996938350536)