In [1]:
### Random forest ###

# read data
import pandas as pd
import numpy as np
titanic = pd.read_csv("titanic.csv", sep = ",", encoding = "ISO-8859-1")

# convert string values to numeric
titanic['Pclass'] = np.where(titanic['Pclass'] == 'Upper', 2, 
                             np.where(titanic['Pclass'] == 'Middle', 1, 0))
titanic['Survived'] = np.where(titanic['Survived'] == 'Survived', 1, 0)
titanic['Sex'] = np.where(titanic['Sex'] == 'female', 1, 0)

# create dummy variables
dummies = pd.get_dummies(titanic['Embarked'])
titanic = pd.concat([titanic, dummies], axis=1)

In [2]:
### Model fit ###

from sklearn.ensemble import RandomForestClassifier as RFC
rf_model = RFC(criterion='entropy' , n_estimators=2000, max_depth=3)
clf = rf_model.fit(titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 
                            'Fare', 'EmbarkC', 'EmbarkQ', 'EmbarkS']],
                           titanic['Survived'])

# Confusion matrix

from sklearn.metrics import classification_report
pred = clf.predict(titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 
                            'Fare', 'EmbarkC', 'EmbarkQ', 'EmbarkS']])
print(classification_report(titanic['Survived'], pred,
                            target_names=["Perished", "Survived"]))

              precision    recall  f1-score   support

    Perished       0.81      0.93      0.86       810
    Survived       0.85      0.64      0.73       501

    accuracy                           0.82      1311
   macro avg       0.83      0.79      0.80      1311
weighted avg       0.82      0.82      0.81      1311



In [3]:
### Feature engineering ###

# Create a child (Y/N) variable
titanic['Child'] = np.where(titanic['Age'] >= 18, 0, 1)

# Create a 'noble' variable
titanic['Noble'] = 0
titles = [' Dr', ' Rev.', ' Lady', ' Col.', ' Major']

for i in titles:
    titanic['Noble'] = np.where(titanic['Noble'] == 0, 
                                np.where(titanic['Name'].str.contains(i), 1, 0), 1)

# print first five rows
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,EmbarkC,EmbarkQ,EmbarkS,Child,Noble
0,1,0,0,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,EmbarkS,0,0,1,0,0
1,2,1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,EmbarkC,1,0,0,0,0
2,3,1,0,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,EmbarkS,0,0,1,0,0
3,4,1,2,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,EmbarkS,0,0,1,0,0
4,5,0,0,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,EmbarkS,0,0,1,0,0


In [4]:
 ### Model fit after feature engineering ###
    
rf_model = RFC(criterion='entropy' , n_estimators=2000)
clf = rf_model.fit(titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 
                            'Fare', 'EmbarkC', 'EmbarkQ', 'EmbarkS']],
                           titanic['Survived'])

# Confusion matrix

from sklearn.metrics import classification_report
pred = clf.predict(titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 
                            'Fare', 'EmbarkC', 'EmbarkQ', 'EmbarkS']])
print(classification_report(titanic['Survived'], pred,
                            target_names=["Perished", "Survived"]))

              precision    recall  f1-score   support

    Perished       0.97      0.98      0.97       810
    Survived       0.96      0.95      0.95       501

    accuracy                           0.96      1311
   macro avg       0.96      0.96      0.96      1311
weighted avg       0.96      0.96      0.96      1311



In [6]:
### Training and test data ###

# Split data into training and test

from sklearn.model_selection import train_test_split as tts

# create a set of predictors (Xs) and target (Y)
X = titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkC', 
             'EmbarkQ', 'EmbarkS']]
y = titanic['Survived']

# split into 75% training, 25% test
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=42)

# model build (on training)
from sklearn.tree import DecisionTreeClassifier as DTC
tree_model = DTC(criterion='entropy', max_depth=3)
clf = tree_model.fit(X_train, y_train)

# model performance (training data)
from sklearn.metrics import classification_report
print(classification_report(y_train, clf.predict(X_train), 
                            target_names=["Perished", "Survived"]))

# model performance (test data)
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(X_test), 
                            target_names=["Perished", "Survived"]))

              precision    recall  f1-score   support

    Perished       0.83      0.87      0.85       604
    Survived       0.78      0.72      0.75       379

    accuracy                           0.81       983
   macro avg       0.81      0.80      0.80       983
weighted avg       0.81      0.81      0.81       983

              precision    recall  f1-score   support

    Perished       0.84      0.89      0.87       206
    Survived       0.80      0.72      0.76       122

    accuracy                           0.83       328
   macro avg       0.82      0.81      0.81       328
weighted avg       0.83      0.83      0.83       328

