In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Load the raw Titanic passenger sheet (path is relative to the notebook's
# working directory). Re-reading here keeps the cell idempotent on re-run.
df = pd.read_excel("../data/titanic3.xls")

# change names
df = df.rename(columns={
    'pclass': 'Passenger Class',
    'survived': 'Survived',
    'age': 'Age',
    'sibsp': 'Siblings/spouses aboard',
    'parch': 'Parents/children aboard',
    'fare': 'Passenger fare',
    'embarked': 'Port of Embarkation',
    'home.dest': 'Home/Destination',
})

# create binary columns for sex and alone (vectorised instead of row-wise apply)
# sex: 0 when the value is the string 'male', 1 for anything else
# (a missing value therefore also maps to 1, exactly as before).
df['sex'] = df['sex'].astype(str).ne('male').astype(int)
# alone: 1 when the passenger has neither parents/children nor
# siblings/spouses aboard, else 0.
df['alone'] = (
    df['Parents/children aboard'].eq(0) & df['Siblings/spouses aboard'].eq(0)
).astype(int)

# fill null values for age and fare with the column mean.
# Assign the result back instead of calling fillna(..., inplace=True) on a
# column selection: that chained in-place form emits a warning in pandas 2.x
# and does not modify `df` once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Passenger fare'] = df['Passenger fare'].fillna(df['Passenger fare'].mean())

# Select and order the modelling columns. This selection is what discards the
# unused columns (cabin, boat, body, ticket, name, Port of Embarkation,
# Home/Destination), so no separate drop() is needed.
# NOTE(review): 'Age' and 'Passenger Class' are not in this list either, so
# they are removed as well even though they were renamed/imputed above --
# confirm that excluding them from the model is intentional.
new_order = ['sex', 'Survived', 'alone', 'Siblings/spouses aboard', 'Parents/children aboard', 'Passenger fare']
df = df[new_order]

df.head(5)

Unnamed: 0,sex,Survived,alone,Siblings/spouses aboard,Parents/children aboard,Passenger fare
0,1,1,1,0,0,211.3375
1,0,1,0,1,2,151.55
2,1,0,0,1,2,151.55
3,0,0,0,1,2,151.55
4,1,0,0,1,2,151.55


In [38]:
# Star import is kept on purpose: the following cells call compare_models,
# evaluate_model and predict_model straight from this namespace.
from pycaret.classification import *

# Initialise the PyCaret experiment with 'Survived' as the target column.
# session_id pins the random seed so the train/test split and the models are
# reproducible on Restart & Run All; 4980 is the session id shown in this
# notebook's saved output, so the recorded results stay consistent.
s = setup(df, target='Survived', session_id=4980)

Unnamed: 0,Description,Value
0,Session id,4980
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(1309, 6)"
4,Transformed data shape,"(1309, 6)"
5,Transformed train set shape,"(916, 6)"
6,Transformed test set shape,"(393, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


In [44]:
# Keep the model returned by compare_models() in `best`; every cell below
# uses it. NOTE(review): this cell's execution counter (44) is higher than
# those of the cells that follow (40-42), i.e. it was re-run out of order --
# re-execute the notebook top to bottom before trusting the saved outputs.
best = compare_models()

In [40]:
# Show the selected estimator and its hyper-parameters as plain text.
print(best)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=4980, verbose=0, warm_start=False)


In [41]:
# Open PyCaret's interactive evaluation widget for `best` (the saved output
# shows a "Plot Type:" toggle). The widget needs a live kernel, so nothing is
# rendered in a static export of this notebook.
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [42]:
# Disabled: without a `data` argument, predict_model scores the hold-out split created by setup().
#predict_model(best)

In [45]:
# Score the first 10 rows of `df` with the selected model; the result adds
# prediction_label and prediction_score columns next to the true 'Survived'.
# NOTE(review): these rows come from the same frame that was passed to
# setup(), so some of them were probably part of the training split -- treat
# this as a usage demo, not as an out-of-sample accuracy check.
predict_model(best, data=df.head(10))

Unnamed: 0,sex,alone,Siblings/spouses aboard,Parents/children aboard,Passenger fare,Survived,prediction_label,prediction_score
0,1,1,0,0,211.337494,1,1,1.0
1,0,0,1,2,151.550003,1,1,0.5105
2,1,0,1,2,151.550003,0,0,0.82
3,0,0,1,2,151.550003,0,1,0.5105
4,1,0,1,2,151.550003,0,0,0.82
5,0,1,0,0,26.549999,1,0,0.6193
6,1,0,1,0,77.958298,1,1,1.0
7,0,1,0,0,0.0,0,0,0.9225
8,1,0,2,0,51.479198,1,1,0.99
9,0,1,0,0,49.5042,0,0,0.99
