In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Load the Titanic passenger table from a path relative to the notebook's
# working directory. Every later cell reads (and the next cell reassigns) `df`.
# NOTE(review): legacy .xls files presumably need the optional `xlrd` engine
# installed for pandas to read them - confirm in the project environment.
df = pd.read_excel("../data/titanic3.xls")

In [42]:
# Preprocessing: rename columns, derive binary features, impute, drop unused columns.
#
# NOTE: this cell reassigns `df`, so run it exactly once after the load cell.
# On a second run the 'sex' column already holds 0/1 (every value would become 1)
# and the dropped columns no longer exist (drop would raise KeyError).

# Rename the raw columns to human-readable labels ('sex' keeps its original name).
df = df.rename(columns={
    'pclass': 'Passenger Class',
    'survived': 'Survived',
    'age': 'Age',
    'sibsp': 'Siblings/spouses aboard',
    'parch': 'Parents/children aboard',
    'fare': 'Passenger fare',
    'embarked': 'Port of Embarkation',
    'home.dest': 'Home/Destination',
})

# Binary indicators (vectorised instead of row-wise apply):
#   sex   -> 0 for 'male', 1 for any other value (missing values included)
#   alone -> 1 when the passenger has neither parents/children nor
#            siblings/spouses aboard, else 0
df['sex'] = (df['sex'] != 'male').astype(int)
df['alone'] = (
    (df['Parents/children aboard'] == 0) & (df['Siblings/spouses aboard'] == 0)
).astype(int)

# Fill missing ages and fares with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated and does not modify `df`
# when pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Passenger fare'] = df['Passenger fare'].fillna(df['Passenger fare'].mean())

# Drop columns that are not used as model features.
df = df.drop(columns=[
    'cabin', 'boat', 'body', 'ticket', 'name',
    'Port of Embarkation', 'Home/Destination',
])

In [43]:
# Experiment 1: model comparison using every remaining column of `df` as a feature.
#
# One-off install helper, left disabled.
# NOTE(review): if it is ever needed, prefer `%pip install pycaret==<version>`
# in its own top cell so it targets the kernel's environment.
#!pip install pycaret

# NOTE(review): pandas is imported again here (it is already imported in the
# first cell), and the wildcard import is what supplies setup, compare_models,
# evaluate_model and predict_model to this cell and the later experiment cells.
import pandas as pd
from pycaret.classification import *

# Initialise the experiment on `df` with 'Survived' as the target.
# session_id=123 shows up as random_state=123 in the printed estimator below.
s = setup(df, target = 'Survived', session_id = 123)

# Fit the candidate models, show the comparison table, and keep the top-ranked one.
best = compare_models()

# Plain-text repr of the selected estimator and its hyperparameters.
print(best)

# Opens the interactive plot-selection widget for the selected model.
evaluate_model(best)

# Metrics table for the selected model with no explicit data argument;
# presumably scored on the hold-out split created by setup() - confirm in PyCaret docs.
predict_model(best)

# Score every row of `df`; the returned frame gains `prediction_label` and
# `prediction_score` columns.
# NOTE(review): `df` contains the rows the model was trained on, so the metrics
# printed by this call are optimistic compared with the previous call.
predictions = predict_model(best, data=df)
predictions.head()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(1309, 8)"
4,Transformed data shape,"(1309, 8)"
5,Transformed train set shape,"(916, 8)"
6,Transformed test set shape,"(393, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8078,0.861,0.6571,0.807,0.7217,0.5778,0.587,0.027
lightgbm,Light Gradient Boosting Machine,0.7904,0.8514,0.6971,0.7387,0.7148,0.5499,0.5525,0.06
lr,Logistic Regression,0.7882,0.8369,0.6771,0.7542,0.7095,0.544,0.5496,0.009
lda,Linear Discriminant Analysis,0.7882,0.8354,0.6771,0.7496,0.7091,0.5435,0.5474,0.007
qda,Quadratic Discriminant Analysis,0.7872,0.8321,0.6943,0.7409,0.714,0.545,0.5484,0.006
ridge,Ridge Classifier,0.7871,0.0,0.6743,0.7488,0.7072,0.5409,0.5448,0.006
rf,Random Forest Classifier,0.7861,0.8378,0.7,0.7304,0.7128,0.5428,0.545,0.05
ada,Ada Boost Classifier,0.7827,0.8434,0.7,0.7256,0.7097,0.5367,0.5396,0.026
nb,Naive Bayes,0.7773,0.8173,0.7171,0.7113,0.7112,0.5304,0.5335,0.006
et,Extra Trees Classifier,0.7675,0.8125,0.6771,0.7071,0.6874,0.5032,0.5075,0.045


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.799,0.839,0.6667,0.7752,0.7168,0.5624,0.5662


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8556,0.9094,0.734,0.8676,0.7952,0.6849,0.6906


Unnamed: 0,Passenger Class,sex,Age,Siblings/spouses aboard,Parents/children aboard,Passenger fare,alone,Survived,prediction_label,prediction_score
0,1,1,29.0,0,0,211.337494,1,1,1,0.9601
1,1,0,0.9167,1,2,151.550003,0,1,1,0.8784
2,1,1,2.0,1,2,151.550003,0,0,1,0.5196
3,1,0,30.0,1,2,151.550003,0,0,0,0.8493
4,1,1,25.0,1,2,151.550003,0,0,1,0.9598


In [44]:
# Feature subset for the next experiment: sex, age, class and the
# sibling/spouse count, plus the 'Survived' target column.
selected_columns = [
    'sex',
    'Age',
    'Passenger Class',
    'Siblings/spouses aboard',
    'Survived',
]

# Independent frame holding only the selected columns; `df` itself is left untouched.
df_carrot2 = df.loc[:, selected_columns].copy()



In [45]:
# Experiment 2: same workflow, restricted to the columns held in `df_carrot2`.

# Initialise the experiment on the reduced frame with 'Survived' as the target;
# session_id=123 shows up as random_state=123 in the printed estimator below.
s = setup(df_carrot2, target = 'Survived', session_id = 123)

# Fit the candidate models, show the comparison table, and keep the top-ranked one.
# This rebinds the notebook-global `best`.
best = compare_models()

# Plain-text repr of the selected estimator and its hyperparameters.
print(best)

# Opens the interactive plot-selection widget for the selected model.
evaluate_model(best)

# Metrics table with no explicit data argument; presumably scored on the
# hold-out split created by setup() - confirm in PyCaret docs.
predict_model(best)

# NOTE(review): this scores the full `df`, not `df_carrot2`, so the returned
# frame also carries the columns that were not used for training. `df` includes
# the training rows, so the metrics printed by this call are optimistic.
predictions = predict_model(best, data=df)
predictions.head()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(1309, 5)"
4,Transformed data shape,"(1309, 5)"
5,Transformed train set shape,"(916, 5)"
6,Transformed test set shape,"(393, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.809,0.8524,0.6743,0.7979,0.7277,0.5827,0.5902,0.027
lightgbm,Light Gradient Boosting Machine,0.7937,0.8339,0.66,0.7714,0.707,0.5501,0.5574,0.059
qda,Quadratic Discriminant Analysis,0.7914,0.8381,0.6857,0.7525,0.7148,0.5513,0.5553,0.006
lda,Linear Discriminant Analysis,0.7903,0.833,0.6743,0.7546,0.7101,0.547,0.5509,0.007
ridge,Ridge Classifier,0.7892,0.0,0.6714,0.7539,0.7082,0.5444,0.5484,0.006
lr,Logistic Regression,0.7838,0.8335,0.6686,0.7445,0.7016,0.5333,0.5377,0.008
dt,Decision Tree Classifier,0.7763,0.7767,0.6457,0.7372,0.6857,0.5139,0.5188,0.006
nb,Naive Bayes,0.7696,0.8198,0.6629,0.7212,0.6868,0.5057,0.5102,0.007
ada,Ada Boost Classifier,0.7686,0.8332,0.7057,0.6957,0.698,0.5109,0.5137,0.024
rf,Random Forest Classifier,0.7676,0.8171,0.6629,0.7105,0.6823,0.5002,0.5039,0.051


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7863,0.8355,0.6533,0.7538,0.7,0.5353,0.5386


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8289,0.8849,0.698,0.827,0.757,0.6264,0.6318


Unnamed: 0,Passenger Class,sex,Age,Siblings/spouses aboard,Parents/children aboard,Passenger fare,alone,Survived,prediction_label,prediction_score
0,1,1,29.0,0,0,211.337494,1,1,1,0.9657
1,1,0,0.9167,1,2,151.550003,0,1,1,0.9833
2,1,1,2.0,1,2,151.550003,0,0,0,0.79
3,1,0,30.0,1,2,151.550003,0,0,0,0.676
4,1,1,25.0,1,2,151.550003,0,0,1,0.9715


In [50]:
# Experiment 3: sex, age, class and the parents/children count, plus the target.
selected_columns3 = ['sex', 'Age', 'Passenger Class', 'Parents/children aboard', 'Survived']

# Initialise the experiment on the selected columns of `df` with 'Survived' as
# the target; session_id=123 shows up as random_state=123 in the printed estimator.
s = setup(df[selected_columns3], target = 'Survived', session_id = 123)

# Fit the candidate models, show the comparison table, and keep the top-ranked one.
# This rebinds the notebook-global `best`.
best = compare_models()

# Plain-text repr of the selected estimator and its hyperparameters.
print(best)

# Opens the interactive plot-selection widget for the selected model.
evaluate_model(best)

# Metrics table with no explicit data argument; presumably scored on the
# hold-out split created by setup() - confirm in PyCaret docs.
predict_model(best)

# NOTE(review): this scores the full `df` (all columns, all rows). `df` includes
# the training rows, so the metrics printed by this call are optimistic.
predictions = predict_model(best, data=df)
predictions.head()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(1309, 5)"
4,Transformed data shape,"(1309, 5)"
5,Transformed train set shape,"(916, 5)"
6,Transformed test set shape,"(393, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7948,0.8482,0.6314,0.7927,0.7004,0.5478,0.5578,0.024
qda,Quadratic Discriminant Analysis,0.7937,0.8299,0.6686,0.7659,0.7127,0.5529,0.5571,0.007
lr,Logistic Regression,0.7904,0.8289,0.6771,0.7543,0.7112,0.5478,0.5519,0.009
ridge,Ridge Classifier,0.7893,0.0,0.6743,0.7541,0.7094,0.5452,0.5496,0.007
lda,Linear Discriminant Analysis,0.7893,0.8283,0.6743,0.7541,0.7094,0.5452,0.5496,0.007
nb,Naive Bayes,0.7838,0.8253,0.6714,0.7459,0.7033,0.5345,0.5393,0.006
ada,Ada Boost Classifier,0.7828,0.8355,0.7143,0.7196,0.7136,0.5392,0.5425,0.021
et,Extra Trees Classifier,0.7708,0.7906,0.6314,0.7375,0.6756,0.5008,0.5082,0.043
lightgbm,Light Gradient Boosting Machine,0.7707,0.8314,0.6171,0.7461,0.6713,0.4983,0.5067,0.07
dt,Decision Tree Classifier,0.7697,0.7739,0.6229,0.7354,0.6717,0.4969,0.5031,0.007


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7735,0.8344,0.5933,0.7607,0.6667,0.4991,0.508


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8189,0.8784,0.654,0.8363,0.734,0.5999,0.6103


Unnamed: 0,Passenger Class,sex,Age,Siblings/spouses aboard,Parents/children aboard,Passenger fare,alone,Survived,prediction_label,prediction_score
0,1,1,29.0,0,0,211.337494,1,1,1,0.9544
1,1,0,0.9167,1,2,151.550003,0,1,1,0.9861
2,1,1,2.0,1,2,151.550003,0,0,0,0.7087
3,1,0,30.0,1,2,151.550003,0,0,0,0.728
4,1,1,25.0,1,2,151.550003,0,0,1,0.975


In [48]:
# Experiment 4: sex, age, class and passenger fare, plus the target.
selected_columns4 = ['sex', 'Age', 'Passenger Class', 'Passenger fare', 'Survived']


# Initialise the experiment on the selected columns of `df` with 'Survived' as
# the target; session_id=123 shows up as random_state=123 in the printed estimator.
s = setup(df[selected_columns4], target = 'Survived', session_id = 123)

# Fit the candidate models, show the comparison table, and keep the top-ranked one.
# This rebinds the notebook-global `best`.
best = compare_models()

# Plain-text repr of the selected estimator and its hyperparameters.
print(best)

# Opens the interactive plot-selection widget for the selected model.
evaluate_model(best)

# Metrics table with no explicit data argument; presumably scored on the
# hold-out split created by setup() - confirm in PyCaret docs.
predict_model(best)

# NOTE(review): this scores the full `df` (all columns, all rows). `df` includes
# the training rows, so the metrics printed by this call are optimistic.
predictions = predict_model(best, data=df)
predictions.head()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(1309, 5)"
4,Transformed data shape,"(1309, 5)"
5,Transformed train set shape,"(916, 5)"
6,Transformed test set shape,"(393, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8046,0.8637,0.6543,0.7996,0.7178,0.571,0.5791,0.027
lightgbm,Light Gradient Boosting Machine,0.7991,0.8575,0.6943,0.7622,0.724,0.5671,0.571,0.05
ada,Ada Boost Classifier,0.7903,0.84,0.6943,0.7448,0.7162,0.5506,0.5538,0.025
lr,Logistic Regression,0.7838,0.828,0.6771,0.7414,0.7051,0.5354,0.5392,0.012
lda,Linear Discriminant Analysis,0.7805,0.8279,0.6743,0.7349,0.7011,0.5285,0.5317,0.006
et,Extra Trees Classifier,0.7795,0.8144,0.6857,0.7245,0.702,0.5276,0.5305,0.043
ridge,Ridge Classifier,0.7794,0.0,0.6714,0.734,0.6991,0.5259,0.5291,0.007
rf,Random Forest Classifier,0.7774,0.84,0.68,0.7213,0.698,0.5223,0.5247,0.052
dt,Decision Tree Classifier,0.7697,0.7614,0.6914,0.7009,0.6954,0.5104,0.5111,0.006
nb,Naive Bayes,0.7631,0.8057,0.6971,0.6928,0.6923,0.5002,0.5028,0.007


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7888,0.8408,0.6467,0.7638,0.7004,0.539,0.5435


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8495,0.9081,0.732,0.8531,0.7879,0.6724,0.6771


Unnamed: 0,Passenger Class,sex,Age,Siblings/spouses aboard,Parents/children aboard,Passenger fare,alone,Survived,prediction_label,prediction_score
0,1,1,29.0,0,0,211.337494,1,1,1,0.9552
1,1,0,0.9167,1,2,151.550003,0,1,1,0.8607
2,1,1,2.0,1,2,151.550003,0,0,0,0.5865
3,1,0,30.0,1,2,151.550003,0,0,0,0.8505
4,1,1,25.0,1,2,151.550003,0,0,1,0.9557
