# Import, loading data and understand the problem 🚤

In [1]:
import matplotlib.pyplot as plt
import dask.dataframe as dd
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

  from pandas import MultiIndex, Int64Index


In [2]:
dd_train = dd.read_csv("data/titanic/train.csv")
dd_train

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,int64,int64,int64,object,object,float64,int64,int64,object,float64,object,object
,...,...,...,...,...,...,...,...,...,...,...,...


The goal is to predict the survived column, so if the passenger survived or not. 

# Cleaning the data and prepare them 🧼 

In [3]:
dd_train['Cabin'] = dd_train['Cabin'].fillna('U')
dd_train['Cabin'] = dd_train['Cabin'].str.get(0)
dd_train['Cabin'] = dd_train['Cabin'].where(dd_train['Cabin'] != 'U', 'Unknown')

#convert categorical to female or not
dd_train['isfemale'] = (dd_train['Sex'] == 'female').astype(int)
dd_train['Pclass'] = dd_train['Pclass'].where(dd_train['Pclass'] == 1, 'First').where(dd_train['Pclass'] == 2, 'Second').where(dd_train['Pclass'] == 3, 'Third')

dd_train['Embarked'] = dd_train['Embarked'].fillna('Unknown')

dd_train['Age'] = dd_train['Age'].fillna(dd_train['Age'].mean().compute())

dd_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isfemale
0,1,0,Second,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,0
1,2,1,Third,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,1
2,3,1,Second,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,1
3,4,1,Third,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S,1
4,5,0,Second,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,0


In [4]:
# map categorical features
dd_train.dropna() 

features = [feat for feat in list(dd_train) 
            if feat != 'Survived']
categorical_features = np.where(dd_train[features].dtypes != np.float64)[0]
 
titanic_catboost_df = dd_train.compute()
X_train, X_test, y_train, y_test = train_test_split(titanic_catboost_df[features], 
                                                    titanic_catboost_df[['Survived']], 
                                                    test_size=0.2, 
                                                     random_state=1)
X_test, X_val, y_test, y_val= train_test_split(X_test, 
                                                    y_test, 
                                                    test_size=0.7, 
                                                     random_state=1)
print("Categorical :",categorical_features)

Categorical : [ 0  1  2  3  5  6  7  9 10 11]


# Algo Choice
We choose to use catboost because it can handle pretty simply our categorical data

# Model Building 🏗️

In [5]:

params = {'iterations':5000,
        'learning_rate':0.005,
        'cat_features':categorical_features,
        'depth':3,
        'eval_metric':'AUC',
        'verbose':200,
        'od_type':"Iter", # overfit detector
        'od_wait':500, # most recent best iteration to wait before stopping
        'random_seed': 1
          }

cat_model = CatBoostClassifier(**params)
cat_model.fit(X_train, y_train,   
          eval_set=(X_val, y_val), 
          use_best_model=True, # True if we don't want to save trees created after iteration with the best validation score
          plot=True  
         )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.8426455	best: 0.8426455 (0)	total: 53ms	remaining: 4m 25s
200:	test: 0.8799376	best: 0.8799376 (199)	total: 384ms	remaining: 9.18s
400:	test: 0.9082640	best: 0.9082640 (397)	total: 726ms	remaining: 8.33s
600:	test: 0.9132017	best: 0.9132017 (600)	total: 1.08s	remaining: 7.91s
800:	test: 0.9141112	best: 0.9147609 (770)	total: 1.41s	remaining: 7.4s
1000:	test: 0.9143711	best: 0.9147609 (770)	total: 1.75s	remaining: 7s
1200:	test: 0.9112526	best: 0.9147609 (770)	total: 2.1s	remaining: 6.65s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.9147609148
bestIteration = 770

Shrink model to first 771 iterations.


<catboost.core.CatBoostClassifier at 0x7fc6d3744b20>

# Modele Evaluation 📝

In [6]:
from sklearn.metrics import recall_score, accuracy_score
y_pred = cat_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test Recall:", recall_score(y_test, y_pred))

Test Accuracy: 0.7169811320754716
Test Recall: 0.5714285714285714


# Improve our model ⚒️
Non traité ici 

# Explain our model 🎙️
We use shap values for provide a unified and consistent explanation of a prediction made by a machine learning model.  
It help us to understand the contribution value to each feature towards the final prediction and so identify any biases in the model.  
Shap Values are model-agnostic meaning they can be applied to any machine learning model. 
Shap values are consistency, locality and symmetry. 

In [7]:
import shap 
from catboost import Pool
shap_values = cat_model.get_feature_importance(Pool(X_test, label=y_test,cat_features=categorical_features) ,
                                               type="ShapValues")
 
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]

shap.initjs()
shap.force_plot(expected_value, shap_values[0,:], X_test.iloc[0,:])

SystemError: initialization of _internal failed without raising an exception

In [None]:
shap.summary_plot(shap_values, X_test)

NameError: name 'shap' is not defined