## Titanic survival

### Read data

In [1]:
import pandas as pd
from ceteris_paribus.datasets import DATASETS_DIR
import os
df = pd.read_csv(os.path.join(DATASETS_DIR, 'titanic_train.csv'))

In [2]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
y = df['Survived']
x = df.drop(['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket'], inplace=False, axis=1)

In [4]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [5]:
valid = x['Age'].isnull() | x['Embarked'].isnull()
x = x[-valid]
y = y[-valid]

In [6]:
x['Pclass'] = x['Pclass'].astype('float64')
x['SibSp'] = x['SibSp'].astype('float64')
x['Parch'] = x['Parch'].astype('float64')

In [7]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3.0,male,22.0,1.0,0.0,7.25,S
1,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,female,26.0,0.0,0.0,7.925,S
3,1.0,female,35.0,1.0,0.0,53.1,S
4,3.0,male,35.0,0.0,0.0,8.05,S


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Building the models

In [53]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_features = ['Embarked', 'Sex']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [54]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [55]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
xgb_clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', XGBClassifier())])

In [107]:
rf_clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators=100, min_samples_leaf=2))])

In [108]:
linear_clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])

### Train the models

In [109]:
xgb_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['Pclass', 'Age', 'SibSp', 'Parch...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [110]:
rf_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['Pclass', 'Age', 'SibSp', 'Parch...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [111]:
linear_clf.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['Pclass', 'Age', 'SibSp', 'Parch...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

### Evaluate the models

In [112]:
xgb_clf.classes_

array([0, 1])

In [113]:
from sklearn.metrics import accuracy_score
print("XGB {}".format(accuracy_score(y_test, xgb_clf.predict(X_test))))
print("Random Forest {}".format(accuracy_score(y_test, rf_clf.predict(X_test))))
print("Linear {}".format(accuracy_score(y_test, linear_clf.predict(X_test))))

XGB 0.8041958041958042
Random Forest 0.7832167832167832
Linear 0.7972027972027972


### Explain the models

In [114]:
from ceteris_paribus.explainer import explain

explainer_xgb = explain(xgb_clf, data=x, y=y, label='XGBoost', predict_function=lambda X: xgb_clf.predict_proba(X)[::, 1])
explainer_rf = explain(rf_clf, data=x, y=y, label='RandomForest', predict_function=lambda X: rf_clf.predict_proba(X)[::, 1])
explainer_linear = explain(linear_clf, data=x, y=y, label='LogisticRegression', predict_function=lambda X: linear_clf.predict_proba(X)[::, 1])

##### Ernest James Crease

In [115]:
import warnings
import sklearn
warnings.filterwarnings("ignore", category=sklearn.exceptions.DataConversionWarning)
ernest = X_test.iloc[10]
label_ernest = y_test.iloc[10]
print("Referenced observation \n{}".format(ernest))
from ceteris_paribus.profiles import individual_variable_profile
cp_xgb = individual_variable_profile(explainer_xgb, ernest, label_ernest)
cp_rf = individual_variable_profile(explainer_rf, ernest, label_ernest)
cp_linear = individual_variable_profile(explainer_linear, ernest, label_ernest)

Referenced observation 
Pclass           3
Sex           male
Age             19
SibSp            0
Parch            0
Fare        8.1583
Embarked         S
Name: 67, dtype: object


In [116]:
from ceteris_paribus.plots.plots import plot_notebook, plot

In [117]:
plot_notebook(cp_xgb, selected_variables=["Age"], width=700, height=800, show_rugs=True, size=4)

In [120]:
plot_notebook(cp_xgb, cp_rf, cp_linear, selected_variables=["Age"], width=650, height=800, size=3)

##### Miss. Elizabeth Mussey Eustis

In [30]:
elizabeth = X_test.iloc[1]
print(elizabeth)
label_elizabeth = y_test.iloc[1]
cp_xgb_2 = individual_variable_profile(explainer_xgb, elizabeth, label_elizabeth)

Pclass            1
Sex          female
Age              54
SibSp             1
Parch             0
Fare        78.2667
Embarked          C
Name: 496, dtype: object


In [31]:
plot_notebook(cp_xgb_2, selected_variables=["Pclass", "Sex", "Age", "Embarked"], width=900, height=1000, size=4)

In [32]:
from ceteris_paribus.select_data import select_neighbours

In [33]:
neighbours = select_neighbours(X_train, elizabeth, selected_variables=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], n=15)

In [34]:
cp_xgb_ns = individual_variable_profile(explainer_xgb, neighbours)

In [35]:
plot_notebook(cp_xgb_ns, color="Sex", selected_variables=["Pclass", "Age"],
              height=600, width=1000, 
              aggregate_profiles='mean', size_pdps=6, alpha_pdps=1, size=2)