# Explanatory Model Analysis - code snippets for Python

## 4.3 Models for RMS Titanic

In [None]:
import dalex as dx
# Load the Titanic data shipped with dalex, then separate the
# `survived` target (y) from the remaining columns (X).
titanic = dx.datasets.load_titanic()
y = titanic['survived']
X = titanic.drop(columns=['survived'])

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Preprocessing shared by all Titanic pipelines: standardise the
# numeric columns and one-hot encode the categorical ones.
preprocess = make_column_transformer(
    (StandardScaler(), ['age', 'fare', 'parch', 'sibsp']),
    (OneHotEncoder(), ['gender', 'class', 'embarked']),
)

### 4.3.1 Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Pipeline: shared preprocessing, then a logistic regression with an
# L2 penalty; fitted on the Titanic X, y.
titanic_lr = make_pipeline(preprocess, LogisticRegression(penalty='l2'))
titanic_lr.fit(X, y)

### 4.3.2 Random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Pipeline: shared preprocessing, then a random forest of 500 trees
# with depth limited to 3; fitted on the Titanic X, y.
titanic_rf = make_pipeline(
    preprocess,
    RandomForestClassifier(n_estimators=500, max_depth=3),
)
titanic_rf.fit(X, y)

### 4.3.3 Gradient boosting model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Pipeline: shared preprocessing, then gradient boosting with
# 100 estimators; fitted on the Titanic X, y.
titanic_gbc = make_pipeline(
    preprocess,
    GradientBoostingClassifier(n_estimators=100),
)
titanic_gbc.fit(X, y)

### 4.3.4 Support vector machine model

In [None]:
from sklearn.svm import SVC

# Pipeline: shared preprocessing, then a support vector classifier.
# probability=True is set because `predict_proba` is called on this
# model further down.
titanic_svm = make_pipeline(preprocess, SVC(probability=True))
titanic_svm.fit(X, y)

### 4.3.5 Models’ predictions

In [None]:
import pandas as pd

# One-row data frame describing the passenger "Johnny D"; the column
# names are the ones referenced by the preprocessing step.
johnny_d = pd.DataFrame(
    {
        'gender': ['male'],
        'age': [8],
        'class': ['1st'],
        'embarked': ['Southampton'],
        'fare': [72],
        'sibsp': [0],
        'parch': [0],
    },
    index=['JohnnyD'],
)

In [None]:
# Print the `predict_proba` output of each of the four fitted
# pipelines for Johnny D.
print(f'''The predicted probability of survival for Johnny D
logistic regression model: {titanic_lr.predict_proba(johnny_d)}
random forest model: {titanic_rf.predict_proba(johnny_d)}
gradient boosting model: {titanic_gbc.predict_proba(johnny_d)}
support vector machine model: {titanic_svm.predict_proba(johnny_d)}''')

In [None]:
# One-row data frame describing the passenger "Henry".
henry = pd.DataFrame(
    {
        'gender': ['male'],
        'age': [47],
        'class': ['1st'],
        'embarked': ['Cherbourg'],
        'fare': [25],
        'sibsp': [0],
        'parch': [0],
    },
    index=['Henry'],
)


# Print the `predict_proba` output of each of the four fitted
# pipelines for Henry.
print(f'''The predicted probability of survival for Henry
logistic regression model: {titanic_lr.predict_proba(henry)}
random forest model: {titanic_rf.predict_proba(henry)}
gradient boosting model: {titanic_gbc.predict_proba(henry)}
support vector machine model: {titanic_svm.predict_proba(henry)}''')

### 4.3.6 Models’ explainers

In [None]:
# Wrap each fitted Titanic pipeline, together with X and y, in a
# dalex Explainer; the label identifies the model.
titanic_rf_exp = dx.Explainer(titanic_rf, X, y, label="Titanic RF Pipeline")
titanic_lr_exp = dx.Explainer(titanic_lr, X, y, label="Titanic LR Pipeline")
titanic_gbc_exp = dx.Explainer(titanic_gbc, X, y, label="Titanic GBC Pipeline")
titanic_svm_exp = dx.Explainer(titanic_svm, X, y, label="Titanic SVM Pipeline")

## 4.6 Models for apartment prices

In [None]:
import dalex as dx
# Load the apartments data shipped with dalex, then separate the
# `m2_price` target (y) from the remaining columns (X).
apartments = dx.datasets.load_apartments()
y = apartments['m2_price']
X = apartments.drop(columns=['m2_price'])

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Preprocessing shared by all apartment pipelines: standardise the
# numeric columns and one-hot encode `district`.
preprocess = make_column_transformer(
    (StandardScaler(), ['construction_year', 'surface', 'floor', 'no_rooms']),
    (OneHotEncoder(), ['district']),
)

### 4.6.1 Linear regression model

In [None]:
from sklearn.linear_model import LinearRegression

# Pipeline: shared preprocessing, then ordinary linear regression;
# fitted on the apartments X, y.
apartments_lm = make_pipeline(preprocess, LinearRegression())
apartments_lm.fit(X, y)

### 4.6.2 Random forest model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Pipeline: shared preprocessing, then a random-forest regressor of
# 500 trees with depth limited to 3; fitted on the apartments X, y.
apartments_rf = make_pipeline(
    preprocess,
    RandomForestRegressor(n_estimators=500, max_depth=3),
)
apartments_rf.fit(X, y)

### 4.6.3 Support vector machine model

In [None]:
from sklearn.svm import SVR

# Pipeline: shared preprocessing, then a support vector regressor
# with default settings; fitted on the apartments X, y.
apartments_svm = make_pipeline(preprocess, SVR())
apartments_svm.fit(X, y)

### 4.6.4 Models’ predictions

In [None]:
# Load the apartments test set and remove the target column so that
# only predictors are passed to the models.
apartments_test = dx.datasets.load_apartments_test().drop(columns='m2_price')

# Linear-model predictions for the test set (shown as the cell output).
apartments_lm.predict(apartments_test)

In [None]:
# Random-forest predictions for the apartments test set.
apartments_rf.predict(apartments_test)

In [None]:
# Support-vector-machine predictions for the apartments test set.
apartments_svm.predict(apartments_test)

### 4.6.5 Models’ explainers

In [None]:
# Wrap each fitted apartments pipeline, together with X and y, in a
# dalex Explainer; the label identifies the model.
apartments_lm_exp = dx.Explainer(
    apartments_lm, X, y, label="Apartments LM Pipeline")
apartments_rf_exp = dx.Explainer(
    apartments_rf, X, y, label="Apartments RF Pipeline")
apartments_svm_exp = dx.Explainer(
    apartments_svm, X, y, label="Apartments SVM Pipeline")

## 6.7 Break-down Plots for Additive Attributions

In [None]:
import pandas as pd
# One-row data frame describing the passenger "Henry", the instance
# explained in this section.
henry = pd.DataFrame(
    {
        'gender': ['male'],
        'age': [47],
        'class': ['1st'],
        'embarked': ['Cherbourg'],
        'fare': [25],
        'sibsp': [0],
        'parch': [0],
    },
    index=['Henry'],
)

In [None]:
# Break-down attribution of the random-forest prediction for Henry;
# the `result` attribute is shown as the cell output.
bd_henry = titanic_rf_exp.predict_parts(henry, type='break_down')
bd_henry.result

In [None]:
# Plot the break-down explanation computed above.
bd_henry.plot()

In [None]:
import numpy as np

# Break-down attribution with an explicitly supplied variable order;
# the plot is called with max_vars=5.
bd_henry = titanic_rf_exp.predict_parts(
    henry,
    type='break_down',
    order=np.array(['gender', 'class', 'age', 'embarked',
                    'fare', 'sibsp', 'parch']),
)
bd_henry.plot(max_vars=5)

## 7.6 Break-down Plots for Interactions

In [None]:
import pandas as pd
# One-row data frame describing the passenger "Henry", the instance
# explained in this section.
henry = pd.DataFrame(
    {
        'gender': ['male'],
        'age': [47],
        'class': ['1st'],
        'embarked': ['Cherbourg'],
        'fare': [25],
        'sibsp': [0],
        'parch': [0],
    },
    index=['Henry'],
)

In [None]:
# Break-down attribution that also considers interactions, with
# interaction_preference set to 10; `result` is the cell output.
bd_henry = titanic_rf_exp.predict_parts(
    henry,
    type='break_down_interactions',
    interaction_preference=10,
)
bd_henry.result

In [None]:
# Plot the break-down-with-interactions explanation computed above.
bd_henry.plot()

## 8.6 Shapley Additive Explanations (SHAP) for Average Attributions

In [None]:
import pandas as pd
# One-row data frame describing the passenger "Henry", the instance
# explained in this section.
henry = pd.DataFrame(
    {
        'gender': ['male'],
        'age': [47],
        'class': ['1st'],
        'embarked': ['Cherbourg'],
        'fare': [25],
        'sibsp': [0],
        'parch': [0],
    },
    index=['Henry'],
)

In [None]:
# SHAP-type attribution of the random-forest prediction for Henry
# (the variable name `bd_henry` is reused from the break-down cells).
bd_henry = titanic_rf_exp.predict_parts(henry, type='shap')
bd_henry.result

In [None]:
# Plot the SHAP explanation computed above.
bd_henry.plot()

## 9.7 Local Interpretable Model-agnostic Explanations (LIME)

In [None]:
# Reload the Titanic data and rebuild X, y for the LIME example.
titanic = dx.datasets.load_titanic()
y = titanic['survived']
X = titanic.drop(columns=['survived'])

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Replace each categorical column with integer codes. The same
# encoder object is refitted for every column, so after the loop it
# holds only the mapping of the last column ('embarked').
for col_name in ('gender', 'class', 'embarked'):
    X[col_name] = le.fit_transform(X[col_name])

In [None]:
from sklearn.ensemble import RandomForestClassifier as rfc
# Random forest with default settings, fitted directly on X and y
# (no preprocessing pipeline is used for this model).
titanic_fr = rfc()
titanic_fr.fit(X, y)

In [None]:
import pandas as pd
# Henry as a numeric Series, one value per column of X.
# NOTE(review): the codes for gender / class / embarked are presumably
# the LabelEncoder codes for male / 1st / Cherbourg; confirm.
henry = pd.Series(
    [1, 47.0, 0, 1, 25.0, 0, 0],
    index=['gender', 'age', 'class', 'embarked', 'fare', 'sibsp', 'parch'],
)

In [None]:
from lime.lime_tabular import LimeTabularExplainer 
# LIME explainer for tabular data, built from X with its column names
# as feature names; continuous features are not discretized.
explainer = LimeTabularExplainer(
    X,
    feature_names=X.columns,
    class_names=['died', 'survived'],
    discretize_continuous=False,
    verbose=True,
)

In [None]:
# Explain the random forest's `predict_proba` at the instance `henry`
# and render the explanation, including its table, in the notebook.
lime = explainer.explain_instance(henry, titanic_fr.predict_proba)
lime.show_in_notebook(show_table=True)

## 10.7 Ceteris-paribus Profiles

In [None]:
# One-row data frame describing the passenger "Henry" (original,
# un-encoded values), the instance profiled in this section.
henry = pd.DataFrame(
    {
        'gender': ['male'],
        'age': [47],
        'class': ['1st'],
        'embarked': ['Cherbourg'],
        'fare': [25],
        'sibsp': [0],
        'parch': [0],
    },
    index=['Henry'],
)

In [None]:
# Ceteris-paribus profiles of the random-forest model for Henry;
# the `result` attribute is shown as the cell output.
cp_henry = titanic_rf_exp.predict_profile(henry)
cp_henry.result

In [None]:
# Plot the profiles for the variables `age` and `fare` only.
cp_henry.plot(variables=['age', 'fare'])

In [None]:
# Plot the profiles for `class` and `embarked`, treated as categorical.
cp_henry.plot(variables=['class', 'embarked'], variable_type='categorical')

In [None]:
# Profiles of the logistic-regression model for the same instance,
# passed to the random-forest profiles' plot so both are drawn together.
cp_henry2 = titanic_lr_exp.predict_profile(henry)
cp_henry.plot(cp_henry2, variables=['age', 'fare'])

## 15.7 Model-performance Measures 

In [None]:
# Performance measures of the random-forest explainer, computed as a
# classification task with a cutoff of 0.5.
mp_rf = titanic_rf_exp.model_performance(
    model_type="classification",
    cutoff=0.5,
)
mp_rf.result

In [None]:
# Rebuild X and y from the `titanic` frame, restoring the original
# (un-encoded) predictor columns.
y = titanic['survived']
X = titanic.drop(columns=['survived'])

In [None]:
import plotly.express as px
from sklearn.metrics import roc_curve, auc
# Scores predicted through the random-forest explainer, and the ROC
# curve they produce against the observed y.
y_score = titanic_rf_exp.predict(X)
fpr, tpr, thresholds = roc_curve(y, y_score)

# Area chart of TPR against FPR, with the AUC reported in the title.
fig = px.area(
    x=fpr,
    y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700,
    height=500,
)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

In [None]:
# Table of FPR and TPR indexed by the ROC thresholds; the index and
# column names are set explicitly before plotting.
df = pd.DataFrame(
    {'False Positive Rate': fpr, 'True Positive Rate': tpr},
    index=thresholds,
)
df.index.name = "Thresholds"
df.columns.name = "Rate"

# Line chart of both rates, x-axis limited to [0, 1].
fig_thresh = px.line(
    df,
    title='TPR and FPR at every threshold',
    width=700,
    height=500,
)
fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

## 16.7 Variable-importance Measures

In [None]:
# Variable-importance measures for the random-forest explainer
# (default arguments); `result` is shown as the cell output.
mp_rf = titanic_rf_exp.model_parts()
mp_rf.result

In [None]:
# Plot the variable-importance measures computed above.
mp_rf.plot()

In [None]:
# Importance computed for two named groups of variables rather than
# for individual variables.
vi_grouped = titanic_rf_exp.model_parts(
    variable_groups={
        'personal': ['gender', 'age', 'sibsp', 'parch'],
        'wealth': ['class', 'fare'],
    },
)
vi_grouped.result

In [None]:
# Plot the grouped variable-importance measures.
vi_grouped.plot()

## 17.7 Partial-dependence Profiles

In [None]:
# Model profiles of the random forest for `age` and `fare`;
# `result` is shown as the cell output.
pd_rf = titanic_rf_exp.model_profile(variables=['age', 'fare'])
pd_rf.result

In [None]:
# Plot the profiles computed above with the default settings.
pd_rf.plot()

In [None]:
# Same plot, drawn with the 'profiles' geometry.
pd_rf.plot(geom='profiles')

In [None]:
# Profiles for categorical variables; only `gender` and `class`
# are plotted.
mp_rf = titanic_rf_exp.model_profile(variable_type='categorical')
mp_rf.plot(variables=['gender', 'class'])

### 17.7.1 Grouped partial-dependence profiles

In [None]:
# Profiles for `age` and `fare`, computed separately for the groups
# defined by `class`.
mp_rf = titanic_rf_exp.model_profile(
    variables=['age', 'fare'],
    groups='class',
)
mp_rf.plot()

### 17.7.2 Contrastive partial-dependence profiles

In [None]:
# Profiles with default arguments for two models, random forest and
# logistic regression, so that they can be compared.
pdp_rf = titanic_rf_exp.model_profile()
pdp_lr = titanic_lr_exp.model_profile()

In [None]:
# Draw both models' profiles together for `age` and `fare`.
pdp_rf.plot(pdp_lr, variables=['age', 'fare'])

## 18.7 Local-dependence and Accumulated-local Profiles

In [None]:
# Profiles of type 'conditional', relabelled as 'LD profiles'
# (presumably the label shown when plotted next to other profiles).
ld_rf = titanic_rf_exp.model_profile(type='conditional')
ld_rf.result['_label_'] = 'LD profiles'
ld_rf.result

In [None]:
# Plot the conditional profiles for `age` and `fare`.
ld_rf.plot(variables=['age', 'fare'])

In [None]:
# Profiles of type 'accumulated', relabelled as 'AL profiles'.
al_rf = titanic_rf_exp.model_profile(type='accumulated')
al_rf.result['_label_'] = 'AL profiles'

In [None]:
# Draw the accumulated and conditional profiles together for
# `age` and `fare`.
al_rf.plot(ld_rf, variables=['age', 'fare'])

## 19.7 Residual-diagnostics Plots

In [None]:
# Residual diagnostics for the apartments random-forest explainer;
# `result` is shown as the cell output.
md_rf = apartments_rf_exp.model_diagnostics()
md_rf.result

In [None]:
# Plot the diagnostics with the default variables.
md_rf.plot()

In [None]:
# Plot `abs_residuals` (y-axis) against `ids` (x-axis).
md_rf.plot(variable="ids", yvariable="abs_residuals")

## 21 FIFA 19 

### 21.2.2 Data preparation

In [None]:
import dalex as dx
# Load the FIFA data set shipped with dalex.
fifa = dx.datasets.load_fifa()

### 21.4.2 Model assembly

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
import numpy as np

# Predictors: every column except nationality, overall, potential,
# value_eur and wage_eur.
X = fifa.drop(columns=["nationality", "overall", "potential",
                       "value_eur", "wage_eur"])
# Target on its original scale, and the log-transformed target that the
# model is actually trained on.
y = fifa['value_eur']
ylog = np.log(y)

# Split predictors, log-target and raw target in a single call so that
# the three stay row-aligned; random_state fixes the split.
X_train, X_test, ylog_train, ylog_test, y_train, y_test = train_test_split(
    X, ylog, y, test_size=0.25, random_state=4)

# Verbosity is set on the estimator rather than in `fit()`: the `verbose`
# keyword of `fit()` was removed in LightGBM 4.0, so passing it there
# raises a TypeError on current releases.
gbm_model = LGBMRegressor(verbose=-1)
gbm_model.fit(X_train, ylog_train)

In [None]:
def predict_function(model, data):
    """Return ``model.predict(data)`` passed through ``np.exp``.

    The model in this file is fitted on ``np.log`` of the target, so
    exponentiating its predictions puts them back on the target's
    original scale.
    """
    log_scale_prediction = model.predict(data)
    return np.exp(log_scale_prediction)
    
# Explainer built on the test split and the raw-scale target, using the
# custom predict function defined in this cell.
fifa_gbm_exp = dx.Explainer(
    gbm_model,
    X_test,
    y_test,
    predict_function=predict_function,
    label='gbm',
)

### 21.5.2 Model audit

In [None]:
# Model diagnostics for the FIFA explainer, plotted as `y_hat`
# (y-axis) against `y` (x-axis).
fifa_md_gbm = fifa_gbm_exp.model_diagnostics()
fifa_md_gbm.plot(variable="y", yvariable="y_hat")

### 21.6.2 Model understanding (dataset-level explanations)

In [None]:
# Variable-importance measures for the FIFA explainer; the plot is
# called with max_vars=20.
fifa_mp_gbm = fifa_gbm_exp.model_parts()
fifa_mp_gbm.plot(max_vars=20)

In [None]:
# Model profiles with default arguments, plotted for four selected
# variables (the name `fifa_mp_gbm` is reused from the previous cell).
fifa_mp_gbm = fifa_gbm_exp.model_profile()

fifa_mp_gbm.plot(
    variables=[
        'movement_reactions',
        'skill_ball_control',
        'skill_dribbling',
        'age',
    ],
)

In [None]:
# Profiles of type 'accumulated', plotted for the same four variables.
fifa_mp_gbm = fifa_gbm_exp.model_profile(type='accumulated')

fifa_mp_gbm.plot(
    variables=[
        'movement_reactions',
        'skill_ball_control',
        'skill_dribbling',
        'age',
    ],
)

### 21.7.3 Instance-level explanations

In [None]:
# Select the row labelled 'Cristiano Ronaldo' from X, the full
# predictor table built before the train/test split.
cr7 = X.loc['Cristiano Ronaldo',]

In [None]:
# Break-down attribution of the model's prediction for `cr7`;
# the plot is called with max_vars=20.
fifa_pp_gbm = fifa_gbm_exp.predict_parts(cr7, type='break_down')
fifa_pp_gbm.plot(max_vars=20)

In [None]:
# Instance-level profiles for `cr7`, plotted for four selected
# variables (stored under the reused name `fifa_mp_gbm`).
fifa_mp_gbm = fifa_gbm_exp.predict_profile(cr7)

fifa_mp_gbm.plot(
    variables=[
        'movement_reactions',
        'skill_ball_control',
        'skill_dribbling',
        'age',
    ],
)