In [70]:
%load_ext autoreload
%autoreload 2

import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ceteris_paribus.explainer import explain
from ceteris_paribus.plots.plots import plot
from ceteris_paribus.profiles import individual_variable_profile
from eli5.sklearn import PermutationImportance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor

from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
def plot_ceteris(data, colname, prefix='titanic_'):
    """Save the ceteris-paribus curve of one variable as a PNG file.

    Parameters
    ----------
    data : pandas.DataFrame
        Profile table. Rows are selected where ``data['_vname_'] == colname``;
        the ``colname`` column supplies the x values and ``'_yhat_'`` the
        y values.
    colname : str
        Variable to plot. Also used in the axis label, the title and the
        file name.
    prefix : str, optional
        Text placed in front of ``colname`` in the output file name. The
        default keeps the historical ``'titanic_'`` naming; pass another
        value when plotting a different dataset so files do not collide.

    Returns
    -------
    str
        Name of the written file, ``prefix + colname + '.png'``.
    """
    profile = data.loc[data['_vname_'] == colname, :]
    filename = '{}{}.png'.format(prefix, colname)

    fig, ax = plt.subplots(figsize=(7, 7))
    try:
        ax.plot(profile[colname], profile['_yhat_'])
        ax.set_xlabel(colname)
        ax.set_ylabel('Model response')
        ax.set_title('Impact of {} on model response'.format(colname))
        fig.savefig(filename, dpi=300)
    finally:
        # Always release the figure, even when plotting or saving fails;
        # this function is called once per feature in a loop.
        plt.close(fig)
    return filename

In [72]:
# Names assigned to the columns of the country file that remain after its
# first field has been turned into the index.
cols = [
    'Country',
    'Quality of Life Index',
    'Purchasing Power Index',
    'Safety Index',
    'Health Care Index',
    'Cost of Living Index',
    'Property Price to Income Ratio',
    'Traffic Commute Time Index',
    'Pollution Index',
    'Climate Index',
]

# `sep` is passed by keyword: positional arguments after the path are
# deprecated in pandas 1.3 and rejected with a TypeError from pandas 2.0.
data = pd.read_csv('country_data.csv', sep='\t', header=None).set_index(0)
data.columns = cols
# Drop the first column ('Country'); everything after it is kept.
data = data.iloc[:, 1:]

In [73]:
# Target is the quality-of-life column; every other column is a feature.
y = data['Quality of Life Index']
X = data.drop('Quality of Life Index', axis=1)
# The regressor is trained on a plain ndarray rather than the DataFrame.
model_country = XGBRegressor().fit(X.values, y)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [74]:
# Permutation importance shuffles columns at random, so the seed is fixed
# to make the reported weights reproducible between runs.
perm = PermutationImportance(model_country, n_iter=3, random_state=0).fit(X, y)
# Feature names are read from X itself so they always match the columns
# that were scored, independent of the global `cols` list.
eli5.show_weights(perm, feature_names=list(X.columns))

Weight,Feature
0.4269  ± 0.0724,Pollution Index
0.3456  ± 0.0645,Purchasing Power Index
0.0400  ± 0.0201,Property Price to Income Ratio
0.0326  ± 0.0153,Safety Index
0.0147  ± 0.0037,Health Care Index
0.0089  ± 0.0018,Traffic Commute Time Index
0.0081  ± 0.0003,Climate Index
0.0067  ± 0.0021,Cost of Living Index


In [75]:
# Magnitude of each permutation importance (the sign is discarded).
importances = np.abs(perm.feature_importances_)

In [76]:
def _predict_country(observations):
    """Return the country regressor's predictions for `observations` as an ndarray input."""
    return model_country.predict(np.array(observations))


# Wrap the fitted regressor together with its data for ceteris-paribus profiling.
explainer_xgb = explain(
    model_country,
    data=np.array(X),
    y=y,
    label='XGBoost',
    variable_names=cols[2:],
    predict_function=_predict_country,
)

In [77]:
# Profile a single observation: the row at position 10 and its target value.
x_example, y_example = np.array(X)[10], np.array(y)[10]
cp_xgb = individual_variable_profile(explainer_xgb, x_example, y_example)

In [78]:
# One saved plot per feature; the returned file names are collected in order.
df = cp_xgb.profile
filenames = [plot_ceteris(df, col) for col in cols[2:]]

In [79]:
# Assemble the per-feature plots into 'countries.png'; importances are
# passed through a square root before being used by gen_picture.
gen_picture(
    filenames,
    importances,
    'countries.png',
    min_imp=0.0002,
    width_pixels=3000,
    max_rect_size=0.4,
    min_rect_size=0.1,
    function=np.sqrt,
)

In [80]:
col_imp = gen_importance_for_cols(data)
# Map each generated plot file to its importance. The iteration variable is
# deliberately not named `cols`: a plain `for cols, ...` loop would overwrite
# the global `cols` list of column names that other cells slice with
# `cols[2:]`, breaking them on any re-run.
file_imp = {
    generate_plot(data, columns): imp
    for columns, imp in col_imp.items()
}

In [83]:
# Plot files (dict keys) and their importances (dict values) are passed as
# parallel arrays; both follow the dict's iteration order.
gen_picture(
    np.array(list(file_imp)),
    np.array(list(file_imp.values())),
    'test.png',
    min_imp=0.8,
    width_pixels=3000,
    max_rect_size=0.4,
    min_rect_size=0.01,
)

In [45]:
data = pd.read_csv('titanic3.csv')
y = data['survived']
# Columns removed before modelling; 'survived' is the target itself.
useless_columns = ['name', 'survived', 'ticket', 'home.dest', 'cabin']
# One-hot encode whatever non-numeric columns remain.
# NOTE(review): the importance table produced further down is dominated by
# `boat_*` dummies — `boat` may encode the outcome (target leakage); confirm
# whether it should stay in the feature set.
X = pd.get_dummies(data.drop(useless_columns, axis=1))
colnames = list(X.columns)
# SimpleImputer returns a bare ndarray, so the column names are re-attached.
X = pd.DataFrame(SimpleImputer().fit_transform(X), columns=colnames)

# Fixed seed so the train/test split, and every number derived from it,
# is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [46]:
# Default-parameter classifier, trained on the raw ndarray of the training split.
model = XGBClassifier()
model.fit(X_train.values, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [47]:
# Score the held-out split using the second column of predict_proba.
preds = model.predict_proba(X_test.values)[:, 1]
print(roc_auc_score(y_true=y_test, y_score=preds))

0.973592091090937


In [48]:
# Permutation importance shuffles columns at random, so the seed is fixed
# to make the reported weights reproducible between runs.
# NOTE(review): importances are computed on the full X, y (training rows
# included) rather than on X_test, y_test — confirm this is intended.
perm = PermutationImportance(model, n_iter=3, random_state=0).fit(X, y)
eli5.show_weights(perm, feature_names=colnames)

Weight,Feature
0.1161  ± 0.0147,sex_female
0.0817  ± 0.0087,pclass
0.0443  ± 0.0062,boat_C
0.0425  ± 0.0094,boat_15
0.0413  ± 0.0043,boat_13
0.0171  ± 0.0044,boat_11
0.0153  ± 0.0012,boat_16
0.0130  ± 0.0012,boat_9
0.0122  ± 0.0033,boat_5
0.0115  ± 0.0022,boat_D


In [53]:
# Magnitude of each permutation importance (the sign is discarded).
importances = np.abs(perm.feature_importances_)

In [50]:
def _predict_positive_proba(observations):
    """Return the second predict_proba column of the classifier for `observations`."""
    return model.predict_proba(np.array(observations))[:, 1]


# Wrap the fitted classifier together with its data for ceteris-paribus profiling.
explainer_xgb = explain(
    model,
    data=np.array(X),
    y=y,
    label='XGBoost',
    variable_names=colnames,
    predict_function=_predict_positive_proba,
)

In [51]:
# Profile a single held-out observation: position 10 of the test split
# together with its label.
ernest, label_ernest = np.array(X_test)[10], np.array(y_test)[10]
cp_xgb = individual_variable_profile(explainer_xgb, ernest, label_ernest)

In [52]:
import matplotlib.pyplot as plt
import numpy as np

# One saved plot per feature; the returned file names are collected in order.
df = cp_xgb.profile
filenames = [plot_ceteris(df, col) for col in colnames]

In [69]:
# Assemble the ten-plot picture 'titanic2.png'; importances are passed
# through a natural logarithm before being used by gen_picture.
gen_picture(
    filenames,
    importances,
    'titanic2.png',
    min_imp=0.0001,
    number_of_plots=10,
    width_pixels=3000,
    max_rect_size=0.4,
    min_rect_size=0.1,
    function=np.log,
)

> /home/aga/visard/packing_tests/utils.py(86)gen_picture()
-> packer, height = pack(sizes_sorted, width)
(Pdb) c
