## Building a good explainability pipeline

There is an eye-watering number of explainer-dashboard libraries. It is often worth building a manual baseline first, as a standard of good practice, and then applying various dashboards on top of it for any additional insights.


In [None]:
# catboost performed best on the base dataset. 

In [None]:
import numpy as np 
import seaborn as sns 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from yellowbrick.regressor import residuals_plot, prediction_error
from fast_ml.model_development import train_valid_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from tpot import TPOTRegressor
import category_encoders as ce
import time
from xgboost import XGBRegressor
import warnings
# Registry of the categorical encoders used by the pipelines, keyed by the
# encoder class name so a step can look its encoder up by that string.
encoders = {
    encoder_cls.__name__: encoder_cls
    for encoder_cls in (
        ce.backward_difference.BackwardDifferenceEncoder,
        ce.one_hot.OneHotEncoder,
    )
}


In [None]:
# Load the e-shop clothing CSV and discard the 'Unnamed: 0' column
# (presumably a row index written out with the file — TODO confirm).
clothing_data_df = pd.read_csv(
    './data/e-shop data and description/e-shop clothing 2008.csv',
    sep=',',
).drop(columns=['Unnamed: 0'])

In [None]:
# A categorical column is "many-level" when it has more distinct values than
# half the number of columns in the frame (rounded).
max_levels = round(clothing_data_df.shape[1] * 0.5)

# Every non-numeric column is treated as a categorical feature.
categorical_features = clothing_data_df.select_dtypes(exclude=[np.number]).columns

# One flag per categorical column: does its distinct-value count (as reported
# by Series.unique()) exceed the threshold?
exceeds_threshold = [
    len(clothing_data_df[column].unique()) > max_levels
    for column in categorical_features
]
cats_many = [
    column
    for column, is_many in zip(categorical_features, exceeds_threshold)
    if is_many
]
cats_few = [
    column
    for column, is_many in zip(categorical_features, exceeds_threshold)
    if not is_many
]

# Numeric predictors: all numeric columns except the target 'price'.
numeric_features = (
    clothing_data_df.select_dtypes(include=[np.number])
    .drop(columns=['price'])
    .columns
)

# Separate the predictors from the target column.
X = clothing_data_df.drop('price', axis=1)
y = clothing_data_df['price']

# The model is trained on the natural log of the price.
# NOTE(review): assumes every price is strictly positive; np.log gives
# -inf/NaN otherwise — confirm against the data.
y = np.log(y)

# Fixed seed so the train/validation/test partition (and therefore the fitted
# model and anything explained from it) is the same on every run.
split_seed = 42

# 70 % train; the remaining 30 % is halved into validation and test.
X_train, X_test_tmp, y_train, y_test_tmp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_tmp, y_test_tmp, test_size=0.5, random_state=split_seed
)

# The intermediate hold-out is not needed once it has been split.
del X_test_tmp, y_test_tmp

# XGBoost regressor configured to build histograms on GPU 0.
# NOTE(review): `gpu_hist` / `gpu_id` appear to be deprecated in newer XGBoost
# releases — confirm against the installed version before upgrading.
selected_model = XGBRegressor(
    tree_method="gpu_hist",
    single_precision_histogram=True,
    gpu_id=0,
)


def _categorical_pipeline(encoder_name):
    """Return an impute-then-encode pipeline using the named entry of ``encoders``.

    Missing values are first replaced by the constant string 'missing', then
    the column is encoded by a fresh instance of ``encoders[encoder_name]``.
    """
    return Pipeline(
        [
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', encoders[encoder_name]()),
        ]
    )


# Many-level categoricals get backward-difference encoding; the rest are one-hot encoded.
categorical_transformer_many_level = _categorical_pipeline('BackwardDifferenceEncoder')
categorical_transformer = _categorical_pipeline('OneHotEncoder')

# Numeric columns: mean-impute, then standardise.
numeric_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    [
        ('numerical', numeric_transformer, numeric_features),
        ('categorical_many', categorical_transformer_many_level, cats_many),
        ('categorical', categorical_transformer, cats_few),
    ]
)

# Fit the preprocessing on the training split only, then apply the fitted
# transform to all three splits (train, validation, test — in that order).
preprocessor.fit(X_train, y_train)
X_train_prc, X_val_prc, X_test_prc = (
    preprocessor.transform(split) for split in (X_train, X_val, X_test)
)

# Evaluate RMSE on both the training and validation sets during boosting.
# NOTE(review): passing `eval_metric` to fit() looks deprecated in newer
# XGBoost releases — confirm against the installed version.
evalset = [(X_train_prc, y_train), (X_val_prc, y_val)]
selected_model.fit(
    X_train_prc,
    y_train,
    eval_set=evalset,
    eval_metric='rmse',
    verbose=0,
)

In [None]:
# saving the catboost model as it is by far the best performing
# NOTE(review): `ctb_reg` is not defined in the cells visible here (the model
# fitted in this notebook is `selected_model`) — confirm where it is created.
import os  # used below but absent from the notebook's import cell

# Switch into the artifacts directory so the model file is written there.
# This changes the working directory for every later cell as well.
os.chdir('./artifacts')
ctb_reg.save_model('ctb_reg_clickstream.cbm')