In [None]:
#!pip install lime
#!pip install shap
#!pip install anchor-exp
#!pip install hyperopt

import pandas as pd
import numpy as np

import xgboost as xgb

from hyperopt import hp
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


import os
import joblib

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from matplotlib.pyplot import figure
import matplotlib.image as mpimg
import pylab as pl
from pylab import savefig
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so pick whichever spelling this
# installation provides (falling back to the default style if neither exists).
_style_candidates = ("seaborn-v0_8-deep", "seaborn-deep")
plt.style.use(next((name for name in _style_candidates if name in plt.style.available), "default"))

import stability as st

import statistics
import scipy as scp
import math

import lime
import lime.lime_tabular

import shap

from anchor import anchor_tabular

import time
import random

In [None]:
# path to project folder
# please change to your own
PATH = os.getcwd()

# Which dataset / model family this run works on.
dataset = "income"
cls_method = "xgboost"

# True -> classification task, False -> regression task.
classification = True

# Target-column name for the datasets that declare one. Datasets without an
# entry (e.g. "income") get None, so `class_var` is always bound instead of
# being left undefined.
_CLASS_VAR_BY_DATASET = {
    "diabetes": "Outcome",
    "breast_cancer": "diagnosis",
}
class_var = _CLASS_VAR_BY_DATASET.get(dataset)

random_state = 39
num_eval = 500
n_splits = 3

# Seed both RNGs: the stdlib generator and NumPy's global generator (which
# pandas sampling falls back to when no explicit random_state is passed), so a
# fresh Restart & Run All reproduces the same draws.
random.seed(random_state)
np.random.seed(random_state)

# Output root for this dataset and the folder holding its pre-split CSV files.
# `save_to` keeps its trailing slash because later code appends to it with "+".
save_to = "%s/%s/" % (PATH, dataset)
dataset_folder = "%s/datasets/" % (save_to)

In [None]:
# Load the pre-split train / test / validation data for the selected dataset.
# Every split file lives in `dataset_folder`, is named "<dataset>_<split>.csv"
# and is read as a ';'-separated CSV without an index column.
_csv_prefix = dataset_folder + dataset
_csv_options = dict(index_col=False, sep=";")

# Feature matrices stay as DataFrames so the column names remain available.
X_train, X_test, X_validation = (
    pd.read_csv(_csv_prefix + file_suffix, **_csv_options)
    for file_suffix in ("_Xtrain.csv", "_Xtest.csv", "_Xvalidation.csv")
)

# Targets are flattened to 1-D arrays.
y_train, y_test, y_validation = (
    pd.read_csv(_csv_prefix + file_suffix, **_csv_options).values.reshape(-1)
    for file_suffix in ("_ytrain.csv", "_ytest.csv", "_yvalidation.csv")
)

feat_list = X_train.columns

# The results template is read with pandas' default separator (no `sep` given).
_template_path = os.path.join(dataset_folder, dataset+"_results_template.csv")
results_template = pd.read_csv(_template_path, index_col=False)

In [None]:
# Show the results template read from "<dataset>_results_template.csv"
# (bare last expression -> rich DataFrame display in the notebook).
results_template

In [None]:
# Hyper-parameter grid handed to the grid search for the selected model family.
# Continuous parameters are discretised by drawing a few values from the
# (seeded) stdlib RNG, so the grid is the same on every fresh run.
if cls_method == "xgboost":
    space = {
        # XGBoost documents the valid range of eta / learning_rate as [0, 1];
        # the previous upper bound of 5 put most candidates outside that range.
        'learning_rate': [random.uniform(0,1) for i in range(5)],
        'subsample': [random.uniform(0.5,1) for i in range(5)],
        'max_depth': np.arange(1, 33, 6),
        'colsample_bytree': [random.uniform(0,1) for i in range(5)],
        'min_child_weight': np.arange(0,6,1),
    }
elif cls_method == "decision_tree":
    space = {
        "splitter": ["best", "random"],
        # Floats here are fractions (of samples / of features), not counts.
        "min_samples_split": [random.uniform(0, 1) for i in range (50)],
        "max_features": [random.uniform(0,1) for i in range (50)],
    }
else:
    # Fail here with a clear message instead of leaving `space` undefined.
    raise ValueError("Unsupported cls_method: %r" % (cls_method,))


In [None]:
#Create prediction model
# Pick the base estimator for the configured task / model family, tune it with
# an exhaustive grid search over `space`, keep the best estimator as `cls` and
# persist it under <save_to>/<cls_method>/cls.joblib.
if cls_method not in ("xgboost", "decision_tree"):
    # Without this guard an unknown method only surfaces later as a NameError
    # on `estimator`.
    raise ValueError("Unsupported cls_method: %r" % (cls_method,))

if classification:
    if cls_method == "xgboost":
        estimator = xgb.XGBClassifier(random_state = random_state)
    else:
        space["criterion"] = ["gini", "entropy"]
        estimator = DecisionTreeClassifier(random_state = random_state)
else:
    if cls_method == "xgboost":
        estimator = xgb.XGBRegressor(random_state = random_state)
    else:
        # NOTE(review): "mse" / "mae" are the criterion names of older
        # scikit-learn releases; newer releases spell them "squared_error" /
        # "absolute_error" - confirm against the pinned version.
        space["criterion"] = ["mse", "friedman_mse", "mae", "poisson"]
        estimator = DecisionTreeRegressor(random_state = random_state)

# Keep the search object under its own name so `cls` only ever refers to the
# fitted model that the following cells use.
search = GridSearchCV(estimator, param_grid=space, verbose = 3)
search.fit(X_train.values, y_train)
cls = search.best_estimator_

# joblib.dump does not create missing directories; make sure the per-method
# output folder exists before writing into it.
os.makedirs(save_to+cls_method, exist_ok=True)
joblib.dump(cls, save_to+cls_method+"/cls.joblib")

In [None]:
# Evaluate the tuned model on the test and validation splits combined.
test_x = pd.concat([X_test, X_validation])
test_y = np.hstack([y_test, y_validation])
y_pred = cls.predict(test_x.values)

if classification:
    print(classification_report(test_y, y_pred))
else:
    # RMSE is computed as sqrt(MSE) rather than via the `squared=False`
    # keyword: that keyword is deprecated/removed in recent scikit-learn,
    # while this form works on every version and yields the same value for
    # the 1-D targets used here.
    print("RMSE:", np.sqrt(mean_squared_error(test_y, y_pred)))
    print("MAE:", mean_absolute_error(test_y, y_pred))
    print("MAPE:", mean_absolute_percentage_error(test_y, y_pred))

In [None]:
if classification:
    # Pair every evaluation row with its results-template row by position.
    # reset_index(drop=True) discards the old index directly, so no helper
    # "index" column has to be created and dropped again.
    full_test = pd.concat([test_x.reset_index(drop = True), results_template], axis = 1, join = 'inner')
    full_test["predicted"] = y_pred

    # Draw the same number of rows from every predicted class (the size of the
    # smallest class). Passing random_state makes the draw reproducible.
    grouped = full_test.groupby('predicted')
    per_class = grouped.size().min()
    balanced = grouped.apply(
        lambda group: group.sample(per_class, random_state = random_state).reset_index(drop=True)
    )

    # reset_index returns new frames, so the column assignments below write to
    # independent objects rather than to slices of `balanced`.
    test_sample = balanced[X_test.columns].reset_index(drop = True)
    results_template = balanced[results_template.columns].reset_index(drop = True)

    # Predict once and compute the probability matrix once; each row's
    # probability is the entry for its predicted label (the label is used as
    # the column position, as before).
    sample_values = test_sample.values
    preds = cls.predict(sample_values)
    class_probas = cls.predict_proba(sample_values)
    probas = [row[label] for row, label in zip(class_probas, preds)]

    results_template["Prediction"] = preds
    results_template["Prediction Probability"] = probas

In [None]:
# Regression counterpart of the classification cell: no class balancing, the
# whole combined evaluation set is used. `not classification` makes this the
# exact complement of the `if classification:` test used for the other branch,
# so exactly one of the two cells defines `test_sample`.
if not classification:
    test_sample = test_x

    # Store the model output next to the template columns (in place).
    preds = cls.predict(test_sample.values)
    results_template["Prediction"] = preds

In [None]:
# Persist the filled-in results table and the evaluated sample next to the
# saved model. results.csv is written ';'-separated, test_sample.csv uses the
# pandas default separator.
output_dir = os.path.join(save_to, cls_method)
results_path = os.path.join(output_dir, "results.csv")
sample_path = os.path.join(output_dir, "test_sample.csv")

results_template.to_csv(results_path, sep = ";", index = False)
test_sample.to_csv(sample_path, index = False)

In [None]:
# Show the results table that was written to results.csv
# (bare last expression -> rich DataFrame display in the notebook).
results_template