In [1]:
#!pip install lime
#!pip install shap
#!pip install anchor-exp
#!pip install hyperopt

import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier

from hyperopt import hp

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler,MinMaxScaler

import os
import joblib

import warnings
warnings.filterwarnings(action = 'ignore')

import matplotlib.pyplot as plt

from matplotlib.pyplot import figure
import matplotlib.image as mpimg
import pylab as pl
from pylab import savefig
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so pick whichever name the installed
# version actually provides.
plt.style.use('seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep')

import stability as st

import statistics
import scipy as scp
import math

import lime
import lime.lime_tabular

import shap

from anchor import anchor_tabular

In [2]:
# Project root folder. Defaults to the current working directory;
# change it to point at your own copy of the project.
PATH = os.getcwd()

# Dataset to process and the seed used when down-sampling classes.
dataset, random_state = "bike_sharing", 39

# True  -> use the loaded data as is.
# False -> down-sample the larger class before splitting.
balanced = True

# Folder holding everything that belongs to the selected dataset.
dataset_folder = "{}/{}/".format(PATH, dataset)

In [3]:
# Read the raw CSV of the selected dataset.
# dataset_folder already ends with "/", so the resulting path contains a
# double slash; it is kept as is because later cells derive their output
# file names from this exact string.
dataset_name = "{}.csv".format(dataset)
dataset_path = "".join([dataset_folder, "/datasets/", dataset_name])
data = pd.read_csv(dataset_path)

# Name of the target column for every supported dataset.
class_cols = dict(
    diabetes="Outcome",
    breast_cancer="diagnosis",
    income="income",
    housing="MEDV",
    student_scores="G3",
    bike_sharing="total_rental",
)
class_var = class_cols[dataset]

# Columns that are not used as features: the target itself plus any extra
# columns listed for the dataset.
rem_cols = {
    "diabetes": [class_var],
    "breast_cancer": [class_var, "id"],
    "bike_sharing": [class_var],
    "housing": [class_var],
    "student_scores": [class_var],
    "income": [class_var, "fnlwgt"],
}
drop_cols = rem_cols[dataset]

# Every remaining column is a feature.
feature_names = data.drop(columns=drop_cols).columns.to_list()

# Balance the dataset.
# When `balanced` is False, the larger of the two classes (rows whose target
# equals 0 vs. rows whose target equals 1) is down-sampled to the size of the
# smaller one. Otherwise the loaded data is used unchanged.
if not balanced:
    classes = data[class_var]
    neg_cases = data[data[class_var] == 0]
    pos_cases = data[data[class_var] == 1]

    # The down-sampling below only understands a 0/1-coded target. If either
    # class has no rows the result would be an empty frame that only fails
    # later in train_test_split, so stop here with a clear message instead.
    if neg_cases.empty or pos_cases.empty:
        raise ValueError(
            "Cannot balance '%s': column '%s' must contain rows labelled 0 and rows labelled 1"
            % (dataset, class_var)
        )

    # Only the larger class is sampled; the smaller class keeps all its rows
    # in their original order.
    if len(neg_cases) > len(pos_cases):
        neg_cases = neg_cases.sample(n=len(pos_cases), random_state=random_state)
    elif len(pos_cases) > len(neg_cases):
        pos_cases = pos_cases.sample(n=len(neg_cases), random_state=random_state)

    balanced_data = pd.concat([neg_cases, pos_cases])

    # check how balanced the classes are
    # (printed explicitly: a bare expression nested inside an `if` block is
    # not displayed by the notebook)
    print(balanced_data.groupby(class_var).size())

else:
    balanced_data = data

In [4]:
# Feature matrix and target, kept as pandas objects.
X = balanced_data[feature_names]
Y = balanced_data[class_var]

# 70% of the rows go to training; the held-out 30% is then cut in half,
# giving a validation set and a test set of 15% each.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.3, random_state=515
)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=515
)


# Persist every split next to the source CSV as "<dataset path without .csv><suffix>",
# semicolon-separated and without the index column.
split_prefix = dataset_path.replace(".csv", "")
splits_to_save = (
    ("_Xtrain.csv", X_train),
    ("_Xtest.csv", X_test),
    ("_Xvalidation.csv", X_validation),
    ("_Ytrain.csv", y_train),
    ("_Ytest.csv", y_test),
    ("_Yvalidation.csv", y_validation),
)
for file_suffix, split in splits_to_save:
    split.to_csv(split_prefix + file_suffix, sep=";", index=False)

In [5]:
# Template for result files: the true target values of the test split
# followed by those of the validation split, in one column named "Actual".
results_template_test, results_template_validation = (
    pd.DataFrame(targets) for targets in (y_test, y_validation)
)
results_template = pd.concat(
    [results_template_test, results_template_validation]
).rename(columns={class_var: "Actual"})

# Written without the index, so only the row order ties it back to the splits.
template_path = dataset_path.replace(".csv", "") + "_results_template.csv"
results_template.to_csv(template_path, sep=";", index=False)