# "Amazon Employee Access" dataset

In [97]:
# data manipulation
from scipy.io import arff
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency


In [98]:
from sklearn.datasets import fetch_openml

# Download OpenML dataset 4135 (the "Amazon Employee Access" data used in this notebook)
data = fetch_openml(data_id=4135, parser='auto')

# The returned Bunch exposes its entries as attributes as well as dictionary keys
X, y = data.data, data.target

In [99]:
# Result accumulators: each evaluation cell below appends one entry to every list,
# and the lists are assembled into the results table at the end of the notebook.
(default_summary,
 encoder_summary,
 value_summary,
 time_summary,
 n_models_summary,
 card_9_summary) = ([] for _ in range(6))

#### Description
The data consists of real historical data collected from 2010 & 2011. Employees are manually allowed or denied access to resources over time. The data is used to create an algorithm capable of learning from this historical data to predict approval/denial for an unseen set of employees.

#### Attributes Information
- ACTION [target]: ACTION is 1 if the resource was approved, 0 if the resource was not
- RESOURCE: An ID for each resource
- MGR_ID: The EMPLOYEE ID of the manager of the current EMPLOYEE ID record; an employee may have only one manager at a time
- ROLE_ROLLUP_1: Company role grouping category id 1 (e.g. US Engineering)
- ROLE_ROLLUP_2: Company role grouping category id 2 (e.g. US Retail)
- ROLE_DEPTNAME: Company role department description (e.g. Retail)
- ROLE_TITLE: Company role business title description (e.g. Senior Engineering Retail Manager)
- ROLE_FAMILY_DESC: Company role family extended description (e.g. Retail Manager, Software Engineering)
- ROLE_FAMILY: Company role family description (e.g. Retail Manager)
- ROLE_CODE: Company role code; this code is unique to each role (e.g. Manager)

In [100]:
X.head()

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,42680,5905,117929,117930,119569,119323,123932,19793,119325


We check for duplicate rows.

In [101]:
X.duplicated().sum()

0

We eliminate the RESOURCE variable since its only function is to identify the observation, without providing any additional information.

In [102]:
X = X.drop('RESOURCE', axis=1)

In [103]:
X.dtypes

MGR_ID              category
ROLE_ROLLUP_1       category
ROLE_ROLLUP_2       category
ROLE_DEPTNAME       category
ROLE_TITLE          category
ROLE_FAMILY_DESC    category
ROLE_FAMILY         category
ROLE_CODE           category
dtype: object

In [104]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32769 entries, 0 to 32768
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MGR_ID            32769 non-null  category
 1   ROLE_ROLLUP_1     32769 non-null  category
 2   ROLE_ROLLUP_2     32769 non-null  category
 3   ROLE_DEPTNAME     32769 non-null  category
 4   ROLE_TITLE        32769 non-null  category
 5   ROLE_FAMILY_DESC  32769 non-null  category
 6   ROLE_FAMILY       32769 non-null  category
 7   ROLE_CODE         32769 non-null  category
dtypes: category(8)
memory usage: 779.6 KB


In [105]:
X.shape

(32769, 8)

In [106]:
X.describe()

Unnamed: 0,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
count,32769,32769,32769,32769,32769,32769,32769,32769
unique,4243,128,177,449,343,2358,67,343
top,770,117961,118300,117878,118321,117906,290919,118322
freq,152,21407,4424,1135,4649,6896,10980,4649


## Variables

We can observe that the variables **ROLE_TITLE** and **ROLE_CODE** are in one-to-one correspondence: for each value of **ROLE_TITLE** there is exactly one value of **ROLE_CODE**, and vice versa. We then run a chi-squared test to confirm the association between the two variables; since they carry the same information, we drop **ROLE_CODE**.

In [107]:
# Count the distinct ROLE_CODE values observed within each ROLE_TITLE group
ncode_forrole = (
    X.groupby('ROLE_TITLE', as_index=False, observed=True)['ROLE_CODE']
    .nunique()
)
max_rolecode = ncode_forrole['ROLE_CODE'].max()
print(f'Max ROLE_CODE for each ROLE_TITLE: {max_rolecode}')

Max ROLE_CODE for each ROLE_TITLE: 1


In [108]:
# Reverse direction: count the distinct ROLE_TITLE values within each ROLE_CODE group.
# The variable names are reused from the previous check; here they hold titles per code.
ncode_forrole = (
    X.groupby('ROLE_CODE', as_index=False, observed=True)['ROLE_TITLE']
    .nunique()
)
max_rolecode = ncode_forrole['ROLE_TITLE'].max()
print(f'Max ROLE_TITLE for each ROLE_CODE: {max_rolecode}')

Max ROLE_TITLE for each ROLE_CODE: 1


In [109]:
# Cross-tabulate ROLE_TITLE against ROLE_CODE
contingency_table = pd.crosstab(X['ROLE_TITLE'], X['ROLE_CODE'])

# Run the chi-squared test; only the first two returned values (statistic and
# p-value) are kept, the remaining two are discarded
chi2, p_valor, _, _ = chi2_contingency(contingency_table)

print(f"chi-squared value: {chi2}")
print(f"p-value: {p_valor}")

chi-squared value: 11206998.000000002
p-value: 0.0


In [110]:
X = X.drop('ROLE_CODE', axis=1)

## Study of NA's

In [111]:
X.isna().sum().sort_values(ascending = False)

MGR_ID              0
ROLE_ROLLUP_1       0
ROLE_ROLLUP_2       0
ROLE_DEPTNAME       0
ROLE_TITLE          0
ROLE_FAMILY_DESC    0
ROLE_FAMILY         0
dtype: int64

As can be seen, there are no missing values (`np.nan`) in any of the variables.

## Type of Variables

We check which variables, and how many of them, contain only the values {"0", "1"}.

In [112]:
# Columns whose values are all 0/1 count as binary; the remaining columns that are
# neither object nor category dtype count as numeric.
binary_cols = [col for col in X.columns if X[col].isin([0, 1]).all()]
num_cols = [
    num_col
    for num_col in X.select_dtypes(exclude=['object', 'category']).columns
    if num_col not in binary_cols
]

print(f'Numeric columns: {len(num_cols)}')
print(f'Binary columns: {len(binary_cols)}')

Numeric columns: 0
Binary columns: 0


In [113]:
cat_columns = X.select_dtypes(include=['category']).columns
print(f'Category columns: {len(cat_columns)}')

Category columns: 7


General review of the values of all variables.

In [114]:
# Number of distinct levels per categorical column, highest cardinality first
X[cat_columns].nunique().sort_values(ascending=False)

MGR_ID              4243
ROLE_FAMILY_DESC    2358
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_ROLLUP_2        177
ROLE_ROLLUP_1        128
ROLE_FAMILY           67
dtype: int64

## Value counts of the variables with the highest cardinality

#### MGR_ID

In [115]:
# Ten most frequent MGR_ID values
X['MGR_ID'].value_counts().head(10)

MGR_ID
770      152
2270      99
2594      82
1350      71
2014      67
16850     66
3966      64
7807      64
5396      62
3526      62
Name: count, dtype: int64

#### ROLE_FAMILY_DESC

In [116]:
# Ten most frequent ROLE_FAMILY_DESC values
X['ROLE_FAMILY_DESC'].value_counts().head(10)

ROLE_FAMILY_DESC
117906    6896
240983    1244
117913     670
279443     665
117886     530
130134     419
117897     351
117879     333
168365     324
133686     321
Name: count, dtype: int64

#### ROLE_DEPTNAME        

In [117]:
# Ten most frequent ROLE_DEPTNAME values
X['ROLE_DEPTNAME'].value_counts().head(10)

ROLE_DEPTNAME
117878    1135
117941     763
117945     659
118514     601
117920     597
117884     546
119598     543
118403     532
119181     525
120722     501
Name: count, dtype: int64

## Response variable distribution

In [118]:
y.value_counts()

target
1    30872
0     1897
Name: count, dtype: int64

In [119]:
y.value_counts(normalize=True)

target
1    0.94211
0    0.05789
Name: proportion, dtype: float64

## Train-Test Split

In [120]:
from sklearn.model_selection import train_test_split

In [121]:
# Hold out 33% of the rows for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [122]:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape:  {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape:  {y_test.shape}')

X_train shape: (21955, 7)
X_test shape:  (10814, 7)
y_train shape: (21955,)
y_test shape:  (10814,)


## Variables importance

### Mutual Info Classification

As sklearn's mutual_info_classif() needs a sparse matrix binarising the categorical variables, we will use the mutinformation() function of the infotheo package in R.

We read the file containing the mutual information of each variable with the response variable from the X_train.

##  Pipelines

All variables have a cardinality greater than 9, therefore we will apply all encoding methods to all variables.

In [123]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import balanced_accuracy_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingClassifier

import scipy.stats
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

import time

In [124]:
# Names of the category-dtype columns as a plain list, reused by every ColumnTransformer below
cat_cols = list(X.select_dtypes(include='category').columns)
cat_cols

['MGR_ID',
 'ROLE_ROLLUP_1',
 'ROLE_ROLLUP_2',
 'ROLE_DEPTNAME',
 'ROLE_TITLE',
 'ROLE_FAMILY_DESC',
 'ROLE_FAMILY']

In [125]:
# Baseline HistGradientBoostingClassifier used as the 'model' step of the encoder
# comparisons: at most 1000 boosting iterations, early stopping scored by balanced
# accuracy on a 10% validation fraction with a patience of 5 iterations, and
# balanced class weights.
hgb_default = HistGradientBoostingClassifier(
    max_iter=1000,
    random_state=1234,
    early_stopping=True,
    scoring='balanced_accuracy',
    validation_fraction=0.1,
    n_iter_no_change=5,
    class_weight='balanced',
)

# Hyperparameter search space; the 'model__' prefix targets the pipeline step named 'model'.
# scipy.stats.uniform is parameterised by (loc, scale) and samples from [loc, loc + scale],
# so learning rates are drawn from [0.01, 0.31]; scipy.stats.randint excludes `high`,
# so min_samples_leaf is drawn from 1..9.
# NOTE(review): if an upper learning-rate bound of 0.3 was intended, scale should be 0.29.
param_distributions = {
    'model__learning_rate': scipy.stats.uniform(loc=0.01, scale=0.3),
    'model__min_samples_leaf': scipy.stats.randint(low=1, high=10),
}

# Shuffled, stratified 3-fold cross-validation with a fixed seed
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

### One Hot Encoding + HistGradientBoosting

#### Preprocessing

In [126]:
# Per-column preprocessing: impute the most frequent level, then one-hot encode.
# NOTE(review): with drop="first" together with handle_unknown="ignore", a level unseen
# during fit is presumably encoded as all zeros, i.e. indistinguishable from the
# dropped first level — confirm this is acceptable.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
])

# sparse_threshold=0 makes the ColumnTransformer return a dense array
preprop_pipeline = ColumnTransformer(
    transformers=[("one_hot", cat_pipeline, cat_cols)],
    sparse_threshold=0,
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [127]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the baseline estimator. Pipeline.fit
# fits its steps in place, and `hgb_default` is also used by the other encoder
# sections, so sharing the instance would let a later fit overwrite this pipeline's model.
ohe_hgb_default_pipeline = Pipeline([("preprocessing", preprop_pipeline),
                                     ('model', clone(hgb_default))])

In [128]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

ohe_hgb_default = ohe_hgb_default_pipeline.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed fit time in seconds
ohe_hgb_default_time_taken = toc - tic

In [129]:
# Display pipeline
print("Time taken: ", ohe_hgb_default_time_taken)
ohe_hgb_default

Time taken:  29.437156438827515


In [130]:
# Predict on the test set with the default-parameter pipeline and score the
# predictions with balanced accuracy
y_ohe_hgb_default_pred = ohe_hgb_default.predict(X_test)
ohe_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ohe_hgb_default_accuracy}')

# Save results: one entry per summary list; a single model was fitted
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_default_accuracy)
time_summary.append(ohe_hgb_default_time_taken)
n_models_summary.append(1)



Balanced accuracy with default parameters: 0.6795089380352253


#### Create a HistGradientBoostingClassifier model for tuning

In [131]:
ohe_hgb_tune = RandomizedSearchCV(estimator = ohe_hgb_default_pipeline, 
                                  param_distributions = param_distributions, 
                                  n_iter = 20,
                                  cv = stratified_kfold,
                                  scoring = 'balanced_accuracy', 
                                  random_state = 1234,
                                  n_jobs = -1)

In [132]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

ohe_hgb_tune = ohe_hgb_tune.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed search time in seconds
ohe_hgb_tune_time_taken = toc - tic

In [133]:
# Display pipeline
print("Time taken: ", ohe_hgb_tune_time_taken)
ohe_hgb_tune

Time taken:  375.8606176376343


In [134]:
# Predict on the test set through the fitted search object
y_ohe_hgb_tune_pred = ohe_hgb_tune.predict(X_test)

# Get the best parameters
ohe_hgb_tune_best_params = ohe_hgb_tune.best_params_
print(f'Best parameters: {ohe_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
ohe_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ohe_hgb_tune_accuracy}')

# Save results: one entry per summary list
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_tune_accuracy)
time_summary.append(ohe_hgb_tune_time_taken)
# Sampled candidates x CV folds.
# NOTE(review): the final refit performed by the search is presumably part of the
# measured time but is not counted here — confirm that is intended.
n_models_summary.append(ohe_hgb_tune.n_iter * ohe_hgb_tune.n_splits_)



Best parameters: {'model__learning_rate': 0.27676783951084594, 'model__min_samples_leaf': 1}
Balanced accuracy with best parameters: 0.7680024485362094


### Count Encoder + HistGradientBoosting


In [135]:
from category_encoders.count import CountEncoder

#### Preprocessing

In [136]:
# Per-column preprocessing: impute the most frequent level, then apply
# category_encoders' CountEncoder.
# NOTE(review): handle_unknown=0 presumably encodes levels unseen during fit as a
# count of 0 — confirm against the category_encoders documentation.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", CountEncoder(handle_unknown=0)),
])

# sparse_threshold=0 makes the ColumnTransformer return a dense array
preprop_pipeline = ColumnTransformer(
    transformers=[("count_encoder", cat_pipeline, cat_cols)],
    sparse_threshold=0,
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [137]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the baseline estimator. Pipeline.fit
# fits its steps in place, and `hgb_default` is also used by the other encoder
# sections, so sharing the instance would let a later fit overwrite this pipeline's model.
count_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', clone(hgb_default))])

In [138]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

count_hgb_default = count_hgb_default_pipeline.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed fit time in seconds
count_hgb_default_time_taken = toc - tic

In [139]:
# Display pipeline
print("Time taken: ", count_hgb_default_time_taken)
count_hgb_default

Time taken:  2.856032609939575


In [140]:
# Predict on the test set with the default-parameter pipeline and score the
# predictions with balanced accuracy
y_count_hgb_default_pred = count_hgb_default.predict(X_test)
count_hgb_default_accuracy = balanced_accuracy_score(y_test, y_count_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {count_hgb_default_accuracy}')

# Save results: one entry per summary list; a single model was fitted
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_default_accuracy)
time_summary.append(count_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.7344092404645697


#### Create a HistGradientBoostingClassifier model for tuning

In [141]:
count_hgb_tune = RandomizedSearchCV(estimator = count_hgb_default_pipeline, 
                                    param_distributions = param_distributions, 
                                    n_iter = 100,
                                    cv = stratified_kfold,
                                    scoring = 'balanced_accuracy', 
                                    random_state = 1234,
                                    n_jobs = -1)

In [142]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

count_hgb_tune = count_hgb_tune.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed search time in seconds
count_hgb_tune_time_taken = toc - tic

In [143]:
# Display pipeline
print("Time taken: ", count_hgb_tune_time_taken)
count_hgb_tune

Time taken:  54.72720551490784


In [144]:
# Predict using the model with the best parameters
y_count_hgb_tune_pred = count_hgb_tune.predict(X_test)

# Get the best parameters
count_hgb_tune_best_params = count_hgb_tune.best_params_
print(f'Best parameters: {count_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
count_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_count_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {count_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_tune_accuracy)
time_summary.append(count_hgb_tune_time_taken)
n_models_summary.append(count_hgb_tune.n_iter * count_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.3031724292767761, 'model__min_samples_leaf': 2}
Balanced accuracy with best parameters: 0.7515409973018436


### Ordinal Encoding + HistGradientBoosting

In [145]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [146]:
# Per-column preprocessing: impute the most frequent level, then map each level to an
# integer code; unknown levels and missing values are both given the code 99999.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OrdinalEncoder(dtype = int,
                               handle_unknown = 'use_encoded_value',
                               unknown_value = 99999,
                               encoded_missing_value = 99999))
])

# The transformer is labelled after the encoder actually applied in this section
# (ordinal encoding, not count encoding). sparse_threshold=0 makes the
# ColumnTransformer return a dense array.
preprop_pipeline = ColumnTransformer(
    transformers = [("ordinal_encoder", cat_pipeline, cat_cols)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [147]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the baseline estimator. Pipeline.fit
# fits its steps in place, and `hgb_default` is also used by the other encoder
# sections, so sharing the instance would let a later fit overwrite this pipeline's model.
ordinal_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                         ('model', clone(hgb_default))])

In [148]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

ordinal_hgb_default = ordinal_hgb_default_pipeline.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed fit time in seconds
ordinal_hgb_default_time_taken = toc - tic

In [149]:
# Display pipeline
print("Time taken: ", ordinal_hgb_default_time_taken)
ordinal_hgb_default

Time taken:  1.4021656513214111


In [150]:
# Calculate balanced accuracy for the model with default parameters
y_ordinal_hgb_default_pred = ordinal_hgb_default.predict(X_test)
ordinal_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ordinal_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_default_accuracy)
time_summary.append(ordinal_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.7266194269772996


#### Create a HistGradientBoostingClassifier model for tuning

In [151]:
ordinal_hgb_tune = RandomizedSearchCV(estimator = ordinal_hgb_default_pipeline, 
                                      param_distributions = param_distributions, 
                                      n_iter = 100,
                                      cv = stratified_kfold,
                                      scoring = 'balanced_accuracy', 
                                      random_state = 1234,
                                      n_jobs = -1)

In [152]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

ordinal_hgb_tune = ordinal_hgb_tune.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed search time in seconds
ordinal_hgb_tune_time_taken = toc - tic

In [153]:
# Display pipeline
print("Time taken: ", ordinal_hgb_tune_time_taken)
ordinal_hgb_tune

Time taken:  55.066901445388794


In [154]:
# Predict using the model with the best parameters
y_ordinal_hgb_tune_pred = ordinal_hgb_tune.predict(X_test)

# Get the best parameters
ordinal_hgb_tune_best_params = ordinal_hgb_tune.best_params_
print(f'Best parameters: {ordinal_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
ordinal_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ordinal_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_tune_accuracy)
time_summary.append(ordinal_hgb_tune_time_taken)
n_models_summary.append(ordinal_hgb_tune.n_iter * ordinal_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.2714269765146094, 'model__min_samples_leaf': 6}
Balanced accuracy with best parameters: 0.696611530698899


### Native HistGradientBoosting support for categorical variables


In [155]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [156]:
# Boolean mask over X's columns: True where the column is one of the categorical columns
category_features_for_nativesupport = X.columns.isin(cat_cols).tolist()
category_features_for_nativesupport

[True, True, True, True, True, True, True]

In [157]:
# Same settings as the baseline `hgb_default`, plus the boolean mask telling the
# estimator which input columns to treat as categorical.
hgb_default_categories_support = HistGradientBoostingClassifier(
    max_iter=1000,
    random_state=1234,
    early_stopping=True,
    scoring='balanced_accuracy',
    validation_fraction=0.1,
    n_iter_no_change=5,
    categorical_features=category_features_for_nativesupport,
    class_weight='balanced',
)

In [158]:
# Per-column preprocessing: impute the most frequent level, then map each level to an
# integer code; unknown levels and missing values are both given the code 99999.
# NOTE(review): max_categories=254 presumably keeps each feature within the number of
# categories HistGradientBoosting accepts for native categorical support — confirm.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(
        dtype=int,
        handle_unknown='use_encoded_value',
        unknown_value=99999,
        encoded_missing_value=99999,
        max_categories=254,
    )),
])

# sparse_threshold=0 makes the ColumnTransformer return a dense array
preprop_pipeline = ColumnTransformer(
    transformers=[("cat_cols", cat_pipeline, cat_cols)],
    sparse_threshold=0,
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [159]:
catsup_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                        ('model', hgb_default_categories_support)])

In [160]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

catsup_hgb_default = catsup_hgb_default_pipeline.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed fit time in seconds
catsup_hgb_default_time_taken = toc - tic

In [161]:
# Display pipeline
print("Time taken: ", catsup_hgb_default_time_taken)
catsup_hgb_default

Time taken:  1.6177992820739746


In [162]:
# Calculate balanced accuracy for the model with default parameters
y_catsup_hgb_default_pred = catsup_hgb_default.predict(X_test)
catsup_hgb_default_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {catsup_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_default_accuracy)
time_summary.append(catsup_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.7733998590084683


#### Create a HistGradientBoostingClassifier model for tuning

In [163]:
catsup_hgb_tune = RandomizedSearchCV(estimator = catsup_hgb_default_pipeline, 
                                     param_distributions = param_distributions, 
                                     n_iter = 100,
                                     cv = stratified_kfold,
                                     scoring = 'balanced_accuracy', 
                                     random_state = 1234,
                                     n_jobs = -1)

In [164]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

catsup_hgb_tune = catsup_hgb_tune.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed search time in seconds
catsup_hgb_tune_time_taken = toc - tic

In [165]:
# Display pipeline
print("Time taken: ", catsup_hgb_tune_time_taken)
catsup_hgb_tune

Time taken:  49.593177795410156


In [166]:
# Predict using the model with the best parameters
y_catsup_hgb_tune_pred = catsup_hgb_tune.predict(X_test)

# Get the best parameters
catsup_hgb_tune_best_params = catsup_hgb_tune.best_params_
print(f'Best parameters: {catsup_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catsup_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {catsup_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_tune_accuracy)
time_summary.append(catsup_hgb_tune_time_taken)
n_models_summary.append(catsup_hgb_tune.n_iter * catsup_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.17144435060485483, 'model__min_samples_leaf': 2}
Balanced accuracy with best parameters: 0.7690222538324233


### Target Encoder + HistGradientBoosting


In [167]:
from sklearn.preprocessing import TargetEncoder

#### Preprocessing

In [168]:
# Per-column preprocessing: impute the most frequent level, then apply scikit-learn's
# TargetEncoder with a fixed seed.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", TargetEncoder(random_state=1234)),
])

# sparse_threshold=0 makes the ColumnTransformer return a dense array
preprop_pipeline = ColumnTransformer(
    transformers=[("target_encoder", cat_pipeline, cat_cols)],
    sparse_threshold=0,
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [169]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the baseline estimator. Pipeline.fit
# fits its steps in place, and `hgb_default` is also used by the other encoder
# sections, so sharing the instance would let a later fit overwrite this pipeline's model.
target_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                        ('model', clone(hgb_default))])

In [170]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

target_hgb_default = target_hgb_default_pipeline.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed fit time in seconds
target_hgb_default_time_taken = toc - tic

In [171]:
# Display pipeline
print("Time taken: ", target_hgb_default_time_taken)
target_hgb_default

Time taken:  0.6138334274291992


In [172]:
# Calculate balanced accuracy for the model with default parameters
y_target_hgb_default_pred = target_hgb_default.predict(X_test)
target_hgb_default_accuracy = balanced_accuracy_score(y_test, y_target_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {target_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_default_accuracy)
time_summary.append(target_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.7661671439556152


#### Create a HistGradientBoostingClassifier model for tuning

In [173]:
target_hgb_tune = RandomizedSearchCV(estimator = target_hgb_default_pipeline, 
                                    param_distributions = param_distributions, 
                                    n_iter = 100,
                                    cv = stratified_kfold,
                                    scoring = 'balanced_accuracy', 
                                    random_state = 1234,
                                    n_jobs = -1)

In [174]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

target_hgb_tune = target_hgb_tune.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed search time in seconds
target_hgb_tune_time_taken = toc - tic

In [175]:
# Display pipeline
print("Time taken: ", target_hgb_tune_time_taken)
target_hgb_tune

Time taken:  40.14692521095276


In [176]:
# Predict using the model with the best parameters
y_target_hgb_tune_pred = target_hgb_tune.predict(X_test)

# Get the best parameters
target_hgb_tune_best_params = target_hgb_tune.best_params_
print(f'Best parameters: {target_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
target_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_target_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {target_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_tune_accuracy)
time_summary.append(target_hgb_tune_time_taken)
n_models_summary.append(target_hgb_tune.n_iter * target_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.13561744228416975, 'model__min_samples_leaf': 4}
Balanced accuracy with best parameters: 0.7688933670007063


### CatBoost

In [177]:
from catboost import CatBoostClassifier

#### Preprocessing

In [178]:
# Only imputation is applied here: no encoder step is added, and the categorical
# columns are handed to the model as they are.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

# sparse_threshold=0 makes the ColumnTransformer return a dense array
preprop_pipeline = ColumnTransformer(
    transformers=[("cat", cat_pipeline, cat_cols)],
    sparse_threshold=0,
)

In [179]:
# Positional indices 0..len(cat_cols)-1, one per categorical column
category_features_for_catboostsupport = list(range(len(cat_cols)))
print(category_features_for_catboostsupport)

[0, 1, 2, 3, 4, 5, 6]


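CatBoost lets you set a maximum number of unique categories (`one_hot_max_size`): categorical variables with at most that many distinct values are one-hot encoded, while the remaining ones are encoded with CatBoost's own categorical statistics.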

In [180]:
# Create catboost models
catboost_default_raw = CatBoostClassifier(iterations=1000,
                                        eval_metric = 'BalancedAccuracy',
                                        loss_function = 'Logloss',
                                        auto_class_weights = 'Balanced',
                                        early_stopping_rounds=5,
                                        od_type='Iter',
                                        one_hot_max_size = 0,
                                        random_seed = 1234,
                                        verbose = False)

catboost_default_raw.set_params(cat_features=category_features_for_catboostsupport)

# Default CatBoostClassifier Pipeline
catboost_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                      ('model', catboost_default_raw)])

# Define the hyperparameter search space
catboost_param_distributions = {
    'model__iterations': scipy.stats.randint(10, 1000),
    'model__depth': scipy.stats.randint(4,11),
    'model__learning_rate': scipy.stats.uniform(0.01, 0.3),
}

# Create a StratifiedKFold cross-validation instance
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

catboost_tune_raw = RandomizedSearchCV(estimator = catboost_default_pipeline, 
                                       param_distributions = catboost_param_distributions, 
                                       n_iter = 5,
                                       cv = stratified_kfold,
                                       scoring = 'balanced_accuracy', 
                                       random_state = 1234,
                                       n_jobs = -1)

In [181]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

catboost_default = catboost_default_pipeline.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed fit time in seconds
catboost_default_time_taken = toc - tic

In [182]:
# Display pipeline
print("Time taken: ", catboost_default_time_taken)
catboost_default

Time taken:  52.58193826675415


In [183]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred = catboost_default.predict(X_test)
catboost_default_accuracy = balanced_accuracy_score(y_test, y_catboost_default_pred)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_default_accuracy)
time_summary.append(catboost_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.7874289240865968


In [184]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; unlike time.time() it is not affected by system clock adjustments.
tic = time.perf_counter()

catboost_tune = catboost_tune_raw.fit(X_train, y_train)

toc = time.perf_counter()
# Elapsed search time in seconds
catboost_tune_time_taken = toc - tic

In [185]:
# Display pipeline
print("Time taken: ", catboost_tune_time_taken)
catboost_tune

Time taken:  214.8232388496399


In [186]:
# Predict using the model with the best parameters
y_catboost_tune_pred = catboost_tune.predict(X_test)

# Get the best parameters
catboost_tune_best_params = catboost_tune.best_params_
print(f'Best parameters: {catboost_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catboost_tune_accuracy = balanced_accuracy_score(y_test, y_catboost_tune_pred)
print(f'Balanced accuracy with best parameters: {catboost_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_tune_accuracy)
time_summary.append(catboost_tune_time_taken)
n_models_summary.append(catboost_tune.n_iter * catboost_tune.n_splits_)

Best parameters: {'model__depth': 5, 'model__iterations': 289, 'model__learning_rate': 0.05519108968182859}
Balanced accuracy with best parameters: 0.7878285987022257


### Results Summary

In [187]:
# Assemble one row per evaluated configuration from the accumulator lists; the
# scalar "Dataset" and "Metric" entries are repeated on every row.
results_summary = pd.DataFrame(
    {
        "Dataset": "Amazon_employee_access",
        "Variables": card_9_summary,
        "Default/Tune": default_summary,
        "Encoder": encoder_summary,
        "Metric": "BalancedAccuracy",
        "Value": value_summary,
        "Time": time_summary,
        "n_Models": n_models_summary,
    }
)

# Measured time divided by the number of models recorded for that row
results_summary["mean_Time"] = results_summary["Time"].div(results_summary["n_Models"])
results_summary

Unnamed: 0,Dataset,Variables,Default/Tune,Encoder,Metric,Value,Time,n_Models,mean_Time
0,Amazon_employee_access,AllVariables,Default,OneHotEncoding,BalancedAccuracy,0.679509,29.437156,1,29.437156
1,Amazon_employee_access,AllVariables,Tune,OneHotEncoding,BalancedAccuracy,0.768002,375.860618,60,6.264344
2,Amazon_employee_access,AllVariables,Default,CountEncoding,BalancedAccuracy,0.734409,2.856033,1,2.856033
3,Amazon_employee_access,AllVariables,Tune,CountEncoding,BalancedAccuracy,0.751541,54.727206,300,0.182424
4,Amazon_employee_access,AllVariables,Default,OrdinalEncoder,BalancedAccuracy,0.726619,1.402166,1,1.402166
5,Amazon_employee_access,AllVariables,Tune,OrdinalEncoder,BalancedAccuracy,0.696612,55.066901,300,0.183556
6,Amazon_employee_access,AllVariables,Default,HGB_NativeSupport,BalancedAccuracy,0.7734,1.617799,1,1.617799
7,Amazon_employee_access,AllVariables,Tune,HGB_NativeSupport,BalancedAccuracy,0.769022,49.593178,300,0.165311
8,Amazon_employee_access,AllVariables,Default,TargetEncoder,BalancedAccuracy,0.766167,0.613833,1,0.613833
9,Amazon_employee_access,AllVariables,Tune,TargetEncoder,BalancedAccuracy,0.768893,40.146925,300,0.133823


In [188]:
# Write the results table to a CSV file at a path relative to the working directory.
# With the default arguments the DataFrame index is written as an extra leading,
# unnamed column.
results_summary.to_csv("Amazon_Employee_Access_results.csv")