# "Churn" Dataset

In [1]:
# data manipulation
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import fetch_openml

data = fetch_openml(data_id=40701, parser='auto')

# The returned dataset is a Bunch object, similar to a dictionary
X = data['data']
y = data['target']

In [3]:
# Summary vectors creation

default_summary  = []
encoder_summary  = []
value_summary    = []
time_summary     = []
n_models_summary = []
card_9_summary   = []

#### Description
A dataset relating characteristics of telephony account features and usage and whether or not the customer churned.

#### Exploratory Data Analysis

In [4]:
X.head()

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,16,128,415,2845,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,35,107,415,2301,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,31,137,415,1616,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,35,84,408,2510,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,36,75,415,155,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [5]:
X.shape

(5000, 20)

In [6]:
X.dtypes

state                               int64
account_length                      int64
area_code                        category
phone_number                        int64
international_plan               category
voice_mail_plan                  category
number_vmail_messages               int64
total_day_minutes                 float64
total_day_calls                     int64
total_day_charge                  float64
total_eve_minutes                 float64
total_eve_calls                     int64
total_eve_charge                  float64
total_night_minutes               float64
total_night_calls                   int64
total_night_charge                float64
total_intl_minutes                float64
total_intl_calls                    int64
total_intl_charge                 float64
number_customer_service_calls    category
dtype: object

In [7]:
X.loc[:,'state'] = X['state'].astype(str)
X.loc[:,'international_plan'] = X['international_plan'].astype(float)
X.loc[:,'voice_mail_plan'] = X['voice_mail_plan'].astype(float)

In [8]:
X.nunique()

state                              51
account_length                    218
area_code                           3
phone_number                     5000
international_plan                  2
voice_mail_plan                     2
number_vmail_messages              48
total_day_minutes                1961
total_day_calls                   123
total_day_charge                 1961
total_eve_minutes                1879
total_eve_calls                   126
total_eve_charge                 1659
total_night_minutes              1853
total_night_calls                 131
total_night_charge               1028
total_intl_minutes                170
total_intl_calls                   21
total_intl_charge                 170
number_customer_service_calls      10
dtype: int64

In [9]:
X.describe()

Unnamed: 0,account_length,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,100.2586,2499.5,0.0946,0.2646,7.7552,180.2889,100.0294,30.649668,200.63656,100.191,17.054322,200.39162,99.9192,9.017732,10.26178,4.4352,2.771196
std,39.69456,1443.520003,0.292691,0.441164,13.546393,53.894699,19.831197,9.162069,50.551309,19.826496,4.296843,50.527789,19.958686,2.273763,2.761396,2.456788,0.745514
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,73.0,1249.75,0.0,0.0,0.0,143.7,87.0,24.43,166.375,87.0,14.14,166.9,87.0,7.51,8.5,3.0,2.3
50%,100.0,2499.5,0.0,0.0,0.0,180.1,100.0,30.62,201.0,100.0,17.09,200.4,100.0,9.02,10.3,4.0,2.78
75%,127.0,3749.25,0.0,1.0,17.0,216.2,113.0,36.75,234.1,114.0,19.9,234.7,113.0,10.56,12.0,6.0,3.24
max,243.0,4999.0,1.0,1.0,52.0,351.5,165.0,59.76,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4


We check for duplicate rows.

In [10]:
X.duplicated().sum()

0

In [11]:
X.columns

Index(['state', 'account_length', 'area_code', 'phone_number',
       'international_plan', 'voice_mail_plan', 'number_vmail_messages',
       'total_day_minutes', 'total_day_calls', 'total_day_charge',
       'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
       'total_night_minutes', 'total_night_calls', 'total_night_charge',
       'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls'],
      dtype='object')

### Variables

The **phone_number** variable serves as an identifier for each observation, but cannot be used as a predictor variable.

In [12]:
print(f'Number of rows: {len(X)}')
print(f'Number of unique phone numbers: {X.phone_number.nunique()}')

Number of rows: 5000
Number of unique phone numbers: 5000


In [13]:
X = X.drop('phone_number', axis=1)

## Study of NA's

In [14]:
X.isna().sum().sort_values(ascending = False)

state                            0
total_eve_calls                  0
total_intl_charge                0
total_intl_calls                 0
total_intl_minutes               0
total_night_charge               0
total_night_calls                0
total_night_minutes              0
total_eve_charge                 0
total_eve_minutes                0
account_length                   0
total_day_charge                 0
total_day_calls                  0
total_day_minutes                0
number_vmail_messages            0
voice_mail_plan                  0
international_plan               0
area_code                        0
number_customer_service_calls    0
dtype: int64

As can be seen, there are no np.nan in any of the variables.

## Type of Variables

Check the numeric features.

In [15]:
num_cols = X.select_dtypes(exclude=['object','category']).columns
binary_cols = [col for col in X.columns if X[col].isin([0, 1]).all()]
num_cols = [num_col for num_col in num_cols if num_col not in binary_cols]

print(f'Numeric columns: {len(num_cols)}')
print(f'Binary columns: {len(binary_cols)}')

Numeric columns: 14
Binary columns: 2


Check the categorical features.

In [16]:
cat_columns = X.select_dtypes(include=['category','object']).columns
print(f'Category columns: {len(cat_columns)}')

Category columns: 3


General review of the values of all variables.

In [17]:
X[cat_columns].apply(lambda col: col.nunique()).sort_values(ascending=False)

state                            51
number_customer_service_calls    10
area_code                         3
dtype: int64

## Value counts of the variables with more cardinality

#### state

In [18]:
X['state'].value_counts()[0:10]

state
49    158
23    125
1     124
13    119
45    118
35    116
43    116
50    115
34    114
37    114
Name: count, dtype: int64

#### number_customer_service_calls

In [19]:
X["number_customer_service_calls"].value_counts()[0:10]

number_customer_service_calls
1    1786
2    1127
0    1023
3     665
4     252
5      96
6      34
7      13
8       2
9       2
Name: count, dtype: int64

#### area_code

In [20]:
X["area_code"].value_counts()[0:10]

area_code
415    2495
408    1259
510    1246
Name: count, dtype: int64

## Response variable distribution

In [21]:
y.value_counts()

class
0    4293
1     707
Name: count, dtype: int64

In [22]:
y.value_counts(normalize=True)

class
0    0.8586
1    0.1414
Name: proportion, dtype: float64

## Train-Test Split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size = 0.33, 
                                                    random_state = 42,
                                                    stratify = y)

In [25]:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape:  {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape:  {y_test.shape}')

X_train shape: (3350, 19)
X_test shape:  (1650, 19)
y_train shape: (3350,)
y_test shape:  (1650,)


## Pipelines 

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.metrics import balanced_accuracy_score

import scipy.stats
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

import time

In [27]:
num_cols = X_train.select_dtypes(include=['float64', 'int']).columns.to_list()
cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.to_list()

cat_cols_less9 = [col for col in cat_cols if X[col].nunique() <= 9]
cat_cols_more9 = [col for col in cat_cols if X[col].nunique() > 9]

In [28]:
# Define the HistGradientBoostingClassifier models
hgb_default = HistGradientBoostingClassifier(random_state=1234,
                                             early_stopping=False,
                                             min_samples_leaf=30,
                                             scoring='balanced_accuracy',
                                             class_weight='balanced')

hgb_default9 = HistGradientBoostingClassifier(random_state=1234,
                                              early_stopping=False,
                                              min_samples_leaf=30,
                                              scoring='balanced_accuracy',
                                              class_weight='balanced')

# Define the hyperparameter search space
param_distributions = {
    'model__max_iter': scipy.stats.randint(10,300),
    'model__learning_rate': scipy.stats.uniform(0.01, 0.3),
    'model__min_samples_leaf': scipy.stats.randint(10, 50),
}

# Create a StratifiedKFold cross-validation instance
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

### One Hot Encoding + HistGradientBoosting

#### Preprocessing

In [31]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "median"))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot", cat_pipeline, cat_cols)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [30]:
ohe_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                     ('model', hgb_default)])

In [31]:
tic = time.time()

ohe_hgb_default = ohe_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
ohe_hgb_default_time_taken = toc-tic

In [32]:
# Display pipeline
print("Time taken: ", ohe_hgb_default_time_taken)
ohe_hgb_default

Time taken:  1.3159816265106201


In [33]:
# # Calculate balanced accuracy for the model with default parameters
y_ohe_hgb_default_pred = ohe_hgb_default.predict(X_test)
ohe_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ohe_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_default_accuracy)
time_summary.append(ohe_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8874685380768776


#### Create a HistGradientBoostingClassifier model for tuning

In [34]:
ohe_hgb_tune = RandomizedSearchCV(estimator = ohe_hgb_default_pipeline, 
                                  param_distributions = param_distributions, 
                                  n_iter =100,
                                  cv = stratified_kfold,
                                  scoring = 'balanced_accuracy', 
                                  random_state = 1234,
                                  n_jobs = -1)

In [35]:
tic = time.time()

ohe_hgb_tune = ohe_hgb_tune.fit(X_train, y_train)

toc = time.time()
ohe_hgb_tune_time_taken = toc-tic

In [36]:
# Display pipeline
print("Time taken: ", ohe_hgb_tune_time_taken)
ohe_hgb_tune

Time taken:  95.50823664665222


In [37]:
# Get the best parameters
ohe_hgb_tune_best_params = ohe_hgb_tune.best_params_
print(f'Best parameters: {ohe_hgb_tune_best_params}')

# Predict using the model with the best parameters
y_ohe_hgb_tune_pred = ohe_hgb_tune.predict(X_test)
ohe_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ohe_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_tune_accuracy)
time_summary.append(ohe_hgb_tune_time_taken)
n_models_summary.append(ohe_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.25048077955082804, 'model__max_iter': 11, 'model__min_samples_leaf': 12}
Balanced accuracy with best parameters: 0.8890238398841778


### Count Encoder + HistGradientBoosting


In [38]:
from category_encoders.count import CountEncoder

#### Preprocessing

In [39]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", CountEncoder())
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("count_encoder", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("count_encoder", cat_pipeline_more9, cat_cols_more9)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [40]:
count_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])
count_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                        ('model', hgb_default9)])

In [41]:
tic = time.time()

count_hgb_default = count_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
count_hgb_default_time_taken = toc-tic

In [42]:
# Display pipeline
print("Time taken: ", count_hgb_default_time_taken)
count_hgb_default

Time taken:  0.6058032512664795


In [43]:
tic = time.time()

count_hgb_default9 = count_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
count_hgb_default9_time_taken = toc-tic

In [44]:
# Display pipeline
print("Time taken: ", count_hgb_default9_time_taken)
count_hgb_default9

Time taken:  0.650198221206665


In [45]:
# Calculate balanced accuracy for the model with default parameters
y_count_hgb_default_pred = count_hgb_default.predict(X_test)
count_hgb_default_accuracy = balanced_accuracy_score(y_test, y_count_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {count_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_default_accuracy)
time_summary.append(count_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.9046359200511266


In [46]:
# Calculate balanced accuracy for the model with default parameters
y_count_hgb_default_pred9 = count_hgb_default9.predict(X_test)
count_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_count_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {count_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_default_accuracy9)
time_summary.append(count_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.9046359200511266


#### Create a HistGradientBoostingClassifier model for tuning

In [47]:
count_hgb_tune = RandomizedSearchCV(estimator = count_hgb_default_pipeline, 
                                   param_distributions = param_distributions, 
                                   n_iter = 100,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

count_hgb_tune9 = RandomizedSearchCV(estimator = count_hgb_default_pipeline9, 
                                     param_distributions = param_distributions, 
                                     n_iter = 100,
                                     cv = stratified_kfold,
                                     scoring = 'balanced_accuracy', 
                                     random_state = 1234,
                                     n_jobs = -1)

In [48]:
tic = time.time()

count_hgb_tune = count_hgb_tune.fit(X_train, y_train)

toc = time.time()
count_hgb_tune_time_taken = toc-tic

In [49]:
# Display pipeline
print("Time taken: ", count_hgb_tune_time_taken)
count_hgb_tune

Time taken:  25.01795244216919


In [50]:
tic = time.time()

count_hgb_tune9 = count_hgb_tune9.fit(X_train, y_train)

toc = time.time()
count_hgb_tune9_time_taken = toc-tic

In [51]:
# Display pipeline
print("Time taken: ", count_hgb_tune9_time_taken)
count_hgb_tune9

Time taken:  21.505852937698364


In [52]:
# Predict using the model with the best parameters
y_count_hgb_tune_pred = count_hgb_tune.predict(X_test)

# Get the best parameters
count_hgb_tune_best_params = count_hgb_tune.best_params_
print(f'Best parameters: {count_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
count_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_count_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {count_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_tune_accuracy)
time_summary.append(count_hgb_tune_time_taken)
n_models_summary.append(count_hgb_tune.n_iter * count_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.05020636179965285, 'model__max_iter': 51, 'model__min_samples_leaf': 29}
Balanced accuracy with best parameters: 0.8990477373160368


In [53]:
# Predict using the model with the best parameters
y_count_hgb_tune_pred9 = count_hgb_tune9.predict(X_test)

# Get the best parameters
count_hgb_tune_best_params9 = count_hgb_tune9.best_params_
print(f'Best parameters: {count_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
count_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_count_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {count_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_tune_accuracy9)
time_summary.append(count_hgb_tune9_time_taken)
n_models_summary.append(count_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.05020636179965285, 'model__max_iter': 51, 'model__min_samples_leaf': 29}
Balanced accuracy with best parameters: 0.8990477373160368


### Ordinal Encoding + HistGradientBoosting


In [54]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [55]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OrdinalEncoder(dtype = int,
                               handle_unknown = 'use_encoded_value',
                               unknown_value = 99999,
                               encoded_missing_value = 99999))
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("ordinal_encoder", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("ordinalencoder", cat_pipeline_more9, cat_cols_more9)
                   ],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [56]:
ordinal_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                         ('model', hgb_default)])

ordinal_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                          ('model', hgb_default9)])

In [57]:
tic = time.time()

ordinal_hgb_default = ordinal_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_default_time_taken = toc-tic

In [58]:
# Display pipeline
print("Time taken: ", ordinal_hgb_default_time_taken)
ordinal_hgb_default

Time taken:  0.5798184871673584


In [59]:
tic = time.time()

ordinal_hgb_default9 = ordinal_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_default9_time_taken = toc-tic

In [60]:
# Display pipeline
print("Time taken: ", ordinal_hgb_default9_time_taken)
ordinal_hgb_default9

Time taken:  0.6227858066558838


In [61]:
# Calculate balanced accuracy for the model with default parameters
y_ordinal_hgb_default_pred = ordinal_hgb_default.predict(X_test)
ordinal_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ordinal_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_default_accuracy)
time_summary.append(ordinal_hgb_default_time_taken)
n_models_summary.append(1)


Balanced accuracy with default parameters: 0.9017555071616575


In [62]:
# Calculate balanced accuracy for the model with default parameters
y_ordinal_hgb_default_pred9 = ordinal_hgb_default9.predict(X_test)
ordinal_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_ordinal_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {ordinal_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_default_accuracy9)
time_summary.append(ordinal_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.9114265464424932


#### Create a HistGradientBoostingClassifier model for tuning

In [63]:
ordinal_hgb_tune = RandomizedSearchCV(estimator = ordinal_hgb_default_pipeline, 
                                      param_distributions = param_distributions, 
                                      n_iter = 100,
                                      cv = stratified_kfold,
                                      scoring = 'balanced_accuracy', 
                                      random_state = 1234,
                                      n_jobs = -1)

ordinal_hgb_tune9 = RandomizedSearchCV(estimator = ordinal_hgb_default_pipeline9, 
                                       param_distributions = param_distributions, 
                                       n_iter = 100,
                                       cv = stratified_kfold,
                                       scoring = 'balanced_accuracy', 
                                       random_state = 1234,
                                       n_jobs = -1)

In [64]:
tic = time.time()

ordinal_hgb_tune = ordinal_hgb_tune.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_tune_time_taken = toc-tic

In [65]:
# Display pipeline
print("Time taken: ", ordinal_hgb_tune_time_taken)
ordinal_hgb_tune

Time taken:  21.22732901573181


In [66]:
tic = time.time()

ordinal_hgb_tune9 = ordinal_hgb_tune9.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_tune9_time_taken = toc-tic

In [67]:
# Display pipeline
print("Time taken: ", ordinal_hgb_tune9_time_taken)
ordinal_hgb_tune9

Time taken:  20.529820442199707


In [68]:
# Predict using the model with the best parameters
y_ordinal_hgb_tune_pred = ordinal_hgb_tune.predict(X_test)

# Get the best parameters
ordinal_hgb_tune_best_params = ordinal_hgb_tune.best_params_
print(f'Best parameters: {ordinal_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
ordinal_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ordinal_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_tune_accuracy)
time_summary.append(ordinal_hgb_tune_time_taken)
n_models_summary.append(ordinal_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.03505810211332009, 'model__max_iter': 113, 'model__min_samples_leaf': 33}
Balanced accuracy with best parameters: 0.9168920617516909


In [69]:
# Predict using the model with the best parameters
y_ordinal_hgb_tune_pred9 = ordinal_hgb_tune9.predict(X_test)

# Get the best parameters
ordinal_hgb_tune_best_params9 = ordinal_hgb_tune9.best_params_
print(f'Best parameters: {ordinal_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
ordinal_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_ordinal_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {ordinal_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_tune_accuracy9)
time_summary.append(ordinal_hgb_tune9_time_taken)
n_models_summary.append(ordinal_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.165443649776894, 'model__max_iter': 40, 'model__min_samples_leaf': 24}
Balanced accuracy with best parameters: 0.9032532612876748


### Native HistGradientBoosting support for categorical variables


In [70]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [71]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OrdinalEncoder(dtype = int,
                               handle_unknown = 'use_encoded_value',
                               unknown_value = 99999,
                               encoded_missing_value = 99999,
                               max_categories = 254))
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("ordinal_encoding", cat_pipeline_more9, cat_cols_more9)
                    ],
    sparse_threshold=0
)

In [72]:
category_features_for_nativesupport = [False]*len(num_cols) + [True]*len(cat_cols)

X_train_check = preprop_pipeline9.fit_transform(X_train)
category_features_for_nativesupport_9 = [False]*(X_train_check.shape[1]-len(cat_cols_more9)) + [True]*len(cat_cols_more9)

In [73]:
hgb_default_categories_support = HistGradientBoostingClassifier(random_state=1234,
                                                                early_stopping=False,
                                                                min_samples_leaf=30,
                                                                scoring='balanced_accuracy',
                                                                class_weight='balanced',
                                                                categorical_features=category_features_for_nativesupport)

hgb_default_categories_support9 = HistGradientBoostingClassifier(random_state=1234,
                                                                 early_stopping=False,
                                                                 min_samples_leaf=30,
                                                                 scoring='balanced_accuracy',
                                                                 class_weight='balanced',
                                                                 categorical_features=category_features_for_nativesupport_9)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [74]:
catsup_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                        ('model', hgb_default_categories_support)])

catsup_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                        ('model', hgb_default_categories_support9)])

In [75]:
tic = time.time()

catsup_hgb_default = catsup_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
catsup_hgb_default_time_taken = toc-tic

In [76]:
# Display pipeline
print("Time taken: ", catsup_hgb_default_time_taken)
catsup_hgb_default

Time taken:  0.6746971607208252


In [77]:
tic = time.time()

catsup_hgb_default9 = catsup_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
catsup_hgb_default9_time_taken = toc-tic

In [78]:
# Display pipeline
print("Time taken: ", catsup_hgb_default9_time_taken)
catsup_hgb_default9

Time taken:  0.6633844375610352


In [79]:
# Calculate balanced accuracy for the model with default parameters
y_catsup_hgb_default_pred = catsup_hgb_default.predict(X_test)
catsup_hgb_default_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {catsup_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_default_accuracy)
time_summary.append(catsup_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8932005900151745


In [80]:
# Calculate balanced accuracy for the model with default parameters
y_catsup_hgb_default_pred9 = catsup_hgb_default9.predict(X_test)
catsup_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_catsup_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {catsup_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_default_accuracy9)
time_summary.append(catsup_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8989038681128299


#### Create a HistGradientBoostingClassifier model for tuning

In [81]:
catsup_hgb_tune = RandomizedSearchCV(estimator = catsup_hgb_default_pipeline, 
                                   param_distributions = param_distributions, 
                                   n_iter = 100,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

catsup_hgb_tune9 = RandomizedSearchCV(estimator = catsup_hgb_default_pipeline9, 
                                   param_distributions = param_distributions, 
                                   n_iter = 100,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

In [82]:
tic = time.time() 

catsup_hgb_tune = catsup_hgb_tune.fit(X_train, y_train)

toc = time.time()
catsup_hgb_tune_time_taken = toc-tic

In [83]:
# Display pipeline
print("Time taken: ", catsup_hgb_tune_time_taken)
catsup_hgb_tune

Time taken:  22.051368951797485


In [84]:
tic = time.time() 

catsup_hgb_tune9 = catsup_hgb_tune9.fit(X_train, y_train)

toc = time.time()
catsup_hgb_tune9_time_taken = toc-tic

In [85]:
# Display pipeline
print("Time taken: ", catsup_hgb_tune9_time_taken)
catsup_hgb_tune9

Time taken:  21.516291618347168


In [86]:
# Predict using the model with the best parameters
y_catsup_hgb_tune_pred = catsup_hgb_tune.predict(X_test)

# Get the best parameters
catsup_hgb_tune_best_params = catsup_hgb_tune.best_params_
print(f'Best parameters: {catsup_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catsup_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {catsup_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_tune_accuracy)
time_summary.append(catsup_hgb_tune_time_taken)
n_models_summary.append(catsup_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.07630670424747499, 'model__max_iter': 45, 'model__min_samples_leaf': 30}
Balanced accuracy with best parameters: 0.9047510154136922


In [87]:
# Predict using the model with the best parameters
y_catsup_hgb_tune_pred9 = catsup_hgb_tune9.predict(X_test)

# Get the best parameters
catsup_hgb_tune_best_params9 = catsup_hgb_tune9.best_params_
print(f'Best parameters: {catsup_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
catsup_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_catsup_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {catsup_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_tune_accuracy9)
time_summary.append(catsup_hgb_tune9_time_taken)
n_models_summary.append(catsup_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.06745583511366768, 'model__max_iter': 63, 'model__min_samples_leaf': 22}
Balanced accuracy with best parameters: 0.9040165252710042


### Target Encoder (scikit-learn)

In [88]:
from sklearn.preprocessing import TargetEncoder

#### Preprocessing

In [89]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", TargetEncoder())
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("target_encoding", cat_pipeline_more9, cat_cols_more9)
                    ],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [90]:
target_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])

target_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                       ('model', hgb_default9)])

In [91]:
tic = time.time()

target_hgb_default = target_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
target_hgb_default_time_taken = toc-tic

In [92]:
# Display pipeline
print("Time taken: ", target_hgb_default_time_taken)
target_hgb_default

Time taken:  0.586390495300293


In [93]:
tic = time.time()

target_hgb_default9 = target_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
target_hgb_default9_time_taken = toc-tic

In [94]:
# Display pipeline
print("Time taken: ", target_hgb_default9_time_taken)
target_hgb_default9

Time taken:  0.6000323295593262


In [95]:
# Calculate balanced accuracy for the model with default parameters
y_target_hgb_default_pred = target_hgb_default.predict(X_test)
target_hgb_default_accuracy = balanced_accuracy_score(y_test, y_target_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {target_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_default_accuracy)
time_summary.append(target_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.9057232683448377


In [96]:
# Calculate balanced accuracy for the model with default parameters
y_target_hgb_default_pred9 = target_hgb_default9.predict(X_test)
target_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_target_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {target_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_default_accuracy9)
time_summary.append(target_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8989038681128299


#### Create a HistGradientBoostingClassifier model for tuning

In [97]:
target_hgb_tune = RandomizedSearchCV(estimator = target_hgb_default_pipeline, 
                                     param_distributions = param_distributions, 
                                     n_iter = 100,
                                     cv = stratified_kfold,
                                     scoring = 'balanced_accuracy', 
                                     random_state = 1234,
                                     n_jobs = -1)

target_hgb_tune9 = RandomizedSearchCV(estimator = target_hgb_default_pipeline9, 
                                      param_distributions = param_distributions, 
                                      n_iter = 100,
                                      cv = stratified_kfold,
                                      scoring = 'balanced_accuracy', 
                                      random_state = 1234,
                                      n_jobs = -1)

In [98]:
tic = time.time() 

target_hgb_tune = target_hgb_tune.fit(X_train, y_train)

toc = time.time()
target_hgb_tune_time_taken = toc-tic

In [99]:
# Display pipeline
print("Time taken: ", target_hgb_tune_time_taken)
target_hgb_tune

Time taken:  21.11894726753235


In [100]:
tic = time.time() 

target_hgb_tune9 = target_hgb_tune9.fit(X_train, y_train)

toc = time.time()
target_hgb_tune9_time_taken = toc-tic

In [101]:
# Display pipeline
print("Time taken: ", target_hgb_tune9_time_taken)
target_hgb_tune9

Time taken:  20.743098258972168


In [102]:
# Predict using the model with the best parameters
y_target_hgb_tune_pred = target_hgb_tune.predict(X_test)

# Get the best parameters
target_hgb_tune_best_params = target_hgb_tune.best_params_
print(f'Best parameters: {target_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
target_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_target_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {target_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_tune_accuracy)
time_summary.append(target_hgb_tune_time_taken)
n_models_summary.append(target_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.07469021889651874, 'model__max_iter': 73, 'model__min_samples_leaf': 27}
Balanced accuracy with best parameters: 0.9029004031366514


In [103]:
# Predict using the model with the best parameters
y_target_hgb_tune_pred9 = target_hgb_tune9.predict(X_test)

# Get the best parameters
target_hgb_tune_best_params9 = target_hgb_tune9.best_params_
print(f'Best parameters: {target_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
target_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_target_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {target_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_tune_accuracy9)
time_summary.append(target_hgb_tune9_time_taken)
n_models_summary.append(target_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.07630670424747499, 'model__max_iter': 45, 'model__min_samples_leaf': 30}
Balanced accuracy with best parameters: 0.909748577209301


### CatBoost

In [32]:
from catboost import CatBoostClassifier

#### Preprocessing

In [33]:
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat_less9", cat_pipeline, cat_cols_less9),
                    ("cat_more9", cat_pipeline, cat_cols_more9)],
    sparse_threshold=0
)

In [34]:
category_features_for_catboostsupport = [index for index in range(len(num_cols), len(num_cols) + len(cat_cols))]
category_features_for_catboostsupport9 = [index for index in range(len(num_cols) + len(cat_cols_less9), len(num_cols) + len(cat_cols))]
print(category_features_for_catboostsupport)
print(category_features_for_catboostsupport9)

[16, 17, 18]
[17, 18]


Catboost allows to give a maximum value of unique categories for which a variable is encoded or not by One-Hot-Encoder.

In [35]:
# Create catboost models
catboost_default_raw = CatBoostClassifier(iterations=100,
                                          eval_metric = 'BalancedAccuracy',
                                          loss_function = 'Logloss',
                                          auto_class_weights = 'Balanced',
                                          od_type='Iter',
                                          one_hot_max_size = 0,
                                          random_seed = 1234,
                                          min_data_in_leaf = 30,
                                          cat_features=category_features_for_catboostsupport,
                                          verbose = False)

catboost_default9_raw = CatBoostClassifier(iterations=100,
                                           eval_metric = 'BalancedAccuracy',
                                           loss_function = 'Logloss',
                                           auto_class_weights = 'Balanced',
                                           od_type='Iter',
                                           one_hot_max_size = 9,
                                           random_seed = 1234,
                                           min_data_in_leaf = 30,
                                           cat_features=category_features_for_catboostsupport9,
                                           verbose = False)


# Default CatBoostClassifier Pipeline
catboost_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                        ('model', catboost_default_raw)])

catboost_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline),
                                         ('model', catboost_default9_raw)])

# Define the hyperparameter search space
catboost_param_distributions = {
    'model__iterations': scipy.stats.randint(10, 300),
    'model__learning_rate': scipy.stats.uniform(0.01, 0.3),
    'model__min_data_in_leaf': scipy.stats.randint(10, 50),
}

# Create a StratifiedKFold cross-validation instance
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

catboost_tune_raw = RandomizedSearchCV(estimator = catboost_default_pipeline, 
                                   param_distributions = catboost_param_distributions, 
                                   n_iter = 100,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)


catboost_tune9_raw = RandomizedSearchCV(estimator = catboost_default_pipeline, 
                                        param_distributions = catboost_param_distributions, 
                                        n_iter = 100,
                                        cv = stratified_kfold,
                                        scoring = 'balanced_accuracy', 
                                        random_state = 1234,
                                        n_jobs = -1)

In [36]:
tic = time.time()

catboost_default = catboost_default_pipeline.fit(X_train, y_train)

toc = time.time()
catboost_default_time_taken = toc-tic

In [37]:
# Display pipeline
print("Time taken: ", catboost_default_time_taken)
catboost_default

Time taken:  2.9316344261169434


In [38]:
tic = time.time()

catboost_default9 = catboost_default_pipeline.fit(X_train, y_train)

toc = time.time()
catboost_default_time_taken9 = toc-tic

In [39]:
# Display pipeline
print("Time taken: ", catboost_default_time_taken9)
catboost_default9

Time taken:  2.7104270458221436


In [112]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred = catboost_default.predict(X_test)
catboost_default_accuracy = balanced_accuracy_score(y_test, y_catboost_default_pred)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_default_accuracy)
time_summary.append(catboost_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.9111887836540356


In [113]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred9 = catboost_default9.predict(X_test)
catboost_default_accuracy9 = balanced_accuracy_score(y_test, y_catboost_default_pred9)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_default_accuracy9)
time_summary.append(catboost_default_time_taken9)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.9111887836540356


In [40]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred = catboost_default.predict(X_test)
catboost_default_accuracy = balanced_accuracy_score(y_test, y_catboost_default_pred)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy}')
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred9 = catboost_default9.predict(X_test)
catboost_default_accuracy9 = balanced_accuracy_score(y_test, y_catboost_default_pred9)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy9}')


Balanced accuracy with default parameters: 0.9084810138084147
Balanced accuracy with default parameters: 0.9084810138084147


In [41]:
tic = time.time()

catboost_tune = catboost_tune_raw.fit(X_train, y_train)

toc = time.time()
catboost_tune_time_taken = toc-tic

In [42]:
# Display pipeline
print("Time taken: ", catboost_tune_time_taken)
catboost_tune

Time taken:  159.5653669834137


In [43]:
tic = time.time()

catboost_tune9 = catboost_tune9_raw.fit(X_train, y_train)

toc = time.time()
catboost_tune_time_taken9 = toc-tic

In [44]:
# Display pipeline
print("Time taken: ", catboost_tune_time_taken9)
catboost_tune9

Time taken:  152.8892126083374


In [118]:
# Predict using the model with the best parameters
y_catboost_tune_pred = catboost_tune.predict(X_test)

# Get the best parameters
catboost_tune_best_params = catboost_tune.best_params_
print(f'Best parameters: {catboost_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catboost_tune_accuracy = balanced_accuracy_score(y_test, y_catboost_tune_pred)
print(f'Balanced accuracy with best parameters: {catboost_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_tune_accuracy)
time_summary.append(catboost_tune_time_taken)
n_models_summary.append(catboost_tune.n_iter)

Best parameters: {'model__iterations': 161, 'model__learning_rate': 0.12262148055008154, 'model__min_data_in_leaf': 30}
Balanced accuracy with best parameters: 0.9205357386244892


In [119]:
# Predict using the model with the best parameters
y_catboost_tune_pred9 = catboost_tune9.predict(X_test)

# Get the best parameters
catboost_tune_best_params9 = catboost_tune9.best_params_
print(f'Best parameters: {catboost_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
catboost_tune_accuracy9 = balanced_accuracy_score(y_test, y_catboost_tune_pred9)
print(f'Balanced accuracy with best parameters: {catboost_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_tune_accuracy9)
time_summary.append(catboost_tune_time_taken9)
n_models_summary.append(catboost_tune9.n_iter)

Best parameters: {'model__iterations': 161, 'model__learning_rate': 0.12262148055008154, 'model__min_data_in_leaf': 30}
Balanced accuracy with best parameters: 0.9205357386244892


### Results Summary

In [120]:
results_summary = pd.DataFrame({"Dataset":"churn",
                                "Model":"HistGradientBoosting",
                                "Variables":card_9_summary,
                                "Default/Tune":default_summary,
                                "Encoder":encoder_summary,
                                "Metric":"BalancedAccuracy",
                                "Value":value_summary,
                                "Time":time_summary,
                                "Iterations":n_models_summary})
results_summary["mean_Time"] = (results_summary["Time"] / results_summary["Iterations"])
results_summary

Unnamed: 0,Dataset,Model,Variables,Default/Tune,Encoder,Metric,Value,Time,Iterations,mean_Time
0,churn,HistGradientBoosting,AllVariables,Default,OneHotEncoding,BalancedAccuracy,0.887469,1.315982,1,1.315982
1,churn,HistGradientBoosting,AllVariables,Tune,OneHotEncoding,BalancedAccuracy,0.889024,95.508237,100,0.955082
2,churn,HistGradientBoosting,AllVariables,Default,CountEncoding,BalancedAccuracy,0.904636,0.605803,1,0.605803
3,churn,HistGradientBoosting,OnlyVariablesWithCard>9,Default,CountEncoding,BalancedAccuracy,0.904636,0.650198,1,0.650198
4,churn,HistGradientBoosting,AllVariables,Tune,CountEncoding,BalancedAccuracy,0.899048,25.017952,300,0.083393
5,churn,HistGradientBoosting,OnlyVariablesWithCard>9,Tune,CountEncoding,BalancedAccuracy,0.899048,21.505853,100,0.215059
6,churn,HistGradientBoosting,AllVariables,Default,OrdinalEncoder,BalancedAccuracy,0.901756,0.579818,1,0.579818
7,churn,HistGradientBoosting,OnlyVariablesWithCard>9,Default,OrdinalEncoder,BalancedAccuracy,0.911427,0.622786,1,0.622786
8,churn,HistGradientBoosting,AllVariables,Tune,OrdinalEncoder,BalancedAccuracy,0.916892,21.227329,100,0.212273
9,churn,HistGradientBoosting,OnlyVariablesWithCard>9,Tune,OrdinalEncoder,BalancedAccuracy,0.903253,20.52982,100,0.205298


In [121]:
results_summary.to_csv("churn_results.csv")