# "Kick" Dataset

In [34]:
from scipy.io import arff
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency


In [35]:
from sklearn.datasets import fetch_openml

data = fetch_openml(data_id=41162, parser='auto')

# The returned dataset is a Bunch object, similar to a dictionary
X = data['data']
y = data['target']

In [36]:
# Summary vectors creation

default_summary  = []
encoder_summary  = []
value_summary    = []
time_summary     = []
n_models_summary = []
card_9_summary   = []

### Description

One of the biggest challenges of an auto dealership purchasing a used car at an auto auction is the risk of that the vehicle might have serious issues that prevent it from being sold to customers. The auto community calls these unfortunate purchases "kicks".

Kicked cars often result when there are tampered odometers, mechanical issues the dealer is not able to address, issues with getting the vehicle title from the seller, or some other unforeseen problem. Kick cars can be very costly to dealers after transportation cost, throw-away repair work, and market losses in reselling the vehicle.

Modelers who can figure out which cars have a higher risk of being kick can provide real value to dealerships trying to provide the best inventory selection possible to their customers.

The challenge of this competition is to predict if the car purchased at the Auction is a Kick (bad buy).

In [37]:
X.head()

Unnamed: 0,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,Color,Transmission,...,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,1260144000,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,RED,AUTO,...,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
1,1260144000,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,WHITE,AUTO,...,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
2,1260144000,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,MAROON,AUTO,...,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
3,1260144000,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,SILVER,AUTO,...,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
4,1260144000,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,SILVER,MANUAL,...,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020


In [38]:
X.columns

Index(['PurchDate', 'Auction', 'VehYear', 'VehicleAge', 'Make', 'Model',
       'Trim', 'SubModel', 'Color', 'Transmission', 'WheelTypeID', 'WheelType',
       'VehOdo', 'Nationality', 'Size', 'TopThreeAmericanName',
       'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
       'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
       'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
       'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
       'PRIMEUNIT', 'AUCGUART', 'BYRNO', 'VNZIP1', 'VNST', 'VehBCost',
       'IsOnlineSale', 'WarrantyCost'],
      dtype='object')

In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72983 entries, 0 to 72982
Data columns (total 32 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   PurchDate                          72983 non-null  int64   
 1   Auction                            72983 non-null  category
 2   VehYear                            72983 non-null  int64   
 3   VehicleAge                         72983 non-null  int64   
 4   Make                               72983 non-null  category
 5   Model                              72983 non-null  category
 6   Trim                               70623 non-null  category
 7   SubModel                           72975 non-null  category
 8   Color                              72975 non-null  category
 9   Transmission                       72974 non-null  category
 10  WheelTypeID                        69814 non-null  category
 11  WheelType                          69809 

In [40]:
X.shape

(72983, 32)

In [41]:
X.describe()

Unnamed: 0,PurchDate,VehYear,VehicleAge,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,VehBCost,WarrantyCost
count,72983.0,72983.0,72983.0,72983.0,72965.0,72965.0,72965.0,72965.0,72668.0,72668.0,72668.0,72668.0,72915.0,72983.0
mean,1263092000.0,2005.343052,4.176644,71499.995917,6128.909217,7373.636031,8497.034332,9850.92824,6132.081287,7390.681827,8775.723331,10145.385314,6729.249949,1276.580985
std,18196020.0,1.731252,1.71221,14578.913128,2461.992768,2722.491986,3156.285284,3385.789541,2434.567723,2686.248852,3090.702941,3310.254351,1764.962643,598.846788
min,1231114000.0,2001.0,0.0,4825.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,462.0
25%,1248048000.0,2004.0,3.0,61837.0,4273.0,5406.0,6280.0,7493.0,4275.0,5414.0,6536.0,7784.0,5430.0,837.0
50%,1264032000.0,2005.0,4.0,73361.0,6097.0,7303.0,8444.0,9789.0,6062.0,7313.0,8729.0,10103.0,6700.0,1155.0
75%,1279066000.0,2007.0,5.0,82436.0,7765.0,9021.0,10651.0,12088.0,7736.0,9013.0,10911.0,12309.0,7900.0,1623.0
max,1293667000.0,2010.0,9.0,115717.0,35722.0,36859.0,39080.0,41482.0,35722.0,36859.0,39080.0,41062.0,45469.0,7498.0


Check the unique values of all variables.

In [42]:
X.nunique().sort_values(ascending=False)

VehOdo                               39947
MMRAcquisitonRetailCleanPrice        13456
MMRCurrentRetailCleanPrice           13192
MMRAcquisitionRetailAveragePrice     12725
MMRCurrentRetailAveragePrice         12493
MMRAcquisitionAuctionCleanPrice      11379
MMRCurrentAuctionCleanPrice          11265
MMRAcquisitionAuctionAveragePrice    10342
MMRCurrentAuctionAveragePrice        10315
VehBCost                              2010
Model                                 1063
SubModel                               863
PurchDate                              517
WarrantyCost                           281
VNZIP1                                 153
Trim                                   134
BYRNO                                   74
VNST                                    37
Make                                    33
Color                                   16
Size                                    12
VehicleAge                              10
VehYear                                 10
WheelTypeID

We check for duplicate rows.

In [43]:
X.duplicated().sum()

0

## Study of NA's

In [44]:
X.isna().sum().sort_values(ascending = False)

AUCGUART                             69564
PRIMEUNIT                            69564
WheelType                             3174
WheelTypeID                           3169
Trim                                  2360
MMRCurrentRetailCleanPrice             315
MMRCurrentRetailAveragePrice           315
MMRCurrentAuctionCleanPrice            315
MMRCurrentAuctionAveragePrice          315
VehBCost                                68
MMRAcquisitonRetailCleanPrice           18
MMRAcquisitionRetailAveragePrice        18
MMRAcquisitionAuctionCleanPrice         18
MMRAcquisitionAuctionAveragePrice       18
Transmission                             9
Color                                    8
SubModel                                 8
Size                                     5
TopThreeAmericanName                     5
Nationality                              5
VNZIP1                                   0
VNST                                     0
IsOnlineSale                             0
BYRNO      

As can be seen, there are 2 variables whose 95% are np.nan. Therefore we will eliminate these two features. The rest, we will impute them in our preprocessing pipelines.

In [45]:
X = X.drop(columns=['AUCGUART', 'PRIMEUNIT'])

## Type of Variables

In [46]:
num_cols = X.select_dtypes(exclude=['object','category']).columns
binary_cols = [col for col in X.columns if X[col].isin([0, 1, 0.0, 1.0, '1','0']).all()]
num_cols = [num_col for num_col in num_cols if num_col not in binary_cols]

print(f'Numeric columns: {len(num_cols)}')
print(f'Binary columns: {len(binary_cols)}')

Numeric columns: 14
Binary columns: 1


In [47]:
for col in binary_cols:
    X[col] = X[col].astype(float)

In [48]:
cat_columns = X.select_dtypes(include=['category','object']).columns
print(f'Category columns: {len(cat_columns)}')

Category columns: 15


General review of the values of all variables.

In [49]:
X[X.select_dtypes(include=['category']).columns].apply(lambda col: col.nunique()).sort_values(ascending=False)

Model                   1063
SubModel                 863
VNZIP1                   153
Trim                     134
BYRNO                     74
VNST                      37
Make                      33
Color                     16
Size                      12
WheelTypeID                4
Nationality                4
TopThreeAmericanName       4
Auction                    3
Transmission               3
WheelType                  3
dtype: int64

## Value counts of the variables with more cardinality

#### Model

In [50]:
X.Model.value_counts()[0:10]

Model
PT CRUISER              2329
IMPALA                  1990
TAURUS                  1425
CALIBER                 1375
CARAVAN GRAND FWD V6    1289
MALIBU 4C               1225
TAURUS 3.0L V6 EFI      1160
SEBRING 4C              1157
COBALT                  1106
PT CRUISER 2.4L I4 S    1104
Name: count, dtype: int64

#### SubModel

In [51]:
X.SubModel.value_counts()[0:10]

SubModel
4D SEDAN            15236
4D SEDAN LS          4718
4D SEDAN SE          3859
4D WAGON             2230
MINIVAN 3.3L         1258
4D SUV 4.2L LS       1193
4D SEDAN LT          1129
4D SEDAN SXT FFV     1094
2D COUPE             1072
4D SEDAN LX          1068
Name: count, dtype: int64

#### VNZIP1

In [52]:
X.VNZIP1.value_counts()[0:10]

VNZIP1
32824    3699
27542    3402
75236    2431
74135    2321
80022    2118
85226    2086
85040    2012
29697    1999
95673    1970
28273    1887
Name: count, dtype: int64

## Response variable distribution

In [53]:
y.value_counts()

IsBadBuy
0    64007
1     8976
Name: count, dtype: int64

In [54]:
y.value_counts(normalize=True)

IsBadBuy
0    0.877012
1    0.122988
Name: proportion, dtype: float64

## Train-Test Split

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size = 0.33, 
                                                    random_state = 42,
                                                    stratify = y)

## Pipelines (Encoding in all variables)

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.metrics import balanced_accuracy_score

import scipy.stats
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

import time

In [58]:
num_cols = X_train.select_dtypes(include=['number']).columns.to_list()
cat_cols = X_train.select_dtypes(include=['category','object']).columns.to_list()

In [59]:
cat_cols_less9 = [col for col in cat_cols if X[col].nunique() <= 9]
cat_cols_more9 = [col for col in cat_cols if X[col].nunique() > 9]

In [60]:
# Define the HistGradientBoostingClassifier models
hgb_default = HistGradientBoostingClassifier(random_state=1234,
                                             max_iter = 50,
                                             early_stopping=False,
                                             min_samples_leaf = 30,
                                             scoring='balanced_accuracy',
                                             class_weight='balanced')

hgb_default9 = HistGradientBoostingClassifier(random_state=1234,
                                              max_iter = 50,
                                              early_stopping=False,
                                              min_samples_leaf = 30,
                                              scoring='balanced_accuracy',
                                              class_weight='balanced')


# Define the hyperparameter search space
param_distributions = {
    'model__max_iter': scipy.stats.randint(10,150),
    'model__learning_rate': scipy.stats.uniform(0.01, 0.3),
    'model__min_samples_leaf': scipy.stats.randint(10, 50),
}

# Create a StratifiedKFold cross-validation instance
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

### One Hot Encoding + HistGradientBoosting

#### Preprocessing

In [61]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "median"))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot", cat_pipeline, cat_cols)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [62]:
ohe_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                     ('model', hgb_default)])

In [63]:
tic = time.time()

ohe_hgb_default = ohe_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
ohe_hgb_default_time_taken = toc-tic

In [64]:
# Display pipeline
print("Time taken: ", ohe_hgb_default_time_taken)
ohe_hgb_default

Time taken:  28.22479796409607


In [65]:
# Predict using the model with the best parameters
y_ohe_hgb_default_pred = ohe_hgb_default.predict(X_test)
ohe_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ohe_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_default_accuracy)
time_summary.append(ohe_hgb_default_time_taken)
n_models_summary.append(1)



Balanced accuracy with default parameters: 0.6578025693885238


#### Create a HistGradientBoostingClassifier model for tuning

In [66]:
ohe_hgb_tune = RandomizedSearchCV(estimator = ohe_hgb_default_pipeline, 
                                  param_distributions = param_distributions, 
                                  n_iter = 20,
                                  cv = stratified_kfold,
                                  scoring = 'balanced_accuracy', 
                                  random_state = 1234,
                                  n_jobs = -1)

In [67]:
tic = time.time()

ohe_hgb_tune = ohe_hgb_tune.fit(X_train, y_train)

toc = time.time()
ohe_hgb_tune_time_taken = toc-tic

In [68]:
# Display pipeline
print("Time taken: ", ohe_hgb_tune_time_taken)
ohe_hgb_tune

Time taken:  826.6504507064819


In [69]:
# Predict using the model with the best parameters
y_ohe_hgb_tune_pred = ohe_hgb_tune.predict(X_test)

# Get the best parameters
ohe_hgb_tune_best_params = ohe_hgb_tune.best_params_
print(f'Best parameters: {ohe_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
ohe_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ohe_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_tune_accuracy)
time_summary.append(ohe_hgb_tune_time_taken)
n_models_summary.append(ohe_hgb_tune.n_iter)



Best parameters: {'model__learning_rate': 0.0695556277078513, 'model__max_iter': 134, 'model__min_samples_leaf': 36}
Balanced accuracy with best parameters: 0.661100685375069


### Count Encoder + HistGradientBoosting


In [70]:
from category_encoders.count import CountEncoder

#### Preprocessing

In [71]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", CountEncoder())
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("count_encoder", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("count_encoder", cat_pipeline_more9, cat_cols_more9),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [72]:
count_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])
count_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                       ('model', hgb_default9)])

In [73]:
tic = time.time()

count_hgb_default = count_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
count_hgb_default_time_taken = toc-tic

In [74]:
# Display pipeline
print("Time taken: ", count_hgb_default_time_taken)
count_hgb_default

Time taken:  1.2162466049194336


In [75]:
tic = time.time()

count_hgb_default9 = count_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
count_hgb_default9_time_taken = toc-tic

In [76]:
# Display pipeline
print("Time taken: ", count_hgb_default9_time_taken)
count_hgb_default9

Time taken:  1.1777403354644775


In [77]:
# Calculate balanced accuracy for the model with default parameters
y_count_hgb_default_pred = count_hgb_default.predict(X_test)
count_hgb_default_accuracy = balanced_accuracy_score(y_test, y_count_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {count_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_default_accuracy)
time_summary.append(count_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6579489020979112


In [78]:
# Calculate balanced accuracy for the model with default parameters
y_count_hgb_default_pred9 = count_hgb_default9.predict(X_test)
count_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_count_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {count_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_default_accuracy9)
time_summary.append(count_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6538987841478817


#### Create a HistGradientBoostingClassifier model for tuning

In [79]:
count_hgb_tune = RandomizedSearchCV(estimator = count_hgb_default_pipeline, 
                                   param_distributions = param_distributions, 
                                   n_iter = 20,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

count_hgb_tune9 = RandomizedSearchCV(estimator = count_hgb_default_pipeline9, 
                                   param_distributions = param_distributions, 
                                   n_iter = 20,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

In [80]:
tic = time.time()

count_hgb_tune = count_hgb_tune.fit(X_train, y_train)

toc = time.time()
count_hgb_tune_time_taken = toc-tic

In [81]:
# Display pipeline
print("Time taken: ", count_hgb_tune_time_taken)
count_hgb_tune

Time taken:  20.127923488616943


In [82]:
tic = time.time()

count_hgb_tune9 = count_hgb_tune9.fit(X_train, y_train)

toc = time.time()
count_hgb_tune9_time_taken = toc-tic

In [83]:
# Display pipeline
print("Time taken: ", count_hgb_tune9_time_taken)
count_hgb_tune9

Time taken:  19.18990397453308


In [84]:
# Predict using the model with the best parameters
y_count_hgb_tune_pred = count_hgb_tune.predict(X_test)

# Get the best parameters
count_hgb_tune_best_params = count_hgb_tune.best_params_
print(f'Best parameters: {count_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
count_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_count_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {count_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_tune_accuracy)
time_summary.append(count_hgb_tune_time_taken)
n_models_summary.append(count_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.03428408698855421, 'model__max_iter': 143, 'model__min_samples_leaf': 25}
Balanced accuracy with best parameters: 0.6587269324396641


In [85]:
# Predict using the model with the best parameters
y_count_hgb_tune_pred9 = count_hgb_tune9.predict(X_test)

# Get the best parameters
count_hgb_tune_best_params9 = count_hgb_tune9.best_params_
print(f'Best parameters: {count_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
count_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_count_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {count_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_tune_accuracy9)
time_summary.append(count_hgb_tune9_time_taken)
n_models_summary.append(count_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.03428408698855421, 'model__max_iter': 143, 'model__min_samples_leaf': 25}
Balanced accuracy with best parameters: 0.6574979566484374


### Ordinal Encoding + HistGradientBoosting

In [86]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [87]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OrdinalEncoder(dtype = int,
                               handle_unknown = 'use_encoded_value',
                               unknown_value = 99999,
                               encoded_missing_value = 99999))
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("count_encoder", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("ordinal_encoder", cat_pipeline_more9, cat_cols_more9),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [88]:
ordinal_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])

ordinal_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                       ('model', hgb_default9)])

In [89]:
tic = time.time()

ordinal_hgb_default = ordinal_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_default_time_taken = toc-tic

In [90]:
# Display pipeline
print("Time taken: ", ordinal_hgb_default_time_taken)
ordinal_hgb_default

Time taken:  0.8619387149810791


In [91]:
tic = time.time()

ordinal_hgb_default9 = ordinal_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_default9_time_taken = toc-tic

In [92]:
# Display pipeline
print("Time taken: ", ordinal_hgb_default9_time_taken)
ordinal_hgb_default9

Time taken:  1.0342321395874023


In [93]:
# Calculate balanced accuracy for the model with default parameters
y_ordinal_hgb_default_pred = ordinal_hgb_default.predict(X_test)
ordinal_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ordinal_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_default_accuracy)
time_summary.append(ordinal_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6544302921031355


In [94]:
# Calculate balanced accuracy for the model with default parameters
y_ordinal_hgb_default_pred9 = ordinal_hgb_default9.predict(X_test)
ordinal_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_ordinal_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {ordinal_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_default_accuracy9)
time_summary.append(ordinal_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6555559695162538


#### Create a HistGradientBoostingClassifier model for tuning

In [95]:
ordinal_hgb_tune = RandomizedSearchCV(estimator = ordinal_hgb_default_pipeline, 
                                      param_distributions = param_distributions, 
                                      n_iter = 20,
                                      cv = stratified_kfold,
                                      scoring = 'balanced_accuracy', 
                                      random_state = 1234,
                                      n_jobs = -1)

ordinal_hgb_tune9 = RandomizedSearchCV(estimator = ordinal_hgb_default_pipeline9, 
                                       param_distributions = param_distributions, 
                                       n_iter = 20,
                                       cv = stratified_kfold,
                                       scoring = 'balanced_accuracy', 
                                       random_state = 1234,
                                       n_jobs = -1)

In [96]:
tic = time.time()

ordinal_hgb_tune = ordinal_hgb_tune.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_tune_time_taken = toc-tic

In [97]:
# Display pipeline
print("Time taken: ", ordinal_hgb_tune_time_taken)
ordinal_hgb_tune

Time taken:  14.685995101928711


In [98]:
tic = time.time()

ordinal_hgb_tune9 = ordinal_hgb_tune9.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_tune9_time_taken = toc-tic

In [99]:
# Display pipeline
print("Time taken: ", ordinal_hgb_tune9_time_taken)
ordinal_hgb_tune9

Time taken:  17.75323748588562


In [100]:
# Predict using the model with the best parameters
y_ordinal_hgb_tune_pred = ordinal_hgb_tune.predict(X_test)

# Get the best parameters
ordinal_hgb_tune_best_params = ordinal_hgb_tune.best_params_
print(f'Best parameters: {ordinal_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
ordinal_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ordinal_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_tune_accuracy)
time_summary.append(ordinal_hgb_tune_time_taken)
n_models_summary.append(ordinal_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.04523301690773688, 'model__max_iter': 125, 'model__min_samples_leaf': 27}
Balanced accuracy with best parameters: 0.6570259615372014


In [101]:
# Predict using the model with the best parameters
y_ordinal_hgb_tune_pred9 = ordinal_hgb_tune9.predict(X_test)

# Get the best parameters
ordinal_hgb_tune_best_params9 = ordinal_hgb_tune9.best_params_
print(f'Best parameters: {ordinal_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
ordinal_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_ordinal_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {ordinal_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_tune_accuracy9)
time_summary.append(ordinal_hgb_tune9_time_taken)
n_models_summary.append(ordinal_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.044841348285564624, 'model__max_iter': 130, 'model__min_samples_leaf': 38}
Balanced accuracy with best parameters: 0.6582257602915664


### Native HistGradientBoosting support for categorical variables


In [102]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [103]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OrdinalEncoder(dtype = int,
                               handle_unknown = 'use_encoded_value',
                               unknown_value = 99999,
                               encoded_missing_value = 99999,
                               max_categories = 254))
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("ordinal_encoding", cat_pipeline_more9, cat_cols_more9)],
    sparse_threshold=0
)

In [104]:
category_features_for_nativesupport = [False]*len(num_cols) + [True]*len(cat_cols)

X_train_check = preprop_pipeline9.fit_transform(X_train)
category_features_for_nativesupport9 = [False]*(X_train_check.shape[1]-len(cat_cols_more9)) + [True]*len(cat_cols_more9)
print(category_features_for_nativesupport)
print(category_features_for_nativesupport9)

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True]


In [105]:
hgb_default_categories_support = HistGradientBoostingClassifier(random_state=1234,
                                                                max_iter = 50,
                                                                early_stopping=False,
                                                                min_samples_leaf = 30,
                                                                scoring='balanced_accuracy',
                                                                class_weight='balanced',
                                                                categorical_features=category_features_for_nativesupport)

hgb_default_categories_support9 = HistGradientBoostingClassifier(random_state=1234,
                                                                 max_iter = 50,
                                                                 early_stopping=False,
                                                                 min_samples_leaf = 30,
                                                                 scoring='balanced_accuracy',
                                                                 class_weight='balanced',
                                                                 categorical_features=category_features_for_nativesupport9,)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [106]:
catsup_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                        ('model', hgb_default_categories_support)])

catsup_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                        ('model', hgb_default_categories_support9)])

In [107]:
tic = time.time()

catsup_hgb_default = catsup_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
catsup_hgb_default_time_taken = toc-tic

In [108]:
# Display pipeline
print("Time taken: ", catsup_hgb_default_time_taken)
catsup_hgb_default

Time taken:  1.8780512809753418


In [109]:
tic = time.time()

catsup_hgb_default9 = catsup_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
catsup_hgb_default9_time_taken = toc-tic

In [110]:
# Display pipeline
print("Time taken: ", catsup_hgb_default9_time_taken)
catsup_hgb_default9

Time taken:  1.9042460918426514


In [111]:
# Calculate balanced accuracy for the model with default parameters
y_catsup_hgb_default_pred = catsup_hgb_default.predict(X_test)
catsup_hgb_default_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {catsup_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_default_accuracy)
time_summary.append(catsup_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6221021112858697


In [112]:
# Calculate balanced accuracy for the model with default parameters
y_catsup_hgb_default_pred9 = catsup_hgb_default9.predict(X_test)
catsup_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_catsup_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {catsup_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_default_accuracy9)
time_summary.append(catsup_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6241782200859932


#### Create a HistGradientBoostingClassifier model for tuning

In [113]:
catsup_hgb_tune = RandomizedSearchCV(estimator = catsup_hgb_default_pipeline, 
                                     param_distributions = param_distributions, 
                                     n_iter = 20,
                                     cv = stratified_kfold,
                                     scoring = 'balanced_accuracy', 
                                     random_state = 1234,
                                     n_jobs = -1)

catsup_hgb_tune9 = RandomizedSearchCV(estimator = catsup_hgb_default_pipeline9, 
                                      param_distributions = param_distributions, 
                                      n_iter = 20,
                                      cv = stratified_kfold,
                                      scoring = 'balanced_accuracy', 
                                      random_state = 1234,
                                      n_jobs = -1)

In [114]:
tic = time.time() 

catsup_hgb_tune = catsup_hgb_tune.fit(X_train, y_train)

toc = time.time()
catsup_hgb_tune_time_taken = toc-tic

In [115]:
# Display pipeline
print("Time taken: ", catsup_hgb_tune_time_taken)
catsup_hgb_tune

Time taken:  19.700146675109863


In [116]:
tic = time.time() 

catsup_hgb_tune9 = catsup_hgb_tune9.fit(X_train, y_train)

toc = time.time()
catsup_hgb_tune9_time_taken = toc-tic

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\VNG\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\VNG\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\VNG\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\VNG\AppData\Local\Progr

In [117]:
# Display pipeline
print("Time taken: ", catsup_hgb_tune9_time_taken)
catsup_hgb_tune9

Time taken:  16.891465425491333


In [118]:
# Predict using the model with the best parameters
y_catsup_hgb_tune_pred = catsup_hgb_tune.predict(X_test)

# Get the best parameters
catsup_hgb_tune_best_params = catsup_hgb_tune.best_params_
print(f'Best parameters: {catsup_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catsup_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {catsup_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_tune_accuracy)
time_summary.append(catsup_hgb_tune_time_taken)
n_models_summary.append(catsup_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.02420658364045454, 'model__max_iter': 80, 'model__min_samples_leaf': 32}
Balanced accuracy with best parameters: 0.6398025832618013


In [119]:
# Predict using the model with the best parameters
y_catsup_hgb_tune_pred9 = catsup_hgb_tune9.predict(X_test)

# Get the best parameters
catsup_hgb_tune_best_params9 = catsup_hgb_tune9.best_params_
print(f'Best parameters: {catsup_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
catsup_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_catsup_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {catsup_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_tune_accuracy9)
time_summary.append(catsup_hgb_tune9_time_taken)
n_models_summary.append(catsup_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.06745583511366768, 'model__max_iter': 48, 'model__min_samples_leaf': 22}
Balanced accuracy with best parameters: 0.637074662175305


### Target Encoder (scikit-learn)

In [120]:
from sklearn.preprocessing import TargetEncoder

#### Preprocessing

In [121]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", TargetEncoder())
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("target_encoding", cat_pipeline_more9, cat_cols_more9)
                    ],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [122]:
target_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])

target_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                       ('model', hgb_default9)])

In [123]:
tic = time.time()

target_hgb_default = target_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
target_hgb_default_time_taken = toc-tic

In [124]:
# Display pipeline
print("Time taken: ", target_hgb_default_time_taken)
target_hgb_default

Time taken:  0.868736743927002


In [125]:
tic = time.time()

target_hgb_default9 = target_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
target_hgb_default9_time_taken = toc-tic

In [126]:
# Display pipeline
print("Time taken: ", target_hgb_default9_time_taken)
target_hgb_default9

Time taken:  1.014829158782959


In [127]:
# Calculate balanced accuracy for the model with default parameters
y_target_hgb_default_pred = target_hgb_default.predict(X_test)
target_hgb_default_accuracy = balanced_accuracy_score(y_test, y_target_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {target_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_default_accuracy)
time_summary.append(target_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.657313120799198


In [128]:
# Calculate balanced accuracy for the model with default parameters
y_target_hgb_default_pred9 = target_hgb_default9.predict(X_test)
target_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_target_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {target_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_default_accuracy9)
time_summary.append(target_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6566591747771797


#### Create a HistGradientBoostingClassifier model for tuning

In [129]:
target_hgb_tune = RandomizedSearchCV(estimator = target_hgb_default_pipeline, 
                                     param_distributions = param_distributions, 
                                     n_iter = 20,
                                     cv = stratified_kfold,
                                     scoring = 'balanced_accuracy', 
                                     random_state = 1234,
                                     n_jobs = -1)

target_hgb_tune9 = RandomizedSearchCV(estimator = target_hgb_default_pipeline9, 
                                      param_distributions = param_distributions, 
                                      n_iter = 20,
                                      cv = stratified_kfold,
                                      scoring = 'balanced_accuracy', 
                                      random_state = 1234,
                                      n_jobs = -1)

In [130]:
tic = time.time() 

target_hgb_tune = target_hgb_tune.fit(X_train, y_train)

toc = time.time()
target_hgb_tune_time_taken = toc-tic

In [131]:
# Display pipeline
print("Time taken: ", target_hgb_tune_time_taken)
target_hgb_tune

Time taken:  15.702942371368408


In [132]:
tic = time.time() 

target_hgb_tune9 = target_hgb_tune9.fit(X_train, y_train)

toc = time.time()
target_hgb_tune9_time_taken = toc-tic

In [133]:
# Display pipeline
print("Time taken: ", target_hgb_tune9_time_taken)
target_hgb_tune9

Time taken:  20.00809955596924


In [134]:
# Predict using the model with the best parameters
y_target_hgb_tune_pred = target_hgb_tune.predict(X_test)

# Get the best parameters
target_hgb_tune_best_params = target_hgb_tune.best_params_
print(f'Best parameters: {target_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
target_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_target_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {target_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_tune_accuracy)
time_summary.append(target_hgb_tune_time_taken)
n_models_summary.append(target_hgb_tune.n_iter)

Best parameters: {'model__learning_rate': 0.03428408698855421, 'model__max_iter': 143, 'model__min_samples_leaf': 25}
Balanced accuracy with best parameters: 0.6553405341397224


In [135]:
# Predict using the model with the best parameters
y_target_hgb_tune_pred9 = target_hgb_tune9.predict(X_test)

# Get the best parameters
target_hgb_tune_best_params9 = target_hgb_tune9.best_params_
print(f'Best parameters: {target_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
target_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_target_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {target_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_tune_accuracy9)
time_summary.append(target_hgb_tune9_time_taken)
n_models_summary.append(target_hgb_tune9.n_iter)

Best parameters: {'model__learning_rate': 0.03428408698855421, 'model__max_iter': 143, 'model__min_samples_leaf': 25}
Balanced accuracy with best parameters: 0.6593669812096685


### CatBoost

In [136]:
from catboost import CatBoostClassifier

#### Preprocessing

In [137]:
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat_less9", cat_pipeline, cat_cols_less9),
                    ("cat_more9", cat_pipeline, cat_cols_more9)],
    sparse_threshold=0
)

In [138]:
category_features_for_catboostsupport  = [index for index in range(len(num_cols), len(num_cols) + len(cat_cols))]
category_features_for_catboostsupport9 = [index for index in range(len(num_cols) + len(cat_cols_less9), len(num_cols) + len(cat_cols))]
print(category_features_for_catboostsupport)
print(category_features_for_catboostsupport9)

[15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[21, 22, 23, 24, 25, 26, 27, 28, 29]


Catboost allows to give a maximum value of unique categories for which a variable is encoded or not by One-Hot-Encoder.

In [142]:
# Create catboost models
catboost_default_raw = CatBoostClassifier(iterations=50,
                                          eval_metric = 'BalancedAccuracy',
                                          loss_function = 'Logloss',
                                          auto_class_weights = 'Balanced',
                                          od_type='Iter',
                                          one_hot_max_size = 0,
                                          random_seed = 1234,
                                          min_data_in_leaf = 30,
                                          cat_features=category_features_for_catboostsupport,
                                          verbose = False)

catboost_default9_raw = CatBoostClassifier(iterations=50,
                                           eval_metric = 'BalancedAccuracy',
                                           loss_function = 'Logloss',
                                           auto_class_weights = 'Balanced',
                                           od_type='Iter',
                                           one_hot_max_size = 9,
                                           random_seed = 1234,
                                           min_data_in_leaf = 30,
                                           cat_features=category_features_for_catboostsupport9,
                                           verbose = False)

catboost_default_raw.set_params(cat_features=category_features_for_catboostsupport)
catboost_default9_raw.set_params(cat_features=category_features_for_catboostsupport9)


# Default CatBoostClassifier Pipeline
catboost_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                     ('model', catboost_default_raw)])

catboost_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', catboost_default9_raw)])

# Define the hyperparameter search space
catboost_param_distributions = {
    'model__iterations': scipy.stats.randint(10, 150),
    'model__learning_rate': scipy.stats.uniform(0.01, 0.3),
    'model__min_data_in_leaf': scipy.stats.randint(10, 50),
}

# Create a StratifiedKFold cross-validation instance
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

catboost_tune_raw = RandomizedSearchCV(estimator = catboost_default_pipeline, 
                                       param_distributions = catboost_param_distributions, 
                                       n_iter = 20,
                                       cv = stratified_kfold,
                                       scoring = 'balanced_accuracy', 
                                       random_state = 1234,
                                       n_jobs = -1)


catboost_tune9_raw = RandomizedSearchCV(estimator = catboost_default_pipeline, 
                                        param_distributions = catboost_param_distributions, 
                                        n_iter = 20,
                                        cv = stratified_kfold,
                                        scoring = 'balanced_accuracy', 
                                        random_state = 1234,
                                        n_jobs = -1)

In [143]:
tic = time.time()

catboost_default = catboost_default_pipeline.fit(X_train, y_train)

toc = time.time()
catboost_default_time_taken = toc-tic

In [144]:
# Display pipeline
print("Time taken: ", catboost_default_time_taken)
catboost_default

Time taken:  2.256887197494507


In [145]:
tic = time.time()

catboost_default9 = catboost_default_pipeline.fit(X_train, y_train)

toc = time.time()
catboost_default_time_taken9 = toc-tic

In [146]:
# Display pipeline
print("Time taken: ", catboost_default_time_taken9)
catboost_default9

Time taken:  2.2381973266601562


In [147]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred = catboost_default.predict(X_test)
catboost_default_accuracy = balanced_accuracy_score(y_test, y_catboost_default_pred)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_default_accuracy)
time_summary.append(catboost_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6574422237930353


In [148]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred9 = catboost_default9.predict(X_test)
catboost_default_accuracy9 = balanced_accuracy_score(y_test, y_catboost_default_pred9)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_default_accuracy9)
time_summary.append(catboost_default_time_taken9)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.6574422237930353


In [149]:
tic = time.time()

catboost_tune = catboost_tune_raw.fit(X_train, y_train)

toc = time.time()
catboost_tune_time_taken = toc-tic

In [150]:
# Display pipeline
print("Time taken: ", catboost_tune_time_taken)
catboost_tune

Time taken:  95.24941396713257


In [151]:
tic = time.time()

catboost_tune9 = catboost_tune9_raw.fit(X_train, y_train)

toc = time.time()
catboost_tune_time_taken9 = toc-tic

In [152]:
# Display pipeline
print("Time taken: ", catboost_tune_time_taken9)
catboost_tune9

Time taken:  100.75179505348206


In [153]:
# Predict using the model with the best parameters
y_catboost_tune_pred = catboost_tune.predict(X_test)

# Get the best parameters
catboost_tune_best_params = catboost_tune.best_params_
print(f'Best parameters: {catboost_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catboost_tune_accuracy = balanced_accuracy_score(y_test, y_catboost_tune_pred)
print(f'Balanced accuracy with best parameters: {catboost_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_tune_accuracy)
time_summary.append(catboost_tune_time_taken)
n_models_summary.append(catboost_tune.n_iter)

Best parameters: {'model__iterations': 123, 'model__learning_rate': 0.10837362525955054, 'model__min_data_in_leaf': 41}
Balanced accuracy with best parameters: 0.6547047927986054


In [154]:
# Predict using the model with the best parameters
y_catboost_tune_pred9 = catboost_tune9.predict(X_test)

# Get the best parameters
catboost_tune_best_params9 = catboost_tune9.best_params_
print(f'Best parameters: {catboost_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
catboost_tune_accuracy9 = balanced_accuracy_score(y_test, y_catboost_tune_pred9)
print(f'Balanced accuracy with best parameters: {catboost_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_tune_accuracy9)
time_summary.append(catboost_tune_time_taken9)
n_models_summary.append(catboost_tune9.n_iter)

Best parameters: {'model__iterations': 123, 'model__learning_rate': 0.10837362525955054, 'model__min_data_in_leaf': 41}
Balanced accuracy with best parameters: 0.6547047927986054


### Results Summary

In [160]:
results_summary = pd.DataFrame({"Dataset":"kick",
                                "Variables":card_9_summary,
                                "Default/Tune":default_summary,
                                "Encoder":encoder_summary,
                                "Metric":"BalancedAccuracy",
                                "Value":value_summary,
                                "Time":time_summary,
                                "Iterations":n_models_summary})
results_summary["mean_Time"] = (results_summary["Time"] / results_summary["Iterations"])
results_summary

Unnamed: 0,Dataset,Variables,Default/Tune,Encoder,Metric,Value,Time,Iterations,mean_Time
0,kick,AllVariables,Default,OneHotEncoding,BalancedAccuracy,0.657803,28.224798,1,28.224798
1,kick,AllVariables,Tune,OneHotEncoding,BalancedAccuracy,0.661101,826.650451,20,41.332523
2,kick,AllVariables,Default,CountEncoding,BalancedAccuracy,0.657949,1.216247,1,1.216247
3,kick,OnlyVariablesWithCard>9,Default,CountEncoding,BalancedAccuracy,0.653899,1.17774,1,1.17774
4,kick,AllVariables,Tune,CountEncoding,BalancedAccuracy,0.658727,20.127923,20,1.006396
5,kick,OnlyVariablesWithCard>9,Tune,CountEncoding,BalancedAccuracy,0.657498,19.189904,20,0.959495
6,kick,AllVariables,Default,OrdinalEncoder,BalancedAccuracy,0.65443,0.861939,1,0.861939
7,kick,OnlyVariablesWithCard>9,Default,OrdinalEncoder,BalancedAccuracy,0.655556,1.034232,1,1.034232
8,kick,AllVariables,Tune,OrdinalEncoder,BalancedAccuracy,0.657026,14.685995,20,0.7343
9,kick,OnlyVariablesWithCard>9,Tune,OrdinalEncoder,BalancedAccuracy,0.658226,17.753237,20,0.887662


In [161]:
results_summary.to_csv("kick_results.csv")