# "kdd Internet Usage" Dataset

In [1]:
# data manipulation
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import fetch_openml

data = fetch_openml(data_id=981, parser='auto')

# The returned dataset is a Bunch object, similar to a dictionary
X = data['data']
X = pd.DataFrame(X)
y = data['target']

In [3]:
# Summary vectors creation

default_summary  = []
encoder_summary  = []
value_summary    = []
time_summary     = []
n_models_summary = []
card_9_summary   = []

In [4]:
X.head()

Unnamed: 0,Actual_Time,Age,Community_Building,Community_Membership_Family,Community_Membership_Hobbies,Community_Membership_None,Community_Membership_Other,Community_Membership_Political,Community_Membership_Professional,Community_Membership_Religious,...,Not_Purchasing_Unfamiliar_vendor,Registered_to_Vote,Sexual_Preference,Web_Ordering,Web_Page_Creation,Who_Pays_for_Access_Dont_Know,Who_Pays_for_Access_Other,Who_Pays_for_Access_Parents,Who_Pays_for_Access_School,Who_Pays_for_Access_Self
0,Consultant,41,Equally,0,0,1,0,0,0,0,...,0,Yes,Heterosexual,Yes,Yes,0,0,0,0,1
1,College_Student,28,Equally,0,0,0,0,0,0,0,...,0,Yes,Heterosexual,Yes,No,0,0,0,0,1
2,Other,25,More,1,1,0,0,0,1,0,...,1,No,Heterosexual,Yes,Yes,0,0,0,0,1
3,Salesperson,28,More,0,0,0,1,0,0,0,...,0,Yes,Heterosexual,Yes,Yes,0,0,0,0,1
4,K-12_Student,17,More,0,0,0,0,1,1,0,...,0,No,Heterosexual,Yes,Yes,0,0,0,0,1


In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 68 columns):
 #   Column                                    Non-Null Count  Dtype   
---  ------                                    --------------  -----   
 0   Actual_Time                               10108 non-null  category
 1   Age                                       10108 non-null  category
 2   Community_Building                        10108 non-null  category
 3   Community_Membership_Family               10108 non-null  category
 4   Community_Membership_Hobbies              10108 non-null  category
 5   Community_Membership_None                 10108 non-null  category
 6   Community_Membership_Other                10108 non-null  category
 7   Community_Membership_Political            10108 non-null  category
 8   Community_Membership_Professional         10108 non-null  category
 9   Community_Membership_Religious            10108 non-null  category
 10  Community_Membership_S

In [6]:
X.shape

(10108, 68)

In [7]:
X.describe()

Unnamed: 0,Actual_Time,Age,Community_Building,Community_Membership_Family,Community_Membership_Hobbies,Community_Membership_None,Community_Membership_Other,Community_Membership_Political,Community_Membership_Professional,Community_Membership_Religious,...,Not_Purchasing_Unfamiliar_vendor,Registered_to_Vote,Sexual_Preference,Web_Ordering,Web_Page_Creation,Who_Pays_for_Access_Dont_Know,Who_Pays_for_Access_Other,Who_Pays_for_Access_Parents,Who_Pays_for_Access_School,Who_Pays_for_Access_Self
count,10108,10108,10108,10108,10108,10108,10108,10108,10108,10108,...,10108,10108,10108,10108,10108,10108,10108,10108,10108,10108
unique,46,77,4,2,2,2,2,2,2,2,...,2,4,6,3,3,2,2,2,2,2
top,Other,26,More,0,0,0,0,0,0,0,...,0,Yes,Heterosexual,Yes,No,0,0,0,0,1
freq,2878,325,4596,7362,5577,7646,8228,9147,6923,9408,...,8400,8374,8669,6888,5429,10059,9776,9525,8768,7165


We check for duplicate rows.

In [8]:
X.duplicated().sum()

0

## Study of NA's

In [9]:
X.isna().sum().sort_values(ascending = False)

Primary_Computing_Platform                  2699
Actual_Time                                    0
Not_Purchasing_Company_policy                  0
Not_Purchasing_Not_applicable                  0
Not_Purchasing_No_credit                       0
                                            ... 
How_You_Heard_About_Survey_Search_Engine       0
How_You_Heard_About_Survey_Usenet_News         0
How_You_Heard_About_Survey_WWW_Page            0
Major_Geographical_Location                    0
Who_Pays_for_Access_Self                       0
Length: 68, dtype: int64

As can be seen, there are two features with np.nan values. However, this does not mean that there are no more variables containing missing values.

In [10]:
(X=='Not_Say').any().sum()

7

In [11]:
(X=="Dont_Know").any().sum()

3

#### Not_Say

In [12]:
not_say_columns = X.columns[np.unique(np.where(X.eq('Not_Say'))[1])]
not_say_columns

Index(['Age', 'Falsification_of_Information', 'Household_Income',
       'Marital_Status', 'Primary_Language', 'Race', 'Registered_to_Vote'],
      dtype='object')

We convert these values into NaN.

In [13]:
X[not_say_columns] = X[not_say_columns].replace("Not_Say", np.nan)

In [14]:
np.unique(np.where(X.eq('Not_Say'))[1])

array([], dtype=int64)

#### Don't know

In [15]:
X.columns[np.unique(np.where(X.eq('Dont_Know'))[1])]

Index(['Community_Building', 'Most_Import_Issue_Facing_the_Internet',
       'Primary_Computing_Platform'],
      dtype='object')

We leave them as another category.

Final percentage of NAs in the dataset:

In [16]:
(X.isna().sum().sort_values(ascending = False).sum() / (X.shape[0]*X.shape[1]))*100

0.8157487371679975

## Type of Variables

Due to the information in OpenML, we converte the **Age** variable in categorical.

In [17]:
X['Age'] = X['Age'].astype(str)

Also, we convert **Gender** variable in numerical binary.

In [18]:
X['Gender'].value_counts()

Gender
Male      6217
Female    3891
Name: count, dtype: int64

In [19]:
X['Gender'] = X['Gender'].map({'Male':0, 'Female':1}).astype(float)

We see which and how many variables contain only the values {"0", "1"}.

In [20]:
variables_01 = X.columns[X.isin(['0', '1',]).all()]
len(variables_01)

47

In [21]:
for col in variables_01:
    X[col] = X[col].astype(float)

Variable types:

In [22]:
num_columns = X.select_dtypes(include=['number']).columns
len(num_columns)

48

In [23]:
cat_columns = X.select_dtypes(include=['category','object']).columns
len(cat_columns)

20

Cardinality of the 20 variables with the highest number of unique values.

In [24]:
X[cat_columns].apply(lambda col: col.nunique()).sort_values(ascending=False)[0:20]

Country                                  129
Primary_Language                         118
Age                                       77
Actual_Time                               46
Primary_Computing_Platform                11
Major_Geographical_Location               10
Primary_Place_of_WWW_Access                9
Most_Import_Issue_Facing_the_Internet      9
Education_Attainment                       9
Household_Income                           8
Race                                       7
Marital_Status                             6
Falsification_of_Information               6
Sexual_Preference                          6
Major_Occupation                           5
Opinions_on_Censorship                     4
Community_Building                         4
Registered_to_Vote                         3
Web_Ordering                               3
Web_Page_Creation                          3
dtype: int64

## Value counts of the variables with more cardinality

#### Country

In [25]:
X.Country.value_counts()[0:10]

Country
California      1021
New_York         603
Texas            494
Florida          474
Georgia          431
Pennsylvania     393
Ohio             318
Illinois         318
Michigan         307
Washington       301
Name: count, dtype: int64

#### Primary Language

In [26]:
X.Primary_Language.value_counts()[0:10]

Primary_Language
English       9412
German          95
French          74
Dutch           73
Spanish         72
Chinese         32
Swedish         30
Italian         24
Norwegian       22
Portuguese      17
Name: count, dtype: int64

#### Actual Time

In [27]:
X.Actual_Time.value_counts()[0:10]

Actual_Time
Other                          2878
College_Student                1458
Retired                         494
Manager                         490
Programmer                      477
Service_Industry_Occupation     327
K-12_Student                    326
Homemaker                       253
Administrator/Secretary         231
Engineer                        226
Name: count, dtype: int64

## Response variable distribution

In [28]:
y.value_counts()

Who_Pays_for_Access_Work
0    7393
1    2715
Name: count, dtype: int64

In [29]:
y.value_counts(normalize=True)

Who_Pays_for_Access_Work
0    0.731401
1    0.268599
Name: proportion, dtype: float64

## Train-Test Split

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size = 0.33, 
                                                    random_state = 42,
                                                    stratify = y)

In [32]:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape:  {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape:  {y_test.shape}')

X_train shape: (6772, 68)
X_test shape:  (3336, 68)
y_train shape: (6772,)
y_test shape:  (3336,)


## Pipelines

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score

import scipy.stats
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

import time

In [34]:
num_cols = X_train.select_dtypes(include=['number']).columns.to_list()
cat_cols = X_train.select_dtypes(include=['category']).columns.to_list()

cat_cols_less9 = [col for col in cat_cols if X[col].nunique() <= 9]
cat_cols_more9 = [col for col in cat_cols if X[col].nunique() > 9]

In [35]:
# Define the HistGradientBoostingClassifier
hgb_default = HistGradientBoostingClassifier(max_iter=100, random_state=1234,
                                             early_stopping=True,
                                             scoring='balanced_accuracy',
                                             validation_fraction=0.1,
                                             n_iter_no_change=5,
                                             class_weight='balanced')

hgb_default9 = HistGradientBoostingClassifier(max_iter=1000, random_state=1234,
                                              early_stopping=True,
                                              scoring='balanced_accuracy',
                                              validation_fraction=0.1,
                                              n_iter_no_change=5,
                                              class_weight='balanced')


# Define the hyperparameter search space
param_distributions = {
    'model__learning_rate': scipy.stats.uniform(0.01, 0.3),
    'model__min_samples_leaf': scipy.stats.randint(1, 10),
}

# Create a StratifiedKFold cross-validation instance
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

### One Hot Encoding + HistGradientBoosting

#### Preprocessing

In [36]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "median"))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot", cat_pipeline, cat_cols)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [37]:
ohe_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                     ('model', hgb_default)])

In [38]:
tic = time.time()

ohe_hgb_default = ohe_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
ohe_hgb_default_time_taken = toc-tic

In [39]:
# Display pipeline
print("Time taken: ", ohe_hgb_default_time_taken)
ohe_hgb_default

Time taken:  1.3906311988830566


In [40]:
# # Calculate balanced accuracy for the model with default parameters
y_ohe_hgb_default_pred = ohe_hgb_default.predict(X_test)
ohe_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ohe_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_default_accuracy)
time_summary.append(ohe_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8641009221311475




#### Create a HistGradientBoostingClassifier model for tuning

In [41]:
ohe_hgb_tune = RandomizedSearchCV(estimator = ohe_hgb_default_pipeline, 
                                  param_distributions = param_distributions, 
                                  n_iter = 100,
                                  cv = stratified_kfold,
                                  scoring = 'balanced_accuracy', 
                                  random_state = 1234,
                                  n_jobs = -1)

In [42]:
tic = time.time()

ohe_hgb_tune = ohe_hgb_tune.fit(X_train, y_train)

toc = time.time()
ohe_hgb_tune_time_taken = toc-tic

In [43]:
# Display pipeline
print("Time taken: ", ohe_hgb_tune_time_taken)
ohe_hgb_tune

Time taken:  63.43437719345093


In [44]:
# Predict using the model with the best parameters
y_ohe_hgb_tune_pred = ohe_hgb_tune.predict(X_test)

# Get the best parameters
ohe_hgb_tune_best_params = ohe_hgb_tune.best_params_
print(f'Best parameters: {ohe_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
ohe_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ohe_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ohe_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OneHotEncoding")
value_summary.append(ohe_hgb_tune_accuracy)
time_summary.append(ohe_hgb_tune_time_taken)
n_models_summary.append(ohe_hgb_tune.n_iter * ohe_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.2956586229446432, 'model__min_samples_leaf': 9}
Balanced accuracy with best parameters: 0.8811914519906323




### Count Encoder + HistGradientBoosting


In [45]:
from category_encoders.count import CountEncoder

#### Preprocessing

In [46]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", CountEncoder())
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("count_encoder", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("count_encoder", cat_pipeline_more9, cat_cols_more9),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [47]:
count_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])
count_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                        ('model', hgb_default9)])

In [48]:
tic = time.time()

count_hgb_default = count_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
count_hgb_default_time_taken = toc-tic

In [49]:
# Display pipeline
print("Time taken: ", count_hgb_default_time_taken)
count_hgb_default

Time taken:  0.7671315670013428


In [50]:
tic = time.time()

count_hgb_default9 = count_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
count_hgb_default9_time_taken = toc-tic

In [51]:
# Display pipeline
print("Time taken: ", count_hgb_default9_time_taken)
count_hgb_default9

Time taken:  0.870025634765625


In [52]:
# Calculate balanced accuracy for the model with default parameters
y_count_hgb_default_pred = count_hgb_default.predict(X_test)
count_hgb_default_accuracy = balanced_accuracy_score(y_test, y_count_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {count_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_default_accuracy)
time_summary.append(count_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8717524151053864


In [53]:
# Calculate balanced accuracy for the model with default parameters
y_count_hgb_default_pred9 = count_hgb_default9.predict(X_test)
count_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_count_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {count_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_default_accuracy9)
time_summary.append(count_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8707278249414521


#### Create a HistGradientBoostingClassifier model for tuning

In [54]:
count_hgb_tune = RandomizedSearchCV(estimator = count_hgb_default_pipeline, 
                                   param_distributions = param_distributions, 
                                   n_iter = 200,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

count_hgb_tune9 = RandomizedSearchCV(estimator = count_hgb_default_pipeline9, 
                                     param_distributions = param_distributions, 
                                     n_iter = 200,
                                     cv = stratified_kfold,
                                     scoring = 'balanced_accuracy', 
                                     random_state = 1234,
                                     n_jobs = -1)

In [55]:
tic = time.time()

count_hgb_tune = count_hgb_tune.fit(X_train, y_train)

toc = time.time()
count_hgb_tune_time_taken = toc-tic

In [56]:
# Display pipeline
print("Time taken: ", count_hgb_tune_time_taken)
count_hgb_tune

Time taken:  58.264779806137085


In [57]:
tic = time.time()

count_hgb_tune9 = count_hgb_tune9.fit(X_train, y_train)

toc = time.time()
count_hgb_tune9_time_taken = toc-tic

In [58]:
# Display pipeline
print("Time taken: ", count_hgb_tune9_time_taken)
count_hgb_tune9

Time taken:  59.459977865219116


In [59]:
# Predict using the model with the best parameters
y_count_hgb_tune_pred = count_hgb_tune.predict(X_test)

# Get the best parameters
count_hgb_tune_best_params = count_hgb_tune.best_params_
print(f'Best parameters: {count_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
count_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_count_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {count_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_tune_accuracy)
time_summary.append(count_hgb_tune_time_taken)
n_models_summary.append(count_hgb_tune.n_iter * count_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.16756338721861988, 'model__min_samples_leaf': 9}
Balanced accuracy with best parameters: 0.8730386416861826


In [60]:
# Predict using the model with the best parameters
y_count_hgb_tune_pred9 = count_hgb_tune9.predict(X_test)

# Get the best parameters
count_hgb_tune_best_params9 = count_hgb_tune9.best_params_
print(f'Best parameters: {count_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
count_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_count_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {count_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CountEncoding")
value_summary.append(count_hgb_tune_accuracy9)
time_summary.append(count_hgb_tune9_time_taken)
n_models_summary.append(count_hgb_tune9.n_iter * count_hgb_tune9.n_splits_)

Best parameters: {'model__learning_rate': 0.2669432371178712, 'model__min_samples_leaf': 7}
Balanced accuracy with best parameters: 0.8733917593676815


### Ordinal Encoding + HistGradientBoosting

In [61]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [62]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OrdinalEncoder(dtype = int,
                               handle_unknown = 'use_encoded_value',
                               unknown_value = 99999,
                               encoded_missing_value = 99999))
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("ordinal_encoder", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("ordinal_encoder", cat_pipeline_more9, cat_cols_more9),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9)],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [63]:
ordinal_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])

ordinal_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                       ('model', hgb_default9)])

In [64]:
tic = time.time()

ordinal_hgb_default = ordinal_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_default_time_taken = toc-tic

In [65]:
# Display pipeline
print("Time taken: ", ordinal_hgb_default_time_taken)
ordinal_hgb_default

Time taken:  0.6909303665161133


In [66]:
tic = time.time()

ordinal_hgb_default9 = ordinal_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_default9_time_taken = toc-tic

In [67]:
# Display pipeline
print("Time taken: ", ordinal_hgb_default9_time_taken)
ordinal_hgb_default9

Time taken:  2.219041585922241


In [68]:
# Calculate balanced accuracy for the model with default parameters
y_ordinal_hgb_default_pred = ordinal_hgb_default.predict(X_test)
ordinal_hgb_default_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {ordinal_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_default_accuracy)
time_summary.append(ordinal_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8710809426229509


In [69]:
# Calculate balanced accuracy for the model with default parameters
y_ordinal_hgb_default_pred9 = ordinal_hgb_default9.predict(X_test)
ordinal_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_ordinal_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {ordinal_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_default_accuracy9)
time_summary.append(ordinal_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8799967066744732


#### Create a HistGradientBoostingClassifier model for tuning

In [70]:
ordinal_hgb_tune = RandomizedSearchCV(estimator = ordinal_hgb_default_pipeline, 
                                   param_distributions = param_distributions, 
                                   n_iter = 200,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

ordinal_hgb_tune9 = RandomizedSearchCV(estimator = ordinal_hgb_default_pipeline9, 
                                    param_distributions = param_distributions, 
                                    n_iter = 200,
                                    cv = stratified_kfold,
                                    scoring = 'balanced_accuracy', 
                                    random_state = 1234,
                                    n_jobs = -1)

In [71]:
tic = time.time()

ordinal_hgb_tune = ordinal_hgb_tune.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_tune_time_taken = toc-tic

In [72]:
# Display pipeline
print("Time taken: ", ordinal_hgb_tune_time_taken)
ordinal_hgb_tune

Time taken:  46.68486833572388


In [73]:
tic = time.time()

ordinal_hgb_tune9 = ordinal_hgb_tune9.fit(X_train, y_train)

toc = time.time()
ordinal_hgb_tune9_time_taken = toc-tic

In [74]:
# Display pipeline
print("Time taken: ", ordinal_hgb_tune9_time_taken)
ordinal_hgb_tune9

Time taken:  51.843918323516846


In [75]:
# Predict using the model with the best parameters
y_ordinal_hgb_tune_pred = ordinal_hgb_tune.predict(X_test)

# Get the best parameters
ordinal_hgb_tune_best_params = ordinal_hgb_tune.best_params_
print(f'Best parameters: {ordinal_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
ordinal_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_ordinal_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {ordinal_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_tune_accuracy)
time_summary.append(ordinal_hgb_tune_time_taken)
n_models_summary.append(ordinal_hgb_tune.n_iter * ordinal_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.28520634060597605, 'model__min_samples_leaf': 5}
Balanced accuracy with best parameters: 0.87618193793911


In [76]:
# Predict using the model with the best parameters
y_ordinal_hgb_tune_pred9 = ordinal_hgb_tune9.predict(X_test)

# Get the best parameters
ordinal_hgb_tune_best_params9 = ordinal_hgb_tune9.best_params_
print(f'Best parameters: {ordinal_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
ordinal_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_ordinal_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {ordinal_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("OrdinalEncoder")
value_summary.append(ordinal_hgb_tune_accuracy9)
time_summary.append(ordinal_hgb_tune9_time_taken)
n_models_summary.append(ordinal_hgb_tune9.n_iter * ordinal_hgb_tune9.n_splits_)

Best parameters: {'model__learning_rate': 0.2444698195876226, 'model__min_samples_leaf': 7}
Balanced accuracy with best parameters: 0.8777298009367682


### Native HistGradientBoosting support for categorical variables


In [77]:
from sklearn.preprocessing import OrdinalEncoder

#### Preprocessing

In [78]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OrdinalEncoder(dtype = int,
                               handle_unknown = 'use_encoded_value',
                               unknown_value = 99999,
                               encoded_missing_value = 99999,
                               max_categories = 254))
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("ordinal_encoding", cat_pipeline_more9, cat_cols_more9)
                    ],
    sparse_threshold=0
)

In [79]:
category_features_for_nativesupport = [False]*len(num_cols) + [True]*len(cat_cols)

X_train_check = preprop_pipeline9.fit_transform(X_train)
category_features_for_nativesupport9 = [False]*(X_train_check.shape[1]-len(cat_cols_more9)) + [True]*len(cat_cols_more9)

In [80]:
hgb_default_categories_support = HistGradientBoostingClassifier(max_iter=1000, random_state=1234,
                                                                early_stopping=True,
                                                                scoring='balanced_accuracy',
                                                                validation_fraction=0.1,
                                                                n_iter_no_change=5,
                                                                class_weight='balanced',
                                                                categorical_features=category_features_for_nativesupport)

hgb_default_categories_support9 = HistGradientBoostingClassifier(max_iter=1000, random_state=1234,
                                                                 early_stopping=True,
                                                                 scoring='balanced_accuracy',
                                                                 validation_fraction=0.1,
                                                                 n_iter_no_change=5,
                                                                 class_weight='balanced',
                                                                 categorical_features=category_features_for_nativesupport9)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [81]:
catsup_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                        ('model', hgb_default_categories_support)])

catsup_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                        ('model', hgb_default_categories_support9)])

In [82]:
tic = time.time()

catsup_hgb_default = catsup_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
catsup_hgb_default_time_taken = toc-tic

In [83]:
# Display pipeline
print("Time taken: ", catsup_hgb_default_time_taken)
catsup_hgb_default

Time taken:  0.5931806564331055


In [84]:
tic = time.time()

catsup_hgb_default9 = catsup_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
catsup_hgb_default9_time_taken = toc-tic

In [85]:
# Display pipeline
print("Time taken: ", catsup_hgb_default9_time_taken)
catsup_hgb_default9

Time taken:  0.6893339157104492


In [86]:
# Calculate balanced accuracy for the model with default parameters
y_catsup_hgb_default_pred = catsup_hgb_default.predict(X_test)
catsup_hgb_default_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {catsup_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_default_accuracy)
time_summary.append(catsup_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8694415983606558


In [87]:
# Calculate balanced accuracy for the model with default parameters
y_catsup_hgb_default_pred9 = catsup_hgb_default9.predict(X_test)
catsup_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_catsup_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {catsup_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_default_accuracy9)
time_summary.append(catsup_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.871150468384075


#### Create a HistGradientBoostingClassifier model for tuning

In [88]:
catsup_hgb_tune = RandomizedSearchCV(estimator = catsup_hgb_default_pipeline, 
                                   param_distributions = param_distributions, 
                                   n_iter = 200,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

catsup_hgb_tune9 = RandomizedSearchCV(estimator = catsup_hgb_default_pipeline9, 
                                   param_distributions = param_distributions, 
                                   n_iter = 200,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

In [89]:
tic = time.time() 

catsup_hgb_tune = catsup_hgb_tune.fit(X_train, y_train)

toc = time.time()
catsup_hgb_tune_time_taken = toc-tic

In [90]:
# Display pipeline
print("Time taken: ", catsup_hgb_tune_time_taken)
catsup_hgb_tune

Time taken:  45.83363652229309


In [91]:
tic = time.time() 

catsup_hgb_tune9 = catsup_hgb_tune9.fit(X_train, y_train)

toc = time.time()
catsup_hgb_tune9_time_taken = toc-tic

In [92]:
# Display pipeline
print("Time taken: ", catsup_hgb_tune9_time_taken)
catsup_hgb_tune9

Time taken:  52.047276735305786


In [93]:
# Predict using the model with the best parameters
y_catsup_hgb_tune_pred = catsup_hgb_tune.predict(X_test)

# Get the best parameters
catsup_hgb_tune_best_params = catsup_hgb_tune.best_params_
print(f'Best parameters: {catsup_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catsup_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_catsup_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {catsup_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_tune_accuracy)
time_summary.append(catsup_hgb_tune_time_taken)
n_models_summary.append(catsup_hgb_tune.n_iter * catsup_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.20726553314734583, 'model__min_samples_leaf': 5}
Balanced accuracy with best parameters: 0.8758068647540984


In [94]:
# Predict using the model with the best parameters
y_catsup_hgb_tune_pred9 = catsup_hgb_tune9.predict(X_test)

# Get the best parameters
catsup_hgb_tune_best_params9 = catsup_hgb_tune9.best_params_
print(f'Best parameters: {catsup_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
catsup_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_catsup_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {catsup_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("HGB_NativeSupport")
value_summary.append(catsup_hgb_tune_accuracy9)
time_summary.append(catsup_hgb_tune9_time_taken)
n_models_summary.append(catsup_hgb_tune9.n_iter * catsup_hgb_tune9.n_splits_)

Best parameters: {'model__learning_rate': 0.14558572542745005, 'model__min_samples_leaf': 5}
Balanced accuracy with best parameters: 0.878322599531616


### Target Encoder (scikit-learn)

In [95]:
from sklearn.preprocessing import TargetEncoder

#### Preprocessing

In [96]:
cat_pipeline_more9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", TargetEncoder())
])

cat_pipeline_less9 = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat", cat_pipeline_more9, cat_cols)],
    sparse_threshold=0
)

preprop_pipeline9 = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("one_hot_encoding", cat_pipeline_less9, cat_cols_less9),
                    ("ordinal_encoding", cat_pipeline_more9, cat_cols_more9)
                    ],
    sparse_threshold=0
)

#### Create a HistGradientBoostingClassifier model with default parameters and early stopping

In [97]:
target_hgb_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                       ('model', hgb_default)])

target_hgb_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline9),
                                       ('model', hgb_default9)])

In [98]:
tic = time.time()

target_hgb_default = target_hgb_default_pipeline.fit(X_train, y_train)

toc = time.time()
target_hgb_default_time_taken = toc-tic

In [99]:
# Display pipeline
print("Time taken: ", target_hgb_default_time_taken)
target_hgb_default

Time taken:  1.3524248600006104


In [100]:
tic = time.time()

target_hgb_default9 = target_hgb_default_pipeline9.fit(X_train, y_train)

toc = time.time()
target_hgb_default9_time_taken = toc-tic

In [101]:
# Display pipeline
print("Time taken: ", target_hgb_default9_time_taken)
target_hgb_default9

Time taken:  0.8164570331573486


In [102]:
# Calculate balanced accuracy for the model with default parameters
y_target_hgb_default_pred = target_hgb_default.predict(X_test)
target_hgb_default_accuracy = balanced_accuracy_score(y_test, y_target_hgb_default_pred)
print(f'Balanced accuracy with default parameters: {target_hgb_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_default_accuracy)
time_summary.append(target_hgb_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8789373536299766


In [103]:
# Calculate balanced accuracy for the model with default parameters
y_target_hgb_default_pred9 = target_hgb_default9.predict(X_test)
target_hgb_default_accuracy9 = balanced_accuracy_score(y_test, y_target_hgb_default_pred9)
print(f'Balanced accuracy with default parameters: {target_hgb_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_default_accuracy9)
time_summary.append(target_hgb_default9_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8770144174473068


In [104]:
target_hgb_tune = RandomizedSearchCV(estimator = target_hgb_default_pipeline, 
                                     param_distributions = param_distributions, 
                                     n_iter = 200,
                                     cv = stratified_kfold,
                                     scoring = 'balanced_accuracy', 
                                     random_state = 1234,
                                     n_jobs = -1)

target_hgb_tune9 = RandomizedSearchCV(estimator = target_hgb_default_pipeline9, 
                                      param_distributions = param_distributions, 
                                      n_iter = 200,
                                      cv = stratified_kfold,
                                      scoring = 'balanced_accuracy', 
                                      random_state = 1234,
                                      n_jobs = -1)

In [105]:
tic = time.time() 

target_hgb_tune = target_hgb_tune.fit(X_train, y_train)

toc = time.time()
target_hgb_tune_time_taken = toc-tic

In [106]:
# Display pipeline
print("Time taken: ", target_hgb_tune_time_taken)
target_hgb_tune

Time taken:  41.34133720397949


In [107]:
tic = time.time() 

target_hgb_tune9 = target_hgb_tune9.fit(X_train, y_train)

toc = time.time()
target_hgb_tune9_time_taken = toc-tic

In [108]:
# Display pipeline
print("Time taken: ", target_hgb_tune9_time_taken)
target_hgb_tune9

Time taken:  52.79973220825195


In [109]:
# Predict using the model with the best parameters
y_target_hgb_tune_pred = target_hgb_tune.predict(X_test)

# Get the best parameters
target_hgb_tune_best_params = target_hgb_tune.best_params_
print(f'Best parameters: {target_hgb_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
target_hgb_tune_accuracy = balanced_accuracy_score(y_test, y_target_hgb_tune_pred)
print(f'Balanced accuracy with best parameters: {target_hgb_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_tune_accuracy)
time_summary.append(target_hgb_tune_time_taken)
n_models_summary.append(target_hgb_tune.n_iter * target_hgb_tune.n_splits_)

Best parameters: {'model__learning_rate': 0.20766204659446258, 'model__min_samples_leaf': 6}
Balanced accuracy with best parameters: 0.8814878512880562


In [110]:
# Predict using the model with the best parameters
y_target_hgb_tune_pred9 = target_hgb_tune9.predict(X_test)

# Get the best parameters
target_hgb_tune_best_params9 = target_hgb_tune9.best_params_
print(f'Best parameters: {target_hgb_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
target_hgb_tune_accuracy9 = balanced_accuracy_score(y_test, y_target_hgb_tune_pred9)
print(f'Balanced accuracy with best parameters: {target_hgb_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("TargetEncoder")
value_summary.append(target_hgb_tune_accuracy9)
time_summary.append(target_hgb_tune9_time_taken)
n_models_summary.append(target_hgb_tune9.n_iter * target_hgb_tune9.n_splits_)

Best parameters: {'model__learning_rate': 0.1824028794421748, 'model__min_samples_leaf': 7}
Balanced accuracy with best parameters: 0.8829350848946136


### CatBoost

In [111]:
from catboost import CatBoostClassifier

#### Preprocessing

In [112]:
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent"))
])

preprop_pipeline = ColumnTransformer(
    transformers = [("num", num_pipeline, num_cols),
                    ("cat_less9", cat_pipeline, cat_cols_less9),
                    ("cat_more9", cat_pipeline, cat_cols_more9)],
    sparse_threshold=0
)

In [113]:
category_features_for_catboostsupport = [index for index in range(len(num_cols), len(num_cols) + len(cat_cols))]
category_features_for_catboostsupport9 = [index for index in range(len(num_cols) + len(cat_cols_less9), len(num_cols) + len(cat_cols))]
print(category_features_for_catboostsupport)
print(category_features_for_catboostsupport9)

[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66]
[62, 63, 64, 65, 66]


Catboost allows to give a maximum value of unique categories for which a variable is encoded or not by One-Hot-Encoder.

In [114]:
# Create catboost models
catboost_default_raw = CatBoostClassifier(iterations=1000,
                                        eval_metric = 'BalancedAccuracy',
                                        loss_function = 'Logloss',
                                        auto_class_weights = 'Balanced',
                                        early_stopping_rounds=5,
                                        od_type='Iter',
                                        one_hot_max_size = 0,
                                        random_seed = 1234,
                                        verbose = False)

catboost_default9_raw = CatBoostClassifier(iterations=1000,
                                        eval_metric = 'BalancedAccuracy',
                                        loss_function = 'Logloss',
                                        auto_class_weights = 'Balanced',
                                        early_stopping_rounds=5,
                                        od_type='Iter',
                                        one_hot_max_size = 9,
                                        random_seed = 1234,
                                        verbose = False)


catboost_default_raw.set_params(cat_features=category_features_for_catboostsupport)
catboost_default9_raw.set_params(cat_features=category_features_for_catboostsupport9)


# Default CatBoostClassifier Pipeline
catboost_default_pipeline = Pipeline([('preprocessing', preprop_pipeline),
                                        ('model', catboost_default_raw)])

catboost_default_pipeline9 = Pipeline([('preprocessing', preprop_pipeline),
                                         ('model', catboost_default9_raw)])

# Define the hyperparameter search space
catboost_param_distributions = {
    'model__iterations': scipy.stats.randint(10, 1000),
    'model__depth': scipy.stats.randint(4,11),
    'model__learning_rate': scipy.stats.uniform(0.01, 0.3),
}

# Create a StratifiedKFold cross-validation instance
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

catboost_tune_raw = RandomizedSearchCV(estimator = catboost_default_pipeline, 
                                   param_distributions = catboost_param_distributions, 
                                   n_iter = 5,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)


catboost_tune9_raw = RandomizedSearchCV(estimator = catboost_default_pipeline, 
                                   param_distributions = catboost_param_distributions, 
                                   n_iter = 5,
                                   cv = stratified_kfold,
                                   scoring = 'balanced_accuracy', 
                                   random_state = 1234,
                                   n_jobs = -1)

In [115]:
tic = time.time()

catboost_default = catboost_default_pipeline.fit(X_train, y_train)

toc = time.time()
catboost_default_time_taken = toc-tic

In [116]:
# Display pipeline
print("Time taken: ", catboost_default_time_taken)
catboost_default

Time taken:  38.97459363937378


In [117]:
tic = time.time()

catboost_default9 = catboost_default_pipeline.fit(X_train, y_train)

toc = time.time()
catboost_default_time_taken9 = toc-tic

In [118]:
# Display pipeline
print("Time taken: ", catboost_default_time_taken9)
catboost_default9

Time taken:  39.02650856971741


In [119]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred = catboost_default.predict(X_test)
catboost_default_accuracy = balanced_accuracy_score(y_test, y_catboost_default_pred)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy}')

# Save results
default_summary.append("Default")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_default_accuracy)
time_summary.append(catboost_default_time_taken)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8898108167447307


In [120]:
# Calculate balanced accuracy for the model with default parameters
y_catboost_default_pred9 = catboost_default9.predict(X_test)
catboost_default_accuracy9 = balanced_accuracy_score(y_test, y_catboost_default_pred9)
print(f'Balanced accuracy with default parameters: {catboost_default_accuracy9}')

# Save results
default_summary.append("Default")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_default_accuracy9)
time_summary.append(catboost_default_time_taken9)
n_models_summary.append(1)

Balanced accuracy with default parameters: 0.8898108167447307


In [121]:
tic = time.time()

catboost_tune = catboost_tune_raw.fit(X_train, y_train)

toc = time.time()
catboost_tune_time_taken = toc-tic

In [122]:
# Display pipeline
print("Time taken: ", catboost_tune_time_taken)
catboost_tune

Time taken:  252.64670300483704


In [123]:
tic = time.time()

catboost_tune9 = catboost_tune9_raw.fit(X_train, y_train)

toc = time.time()
catboost_tune_time_taken9 = toc-tic

In [124]:
# Display pipeline
print("Time taken: ", catboost_tune_time_taken9)
catboost_tune9

Time taken:  258.6848657131195


In [125]:
# Predict using the model with the best parameters
y_catboost_tune_pred = catboost_tune.predict(X_test)

# Get the best parameters
catboost_tune_best_params = catboost_tune.best_params_
print(f'Best parameters: {catboost_tune_best_params}')

# Calculate balanced accuracy for the model with the best parameters
catboost_tune_accuracy = balanced_accuracy_score(y_test, y_catboost_tune_pred)
print(f'Balanced accuracy with best parameters: {catboost_tune_accuracy}')

# Save results
default_summary.append("Tune")
card_9_summary.append("AllVariables")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_tune_accuracy)
time_summary.append(catboost_tune_time_taken)
n_models_summary.append(catboost_tune.n_iter * catboost_tune.n_splits_)

Best parameters: {'model__depth': 5, 'model__iterations': 289, 'model__learning_rate': 0.05519108968182859}
Balanced accuracy with best parameters: 0.8883544350117096


In [126]:
# Predict using the model with the best parameters
y_catboost_tune_pred9 = catboost_tune9.predict(X_test)

# Get the best parameters
catboost_tune_best_params9 = catboost_tune9.best_params_
print(f'Best parameters: {catboost_tune_best_params9}')

# Calculate balanced accuracy for the model with the best parameters
catboost_tune_accuracy9 = balanced_accuracy_score(y_test, y_catboost_tune_pred9)
print(f'Balanced accuracy with best parameters: {catboost_tune_accuracy9}')

# Save results
default_summary.append("Tune")
card_9_summary.append("OnlyVariablesWithCard>9")
encoder_summary.append("CatboostNativeSupport")
value_summary.append(catboost_tune_accuracy9)
time_summary.append(catboost_tune_time_taken9)
n_models_summary.append(catboost_tune9.n_iter * catboost_tune9.n_splits_)

Best parameters: {'model__depth': 5, 'model__iterations': 289, 'model__learning_rate': 0.05519108968182859}
Balanced accuracy with best parameters: 0.8883544350117096


### Results Summary

In [127]:
results_summary = pd.DataFrame({"Dataset":"kdd_internet_usage",
                                "Variables":card_9_summary,
                                "Default/Tune":default_summary,
                                "Encoder":encoder_summary,
                                "Metric":"BalancedAccuracy",
                                "Value":value_summary,
                                "Time":time_summary,
                                "n_Models":n_models_summary})
results_summary["mean_Time"] = (results_summary["Time"] / results_summary["n_Models"])
results_summary

Unnamed: 0,Dataset,Variables,Default/Tune,Encoder,Metric,Value,Time,n_Models,mean_Time
0,kdd_internet_usage,AllVariables,Default,OneHotEncoding,BalancedAccuracy,0.864101,1.390631,1,1.390631
1,kdd_internet_usage,AllVariables,Tune,OneHotEncoding,BalancedAccuracy,0.881191,63.434377,300,0.211448
2,kdd_internet_usage,AllVariables,Default,CountEncoding,BalancedAccuracy,0.871752,0.767132,1,0.767132
3,kdd_internet_usage,OnlyVariablesWithCard>9,Default,CountEncoding,BalancedAccuracy,0.870728,0.870026,1,0.870026
4,kdd_internet_usage,AllVariables,Tune,CountEncoding,BalancedAccuracy,0.873039,58.26478,600,0.097108
5,kdd_internet_usage,OnlyVariablesWithCard>9,Tune,CountEncoding,BalancedAccuracy,0.873392,59.459978,600,0.0991
6,kdd_internet_usage,AllVariables,Default,OrdinalEncoder,BalancedAccuracy,0.871081,0.69093,1,0.69093
7,kdd_internet_usage,OnlyVariablesWithCard>9,Default,OrdinalEncoder,BalancedAccuracy,0.879997,2.219042,1,2.219042
8,kdd_internet_usage,AllVariables,Tune,OrdinalEncoder,BalancedAccuracy,0.876182,46.684868,600,0.077808
9,kdd_internet_usage,OnlyVariablesWithCard>9,Tune,OrdinalEncoder,BalancedAccuracy,0.87773,51.843918,600,0.086407


In [128]:
results_summary.to_csv("kdd_internet_usage_results.csv")