In [33]:
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
import warnings

In [9]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
productivity_prediction_of_garment_employees = fetch_ucirepo(id=597) 
  
# data (as pandas dataframes) 
X = productivity_prediction_of_garment_employees.data.features 
y = productivity_prediction_of_garment_employees.data.targets 
X.drop(['date', 'wip'], axis=1, inplace=True)
X.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(['date', 'wip'], axis=1, inplace=True)


Index(['quarter', 'department', 'day', 'team', 'targeted_productivity', 'smv',
       'over_time', 'incentive', 'idle_time', 'idle_men', 'no_of_style_change',
       'no_of_workers'],
      dtype='object')

### XGBoost

In [29]:
xgbm = XGBRegressor(random_state=24)

In [37]:
ohe = OneHotEncoder(handle_unknown='ignore')
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False)
pipe = Pipeline([('CT',ct),('MODEL',xgbm)])
params = {'MODEL__n_estimators':[10,50], 'MODEL__max_depth':[2,3,4], 
          'MODEL__learning_rate':np.linspace(0.001,1,5) }
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.005 total time=   0.1s
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=-0.006 total time=   0.0s
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.004 total time=   0.0s
[CV 4/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=-0.002 total time=   0.0s
[CV 5/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.003 total time=   0.0s
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.025 total time=   0.0s
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.010 total time=   0.0s
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.024 total time=   0.0s


In [39]:
print(gcv.best_params_)
print(gcv.best_score_)

{'MODEL__learning_rate': 0.25075, 'MODEL__max_depth': 3, 'MODEL__n_estimators': 50}
0.49053003787994387


### Light GBM

In [50]:
from lightgbm import LGBMRegressor

In [52]:
lgbm = LGBMRegressor(random_state=24)
ohe = OneHotEncoder(handle_unknown='ignore')
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False)
pipe = Pipeline([('CT',ct),('MODEL',lgbm)])
params = {'MODEL__n_estimators':[10,50], 'MODEL__max_depth':[2,3,4], 
          'MODEL__learning_rate':np.linspace(0.001,1,5) }
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.005 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimator

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 5/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.003 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.025 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END MODEL__lear

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=3, MODEL__n_estimators=50;, score=0.017 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.735991
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=3, MODEL__n_estimators=50;, score=0.035 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=4, MODEL__n_estimators=50;, score=0.020 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.735991
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=4, MODEL__n_estimat

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END MODEL__learning_rate=0.25075, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.568 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END MODEL__learning_rate=0.25075, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.445 total time=   0.0s
[LightGBM] [Info] Au

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END MODEL__learning_rate=0.25075, MODEL__max_depth=4, MODEL__n_estimators=10;, score=0.573 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END MODEL__learning_rate=0.25075, MODEL__max_depth=4, MODEL__n_estimators=10;, score=0.441 total time=   0.0s
[LightGBM] [Info] Au

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 5/5] END MODEL__learning_rate=0.5005, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.414 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END MODEL__learning_rate=0.5005, MODEL__max_depth=3, MODEL__n_estimators=10;, score=0.571 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 4/5] END MODEL__learning_rate=0.5005, MODEL__max_depth=3, MODEL__n_estimators=50;, score=0.488 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 254
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.734297
[CV 5/5] END MODEL__learning_rate=0.5005, MODEL__max_depth=3, MODEL__n_estimators=50;, score=0.370 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END MODEL__learning_rate=0.5005, MODEL__max_depth=4, MODEL__n_estimators=10;,

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 246
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.737910
[CV 4/5] END MODEL__learning_rate=0.5005, MODEL__max_depth=4, MODEL__n_estimators=50;, score=0.445 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 254
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.734297
[CV 5/5] END MODEL__learning_rate=0.5005, MODEL__max_depth=4, MODEL__n_estimators=50;, score=0.349 total time=   0.0s
[LightGBM] [Info] Auto

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 246
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.737910
[CV 4/5] END MODEL__learning_rate=0.75025, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.526 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 254
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.734297
[CV 5/5] END MODEL__learning_rate=0.75025, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.403 total time=   0.0s
[LightGBM] [Info] Au

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 3/5] END MODEL__learning_rate=0.75025, MODEL__max_depth=4, MODEL__n_estimators=50;, score=0.423 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 246
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.737910
[CV 4/5] END MODEL__learning_rate=0.75025, MODEL__max_depth=4, MODEL__n_estimators=50;, score=0.365 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 254
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.734297
[CV 5/5] END MODEL__

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 2/5] END MODEL__learning_rate=1.0, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.378 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.735991
[CV 3/5] END MODEL__learning_rate=1.0, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.418 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 246
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 2/5] END MODEL__learning_rate=1.0, MODEL__max_depth=3, MODEL__n_estimators=50;, score=0.342 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.735991
[CV 3/5] END MODEL__learning_rate=1.0, MODEL__max_depth=3, MODEL__n_estimators=50;, score=0.429 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 246
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.737910
[CV 4/5] END MODEL__learning

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 1/5] END MODEL__learning_rate=1.0, MODEL__max_depth=4, MODEL__n_estimators=50;, score=0.560 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END MODEL__learning_rate=1.0, MODEL__max_depth=4, MODEL__n_estimators=50;, score=0.288 total time=   0.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 20
[LightGBM] [Info] Start training from score 0.735991
[CV 3/5] END MODEL__learning

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [54]:
print(gcv.best_params_)
print(gcv.best_score_)

{'MODEL__learning_rate': 0.25075, 'MODEL__max_depth': 4, 'MODEL__n_estimators': 50}
0.49776817407312857


### Cat Boost

In [59]:
from catboost import CatBoostRegressor
cgbm = CatBoostRegressor(random_state=24)
ohe = OneHotEncoder(handle_unknown='ignore')
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False)
pipe = Pipeline([('CT',ct),('MODEL',cgbm)])
params = {'MODEL__n_estimators':[10,50], 'MODEL__max_depth':[2,3,4], 
          'MODEL__learning_rate':np.linspace(0.001,1,5) }
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 0.1734899	total: 140ms	remaining: 1.26s
1:	learn: 0.1734589	total: 141ms	remaining: 563ms
2:	learn: 0.1734299	total: 141ms	remaining: 330ms
3:	learn: 0.1733990	total: 142ms	remaining: 214ms
4:	learn: 0.1733682	total: 143ms	remaining: 143ms
5:	learn: 0.1733460	total: 144ms	remaining: 95.7ms
6:	learn: 0.1733126	total: 144ms	remaining: 61.7ms
7:	learn: 0.1732836	total: 144ms	remaining: 36.1ms
8:	learn: 0.1732626	total: 145ms	remaining: 16.1ms
9:	learn: 0.1732291	total: 145ms	remaining: 0us
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.004 total time=   0.1s
0:	learn: 0.1743398	total: 585us	remaining: 5.27ms
1:	learn: 0.1743015	total: 981us	remaining: 3.92ms
2:	learn: 0.1742623	total: 1.34ms	remaining: 3.14ms
3:	learn: 0.1742251	total: 1.7ms	remaining: 2.56ms
4:	learn: 0.1741869	total: 2.07ms	remaining: 2.07ms
5:	learn: 0.1741598	total: 2.5ms	remaining: 1.67ms
6:	learn: 0

In [60]:
print(gcv.best_params_)
print(gcv.best_score_)

{'MODEL__learning_rate': 0.5005, 'MODEL__max_depth': 2, 'MODEL__n_estimators': 50}
0.49363156435038286


### Catboost w/o One Hot Encoding

In [68]:
from catboost import CatBoostRegressor
cgbm = CatBoostRegressor(random_state=24, cat_features=list(X.columns[X.dtypes==object]),
                        verbose=False)
params = {'n_estimators':[10,50], 'max_depth':[2,3,4], 'learning_rate':np.linspace(0.001,1,5) }
gcv = GridSearchCV(cgbm, param_grid=params, cv=kfold, scoring='r2',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.004 total time=   0.1s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=-0.007 total time=   0.1s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.003 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=-0.002 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.002 total time=   0.0s
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.020 total time=   0.5s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.007 total time=   0.5s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.018 total time=   0.5s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.012 total time=   0.6s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0

In [70]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.5005, 'max_depth': 4, 'n_estimators': 50}
0.49768646968713837
