# Dataset Preprocessing Sweep
A template notebook that runs a simple 5-fold cross-validated XGBoost model on a variety of dataset permutations (categorical encoders, feature selection, K-Means distance features, PCA), with Optuna choosing which permutation to try next.

# Setup

In [1]:
# Manual run-time switches (these were previously kept in a separate config).
USE_GPU = True   # selects the GPU builds in the Colab install cell
COLAB = False    # True when the notebook is executed on Google Colab
libraries = [
    'xgboost',
    'lightgbm',
    'catboost',
    'widedeep-SAINT',
]

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch off the Jedi-based tab completer in IPython.
%config Completer.use_jedi = False
# Name under which Weights & Biases records this notebook; carries today's date as YYYYMMDD.
os.environ['WANDB_NOTEBOOK_NAME'] = f"dataset_sweep_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [5]:
# Google Colab only: install/upgrade the third-party packages this notebook needs.
# Nothing in this cell runs when COLAB is False.
# NOTE(review): none of the installs are version-pinned, so results depend on whatever is
# current at install time. `category_encoders`, `boruta` and `BorutaShap` are imported
# further down but are not installed here -- confirm they are available on Colab.
if COLAB:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets (currently disabled)
#     !pip install --upgrade -q kaggle

    # Weights & Biases experiment tracking
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade scikit-learn to the latest release
    !pip install --upgrade scikit-learn

    # category_encoders install is disabled even though the imports cell uses it
#     !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if USE_GPU: 
            # Build XGBoost from source with CUDA support.
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            # the %cd lines change the notebook's working directory for all later cells;
            # presumably the clone above landed in /content -- TODO confirm
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            # print the GPU status as a sanity check
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            # CPU-only: the stock PyPI package is enough
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if USE_GPU:
            # Build LightGBM from source with GPU support.
            # The clone goes into the current working directory, which the XGBoost branch
            # above may have changed via %cd.
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            # CPU-only: the stock PyPI package is enough
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from sklearn.utils import resample
import seaborn as sns

# from catboost import CatBoostClassifier
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.preprocessing import StandardScaler MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from boruta import BorutaPy
from BorutaShap import BorutaShap
import category_encoders as ce

In [28]:
# Resolve every directory the notebook reads from or writes to.
# Both branches define the same set of names (root, datapath, edapath, modelpath, predpath,
# subpath, altdatapath, studypath) so that later cells work on Colab and locally.
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # handling datapath (unchanged: the datasets sit directly in this Drive folder)
    root = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/oct2021/')
    datapath = root
    # NOTE(review): the locations below were previously undefined on Colab (NameError in the
    # data-loading cell); they are assumed to live under the same Drive folder -- confirm.
    edapath = root/'EDA'
    modelpath = root/'models'
    predpath = root/'preds'
    subpath = root/'submissions'
    altdatapath = root/'alt_datasets'
    studypath = root/'optuna_studies'

else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')
    root = Path('/home/sf/code/kaggle/tabular_playgrounds/oct2021/')
    datapath = root/'datasets'
    edapath = root/'EDA'
    modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    altdatapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/alt_datasets/')
    studypath = root/'optuna_studies'

# Create the output directories up front. `studypath` is included because the sweep loop
# dumps a study checkpoint there after every trial; `parents=True` lets a fresh checkout
# work without manual setup. `altdatapath` is an input location and is not created.
for pth in [root, datapath, edapath, modelpath, predpath, subpath, studypath]:
    pth.mkdir(parents=True, exist_ok=True)
    


In [8]:
SEED = 42


def seed_everything(seed):
    """Seed the stdlib and NumPy global RNGs and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to `random.seed` and `np.random.seed`; its string form is
        written to the `PYTHONHASHSEED` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(seed=SEED)

## Ex-Model Config

In [9]:
# Settings for preprocessing and cross-validation only -- model hyperparameters are NOT kept here.
exmodel_config = dict(
    # disabled options, kept for reference:
    #   feature_selector=SelectKBest,
    #   k_best=80,
    #   feature_selection_scoring=f_regression,
    #   random_state=SEED,
    #   feature_generation=['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
    #   subsample=1,
    cross_val_strategy=KFold,  # None for holdout, or the relevant sklearn class
    kfolds=5,  # a value of 1 means plain holdout
    test_size=0.2,
    #   features_created=False,
    #   feature_creator=None,
)

## Data Loading

In [10]:
# Read the complete training set; the feature/target correlations are needed later to pick
# the inputs for K-Means clustering.
df = pd.read_feather(path=datapath / 'train.feather')
# The correlation row was computed once and cached instead of being recomputed each run:
#   df_corr = df.corr()
#   corr_target = df_corr.loc['target':'target']   # 1-row DataFrame (df_corr.loc['target'] would be a Series)
# joblib.load unpickles the file, so only point it at files produced by this project.
corr_target = load(filename=altdatapath / 'corr_target.joblib')

In [11]:
# Preview the cached correlation table (the output below shows a single 'target' row).
corr_target.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
target,0.004067,-0.029324,-0.015663,0.036279,0.019811,-0.012301,-0.012332,0.013528,-0.043557,-0.002662,...,-0.00329,-0.003869,-0.004477,-0.004503,-0.004319,-0.004587,-0.002426,-0.005901,-0.0037,1.0


In [12]:
# Rank the features by the magnitude of their correlation with the target.
corr_target_x = corr_target.drop('target', axis=1) # dropping the trivial 1.00 autocorrelation
corr_target_abs = abs(corr_target_x) # just interested in magnitudes here
corr_sorted = corr_target_abs.sort_values(by='target', axis=1, ascending=False) # columns ordered by |correlation| with target, strongest first

# Split the frame into target and predictors, then release the combined frame.
y = df.target # pulling out the dependent variable
X = df.drop('target', axis=1) # isolating the independent variables
del df # cleaning up memory

# Columns treated as high-cardinality categoricals: more than 2 distinct values (already
# binary-encoded variables are left alone) and at most 100,000 of them. That upper bound is
# the same cut-off as the former `(1000000 - nunique) / 1000000 >= 0.9` test, but the
# distinct-value counts are now computed once per column instead of twice.
categoricals = [f for f, n_unique in X.nunique().items() if 2 < n_unique <= 100_000]

## Parameters

In [13]:
# XGBoost hyperparameters found by Optuna on 2021-10-04 (through 106 trials, unaltered original dataset).
params = dict(
    n_estimators=3878,
    max_depth=4,
    learning_rate=0.024785857161974977,
    reg_alpha=26.867682044658245,
    reg_lambda=10.839759074147148,
    subsample=0.8208581489835881,
    min_child_weight=8.829122644339664,
    colsample_bytree=0.906420714280384,
    gamma=1.472322916021486,
)

In [14]:
# b = load(altdatapath/'X_boruta_200iter_filtered_green.joblib')
# type(b)

In [15]:
# bdf = pd.DataFrame(b, index=X.index).join(y)
# bdf.head()

(Following cells generate the correlations for the different feature selections, so that the process need not be repeated each iteration.)

In [16]:
# b = load(altdatapath/'X_boruta_200iter_filtered_green.joblib')
# bdf = pd.DataFrame(b, index=X.index).join(y)
# bdf.head()
# bdf_corr = bdf.corr()
# bdf_corr_target = bdf_corr.loc['target':'target']
# bdf_corr_target_x = bdf_corr_target.drop('target', axis=1) # dropping the trivial 1.00 autocorrelation
# bdf_corr_target_abs = abs(bdf_corr_target_x) # just interested in magnitudes here
# bdf_corr_sorted = bdf_corr_target_abs.sort_values(by='target', axis=1, ascending=False)
# dump(bdf_corr_sorted, altdatapath/'X_boruta_200iter_filtered_green_corr_sorted.joblib')

In [17]:
# del b, bdf, bdf_corr_target, bdf_corr_target_x, bdf_corr_target_abs, bdf_corr_sorted

In [18]:
# b = pd.read_feather(altdatapath/'X_boruta_shap_200trials.feather')
# bdf = b.join(y)
# # bdf.head()
# bdf_corr = bdf.corr()
# bdf_corr_target = bdf_corr.loc['target':'target']
# bdf_corr_target_x = bdf_corr_target.drop('target', axis=1) # dropping the trivial 1.00 autocorrelation
# bdf_corr_target_abs = abs(bdf_corr_target_x) # just interested in magnitudes here
# bdf_corr_sorted = bdf_corr_target_abs.sort_values(by='target', axis=1, ascending=False)
# dump(bdf_corr_sorted, altdatapath/'X_boruta_shap_200trials_corr_sorted.joblib')

In [30]:
# originally from https://www.kaggle.com/satorushibata/optimize-catboost-hyperparameter-with-optuna-gpu

def _high_cardinality_columns(frame):
    """Return the columns of `frame` with more than 2 and at most 100,000 distinct values.

    Same cut-off as the `(1000000 - nunique) / 1000000 >= 0.9 and nunique > 2` expression used
    elsewhere in the notebook, with the distinct-value counts computed once per column.
    """
    return [col for col, n_unique in frame.nunique().items() if 2 < n_unique <= 100_000]


def _add_kmeans_features(X_train, X_valid, useful_features, k_means_method, k_means_clusters):
    """Append K-Means cluster-distance columns to one fold's training and validation frames.

    The clusterer is fitted on `X_train[useful_features]` only and then applied to the
    validation rows. Returns the pair `(X_train, X_valid)` with `cluster1..clusterK` joined on.
    """
    cluster_cols = [f"cluster{i+1}" for i in range(k_means_clusters)]
    # `n_jobs` is no longer passed: scikit-learn deprecated it for KMeans in 0.23 and removed
    # it in 1.0, where passing it raises a TypeError.
    if k_means_method == 'k-means++':
        kmeans = KMeans(n_clusters=k_means_clusters, init="k-means++", max_iter=1000, random_state=SEED)
    else:
        # any other (integer) choice is used as the number of initialisations
        kmeans = KMeans(n_clusters=k_means_clusters, n_init=k_means_method, max_iter=1000, random_state=SEED)
    # fit on the training set only
    train_clusters = kmeans.fit_transform(X_train[useful_features])
    valid_clusters = kmeans.transform(X_valid[useful_features])
    # convert numpy.ndarrays back to properly-labeled pandas.DataFrames
    train_clusters = pd.DataFrame(train_clusters, columns=cluster_cols, index=X_train.index)
    valid_clusters = pd.DataFrame(valid_clusters, columns=cluster_cols, index=X_valid.index)
    # join the cluster-distance features to the training and validation sets
    return X_train.join(train_clusters), X_valid.join(valid_clusters)


def objective(trial, X=X, y=y, categoricals=categoricals, corr_sorted=corr_sorted):
    """Optuna objective: score one preprocessing recipe with 5-fold cross-validated XGBoost.

    The trial chooses a categorical encoder (or none), whether encoding happens before or
    after K-Means feature generation, an optional pre-computed feature selection
    (BorutaShap / Boruta), the K-Means setup and an optional PCA step. The model
    hyperparameters come from the module-level `params` dict and are not tuned here.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the preprocessing choices.
    X, y : predictors and target; default to the notebook-level `X` and `y` as they were
        when this cell was executed.
    categoricals : list
        Columns handed to the category encoders; recomputed when a feature-selected dataset
        is swapped in.
    corr_sorted : pandas.DataFrame
        Features ordered by |correlation with target|; the first `k_means_clusters` columns
        are the inputs to K-Means.

    Returns
    -------
    float
        ROC AUC over the pooled out-of-fold predictions.
    """
    encoder_name = trial.suggest_categorical('encoder_name', ['woe', 'catboost', 'james-stein', 'loo', 'mestimate', 'target', 'hashing', None])
    if encoder_name:
        encode_before_kmeans = trial.suggest_categorical('encode_before_kmeans', [True, False]) # determines order

    # feature selection setup -- applied before preprocessing
    feature_selection = trial.suggest_categorical('feature_selection', ['BorutaShap', 'Boruta', None])
    k_means_method = trial.suggest_categorical('k_means_method', [25, 50, 100, 'k-means++', None]) # K-Means initialization method

    # switch datasets if feature selection is requested, together with the matching
    # pre-computed correlation ranking when K-Means features will be generated
    if feature_selection == 'BorutaShap':
        X = pd.read_feather(altdatapath/'X_boruta_shap_200trials.feather') # :: pd.DataFrame
        categoricals = _high_cardinality_columns(X) # not touching already binary encoded vars
        if k_means_method:
            corr_sorted = load(altdatapath/'X_boruta_shap_200trials_corr_sorted.joblib') # load prepared correlations
    elif feature_selection == 'Boruta':
        X = pd.DataFrame(load(altdatapath/'X_boruta_200iter_filtered_green.joblib'), index=X.index)
        categoricals = _high_cardinality_columns(X) # not touching already binary encoded vars
        if k_means_method:
            corr_sorted = load(altdatapath/'X_boruta_200iter_filtered_green_corr_sorted.joblib') # load prepared correlations

    if k_means_method:
        # the number of clusters doubles as the number of top-correlated features fed to K-Means
        k_means_clusters = trial.suggest_int('k_means_clusters', 6, 12)
        useful_features = list(corr_sorted.columns[:k_means_clusters])

    # encoder implementations by name; only the one chosen for this trial is instantiated
    encoder_classes = {
        'woe': ce.WOEEncoder,
        'catboost': ce.CatBoostEncoder,
        'james-stein': ce.JamesSteinEncoder,
        'loo': ce.LeaveOneOutEncoder,
        'mestimate': ce.MEstimateEncoder,
        'target': ce.TargetEncoder,
        'hashing': ce.HashingEncoder,
    }
    encoder = encoder_classes[encoder_name](cols=categoricals) if encoder_name else None
    # encode first when there is no clustering at all, or when the trial asked for that order
    encode_first = encoder is not None and (not k_means_method or encode_before_kmeans)

    # PCA dimensionality reduction setup -- applied at end of preprocessing.
    # 'NO' skips PCA entirely; None still runs PCA with n_components=None.
    pca_components = trial.suggest_categorical('pca_components', [50, 75, 'mle', None, 'NO'])

    # define k-fold splitter
    kfold = KFold(n_splits=5, shuffle=False)

    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []

    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        print(f"FOLD {fold}")
        print("---------------------------------------------------")

        # KFold yields positions, so index a pandas.Series positionally; plain slicing is
        # kept for numpy.ndarray targets
        if hasattr(y, 'iloc'):
            y_train, y_valid = y.iloc[train_ids], y.iloc[valid_ids]
        else:
            y_train, y_valid = y[train_ids], y[valid_ids]
        # category_encoders expects pandas.DataFrames
        X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:]

        # preprocessing: every transformer is fitted on the training fold only
        if encoder is not None and encode_first:
            X_train = encoder.fit_transform(X_train, y_train)
            X_valid = encoder.transform(X_valid)

        if k_means_method:
            X_train, X_valid = _add_kmeans_features(X_train, X_valid, useful_features, k_means_method, k_means_clusters)

        if encoder is not None and not encode_first:
            X_train = encoder.fit_transform(X_train, y_train)
            X_valid = encoder.transform(X_valid)

        # now, PCA dimensionality reduction
        if pca_components != 'NO':
            pca = PCA(n_components=pca_components, random_state=SEED)
            X_train = pca.fit_transform(X_train)
            X_valid = pca.transform(X_valid)

        # define models
        model = XGBClassifier(
            booster='gbtree',
            tree_method='gpu_hist' if USE_GPU else 'hist', # honour the notebook-level GPU switch
            random_state=SEED,
            n_jobs=-1,
            verbosity=1,
            objective='binary:logistic',
            **params)
        model.fit(X_train, y_train)
        y_valid_preds = model.predict_proba(X_valid)[:,1]

        # add the fold-model's OOF preds and ground truths to the out-of-loop lists
        oof_preds.extend(y_valid_preds)
        oof_y.extend(y_valid)

        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for is {model_valid_auc}")

    return model_valid_auc

In [20]:
# Run-level configuration (not model hyperparameters); in this sweep version it is what
# gets logged to Weights & Biases as the run config.
exmodel_config = dict(
    # model config
    library='xgboost',
    # disabled options, kept for reference:
    #   model=XGBClassifier,
    #   n_estimators=100,
    #   max_depth=3,
    #   learning_rate=0.1,
    #   test_size=0.2,
    #   reg_lambda=None,
    #   scaler="sklearn.preprocessing.StandardScaler()",  # TODO: experiment with others (but imputation may be slow)
    #   scale_b4_impute=False,
    #   imputer="sklearn.impute.SimpleImputer(strategy='median', add_indicator=True)",
    #   knn_imputer_n_neighbors=None,  # None if a different imputer is used
    #   feature_selector=SelectKBest,
    #   k_best=80,
    #   feature_selection_scoring=f_regression,
    random_state=SEED,
    optuna=True,
    #   optuna_trials=50,
    #   subsample=1,
    #   cross_val_strategy=None,  # None for holdout, or the relevant sklearn class
    #   kfolds=1,  # if 1, that means just doing holdout
    #   test_size=0.2,
    # XGBoost defaults (my choice), also disabled:
    #   tree_method="auto",  # set to 'gpu_hist' to try GPU if available
    #   booster='gbtree',  # dart may be marginally better, but will opt for this quicker approach as a default
    #   n_estimators=200,
    #   max_depth=3,
    #   learning_rate=0.1,
    #   n_jobs=-1,
    #   verbosity=1,
    #   subsample=1,
    #   features_created=False,
    #   feature_creator=None,
)

# Keyword arguments for the Weights & Biases run.
wandb_kwargs = dict(
    # run name = notebook file name minus its last six characters (the ".ipynb" extension),
    # followed by the current time as HHMMSS
    name="{}_{}".format(os.environ['WANDB_NOTEBOOK_NAME'][:-6], datetime.now().strftime('%H%M%S')),
    project='202110_Kaggle_tabular_playground',
    tags=['sweep'],
    notes="Sweep for preprocessing techniques on dataset",
    config=exmodel_config,
)

In [21]:
# Optuna callback configured with the W&B run settings above; it is passed to
# `study.optimize` in the sweep loop.
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


In [22]:
# Resume the sweep from the newest checkpoint written by the sweep loop, or start a fresh
# study when none exists. (A bare `load()` raises TypeError: joblib needs a filename.)
study_checkpoints = sorted(
    studypath.glob('optuna_dataset_study_trial*_20211026.joblib'),
    # order by the trial number embedded in the file name, not alphabetically
    key=lambda ckpt: int(ckpt.stem.split('trial')[1].split('_')[0]),
)
if study_checkpoints:
    # joblib.load unpickles the file -- only load checkpoints produced by this notebook
    study = load(study_checkpoints[-1])
else:
    study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed=int(SEED)), study_name='dataset_20211026')

[32m[I 2021-10-26 12:44:50,241][0m A new study created in memory with name: dataset_20211026[0m


In [31]:
# Run the sweep one trial at a time, checkpointing the study after every trial so that an
# interrupted run can be resumed. The stopping point and the checkpoint label are derived
# from the study itself rather than from a hand-edited start index, so re-running this
# cell neither repeats work nor mislabels checkpoints.
TOTAL_TRIALS = 100
while len(study.trials) < TOTAL_TRIALS:
    study.optimize(objective, n_trials = 1, callbacks = [wandbc]) #n_jobs = multiprocessing.cpu_count())
    last_trial_number = study.trials[-1].number
    dump(study, filename=studypath/f'optuna_dataset_study_trial{last_trial_number}_20211026.joblib')

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.7517760922401533
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.7539213843954606
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.7590118210311512
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.7539179052878628
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.7523735217328041


[32m[I 2021-10-26 16:48:30,187][0m Trial 12 finished with value: 0.7542908472902581 and parameters: {'encoder_name': 'target', 'encode_before_kmeans': False, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.7542908472902581
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8460498790948396
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8487617952918354
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8490002113547026
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8448842279545304
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8438420555278117


[32m[I 2021-10-26 17:10:11,751][0m Trial 13 finished with value: 0.846609851097209 and parameters: {'encoder_name': 'loo', 'encode_before_kmeans': False, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.846609851097209
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8468150822382937
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8491576487090565
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.849561538223476
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8453452025717385
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8444498070703212


[32m[I 2021-10-26 17:31:29,665][0m Trial 14 finished with value: 0.8471726725487676 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': 'mle'}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8471726725487676
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8463829433197478
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8486140766450826
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8488299945892275
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8447333654639602
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8437706457144185


[32m[I 2021-10-26 17:54:21,197][0m Trial 15 finished with value: 0.8465627881689273 and parameters: {'encoder_name': 'catboost', 'encode_before_kmeans': False, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8465627881689273
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 18:15:04,499][0m Trial 16 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8461843758043033
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8487207689546775
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8489651958254386
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8449334972530584
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8439892698546787


[32m[I 2021-10-26 18:42:14,352][0m Trial 17 finished with value: 0.8466567050492937 and parameters: {'encoder_name': 'hashing', 'encode_before_kmeans': False, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8466567050492937
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 19:03:00,407][0m Trial 18 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 19:23:47,289][0m Trial 19 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 19:44:33,812][0m Trial 20 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 20:05:18,953][0m Trial 21 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 20:26:01,234][0m Trial 22 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 20:46:43,532][0m Trial 23 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8468150822382937
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8491576487090565
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.849561538223476
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8453452025717385
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8444498070703212


[32m[I 2021-10-26 21:07:59,564][0m Trial 24 finished with value: 0.8471726725487676 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': 'mle'}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8471726725487676
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 21:29:39,342][0m Trial 25 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8463829433197478
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8486140766450826
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8488299945892275
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8447333654639602
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8437706457144185


[32m[I 2021-10-26 21:53:05,025][0m Trial 26 finished with value: 0.8465627881689273 and parameters: {'encoder_name': 'catboost', 'encode_before_kmeans': False, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8465627881689273
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 22:14:16,630][0m Trial 27 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 22:34:54,607][0m Trial 28 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 22:55:33,227][0m Trial 29 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 23:16:10,713][0m Trial 30 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 23:36:47,343][0m Trial 31 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-26 23:57:25,473][0m Trial 32 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 00:18:03,317][0m Trial 33 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8468150822382937
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8491576487090565
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.849561538223476
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8453452025717385
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8444498070703212


[32m[I 2021-10-27 00:39:05,279][0m Trial 34 finished with value: 0.8471726725487676 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': 'mle'}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8471726725487676
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8468150822382937
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8491576487090565
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.849561538223476
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8453452025717385
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8444498070703212


[32m[I 2021-10-27 01:00:05,868][0m Trial 35 finished with value: 0.8471726725487676 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': 'mle'}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8471726725487676
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8468150822382937
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8491576487090565
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.849561538223476
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8453452025717385
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8444498070703212


[32m[I 2021-10-27 01:21:08,197][0m Trial 36 finished with value: 0.8471726725487676 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': 'mle'}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8471726725487676
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 01:41:44,437][0m Trial 37 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 02:02:21,316][0m Trial 38 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 02:22:58,997][0m Trial 39 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 02:43:36,490][0m Trial 40 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 03:04:13,629][0m Trial 41 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 03:24:50,885][0m Trial 42 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 03:45:27,189][0m Trial 43 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8468150822382937
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8491576487090565
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.849561538223476
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8453452025717385
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8444498070703212


[32m[I 2021-10-27 04:06:29,938][0m Trial 44 finished with value: 0.8471726725487676 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': 'mle'}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8471726725487676
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8468150822382937
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8491576487090565
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.849561538223476
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8453452025717385
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8444498070703212


[32m[I 2021-10-27 04:27:32,069][0m Trial 45 finished with value: 0.8471726725487676 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': 'mle'}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.8471726725487676
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 04:48:10,197][0m Trial 46 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 05:08:46,144][0m Trial 47 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 05:29:25,541][0m Trial 48 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 05:50:04,653][0m Trial 49 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8467395508145958
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8490648949947982
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495268213689964
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8452497867521451
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8443698607826702


[32m[I 2021-10-27 06:10:48,434][0m Trial 50 finished with value: 0.847097402563946 and parameters: {'encoder_name': None, 'feature_selection': None, 'k_means_method': None, 'pca_components': None}. Best is trial 6 with value: 0.8529520938646249.[0m


Valid AUC score for is 0.847097402563946
FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8460498790948396
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8487617952918354
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8490002113547026
FOLD 3
---------------------------------------------------


KeyboardInterrupt: 

In [32]:
# Show the parameter set of the study's best-scoring trial (rendered as the cell output).
study.best_trial.params

{'encoder_name': 'loo',
 'encode_before_kmeans': True,
 'feature_selection': 'Boruta',
 'k_means_method': 100,
 'k_means_clusters': 7,
 'pca_components': 'NO'}

In [33]:
# Record the best trial's parameters on the active W&B run under 'best_dataset_params' ...
wandb.log({'best_dataset_params': study.best_trial.params})
# ... then close the run; this must come after the final log call.
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
encode_before_kmeans,▁████████▁▁▁▁▁
k_means_clusters,▄▂▁▁█▄█
k_means_method,█▃██▁▃
pca_components,██▁█
value,█▃▂▆█▄▇▇▁███████████████████████████████

0,1
encode_before_kmeans,False
k_means_clusters,12
value,0.8471


Now the baseline for comparison: the same 5-fold cross-validation, fitted directly on `X` with no extra preprocessing steps.

In [35]:
# Baseline for comparison with the sweep: 5-fold CV of an XGBClassifier fitted
# directly on X / y as they stand at this point in the notebook (no extra
# preprocessing is applied inside this cell).

# define k-fold splitter (unshuffled, so fold membership is deterministic)
kfold = KFold(n_splits=5, shuffle=False)

# initialize lists for out-of-fold preds and ground truth
oof_preds, oof_y = [], []

for fold, (train_ids, valid_ids) in enumerate(kfold.split(X, y)):
    print(f"FOLD {fold}")
    print("---------------------------------------------------")

    # KFold yields *positional* indices. Plain `y[ids]` on a pandas.Series is a
    # label lookup and only agrees with positions when the index is 0..n-1, so
    # select positionally when `.iloc` exists and fall back to ndarray indexing.
    if hasattr(y, "iloc"):
        y_train, y_valid = y.iloc[train_ids], y.iloc[valid_ids]
    else:
        y_train, y_valid = y[train_ids], y[valid_ids]
    # X is sliced with .iloc, so it must be a pandas.DataFrame
    X_train, X_valid = X.iloc[train_ids, :], X.iloc[valid_ids, :]

    # define model; honour the notebook-level USE_GPU flag instead of assuming a GPU
    model = XGBClassifier(
        booster='gbtree',
        tree_method='gpu_hist' if USE_GPU else 'hist',
        random_state=42,
        n_jobs=-1,
        verbosity=1,
        objective='binary:logistic',
        **params)
    model.fit(X_train, y_train)
    # probability of the positive class for each validation row
    y_valid_preds = model.predict_proba(X_valid)[:, 1]

    # add the fold-model's OOF preds and ground truths to the out-of-loop lists
    oof_preds.extend(y_valid_preds)
    oof_y.extend(y_valid)

    fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
    print(f"Valid AUC for fold {fold} is {fold_valid_auc}")

# overall score: AUC over the pooled out-of-fold predictions of all folds
model_valid_auc = roc_auc_score(oof_y, oof_preds)
print(f"Valid AUC score for is {model_valid_auc}")

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8562083823462339
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8583507447326876
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.858679604843354
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8546676893529576
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8548239360305142
Valid AUC score for is 0.8566651115202035


So, the best of the sweep was trial 6 with an AUC of 0.8529520938646249, but the straight-up analysis without any bells and whistles gets 0.8566651115202035. 

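Conclusion: on these numbers, best to forget about preprocessing — with one caveat. The sweep trials that had every preprocessing option switched off (e.g. trial 21: encoder, feature selection, k-means and PCA all `None`) scored only 0.8471, about 0.01 AUC below this baseline's 0.8567. So the sweep's objective and the baseline cell must differ in something besides preprocessing, and that difference should be tracked down before treating the comparison as conclusive. $\blacksquare$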