# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# two manual flags (ex-config)
colab = False
gpu_available = False

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"sweep_xgboost_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    !pip install --upgrade xgboost

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    !pip install category_encoders
    !pip install catboost

    # lighgbm gpu compatible
    !git clone --recursive https://github.com/Microsoft/LightGBM
    ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
    
    # this part is from https://github.com/rapidsai/gputreeshap/issues/24
    !pip install cmake --upgrade
    # !pip install sklearn --upgrade
    !git clone --recursive https://github.com/dmlc/xgboost
    %cd /content/xgboost
    !mkdir build
    %cd build
    !cmake .. -DUSE_CUDA=ON
    !make -j4
    %cd /content/xgboost/python-package
    !python setup.py install --use-cuda --use-nccl
    !/opt/bin/nvidia-smi
    !pip install shap
    

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    
else:
    # if on local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    


## Ex-Model Config

In [8]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
# in the sweep version, this includes both ex-model parameters and defaults for model parameters
config_defaults = {
    # model config
#     "model": XGBClassifier,
#     "n_estimators": 100, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "test_size": 0.2,
#     "reg_lambda": None, 
    "scaler": MinMaxScaler, # TODO: experiment with others (but imputation may be slow)
    "scale_b4_impute": None,
    "imputer": None,
    "knn_imputer_n_neighbors": None, # None if a different imputer is used
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
#     'subsample': 1,
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
    # these are XGBoost default (my choice) params 
    "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
    "booster": 'gbtree', # dart may be marginally better, but will opt for this quicker approach as a default
    "n_estimators": 200, 
    "max_depth": 3,
    "learning_rate": 0.1,
    "n_jobs": -1,
    "verbosity": 1,
    "subsample": 1,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [9]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [10]:
# # here's how to load the original, unaltered dataset and separate features from targets
# df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
# features = [x for x in df.columns if x != 'claim']
# X = df[features]
# y = df.claim

# load the version of the dataset with imputations; X and y were stored separately, as feather and joblib respectively
# X = pd.read_feather(datapath/'X_StandardScaled_KNNImputed_5NN.feather') 
# y = load(datapath/'y.joblib')    
# X.index.name = 'id'
# y.index.name = 'id'

In [11]:
# scaler = exmodel_config['scaler']()
# X_scaled = scaler.fit_transform(X)
# X = pd.DataFrame(X_scaled, columns=X.columns)

In [12]:
# X.head()

In [13]:
# y.head()

## Sweep Config

In [14]:
# XGBOOST
sweep_config = {
    "method": "bayes", # try grid or random
    "metric": {
      "name": "auc",
      "goal": "maximize"   
    },
    "parameters": {
        "max_depth": {
            "distribution": "int_uniform", 
            "min": 2,
            "max": 5,
        },
        "learning_rate": {
            "distribution": "uniform", 
            "min": 0.001,
            "max": 0.4
            #             "values": [0.1, 0.2, 0.3]
        },
        "n_estimators": {
            "distribution": "int_uniform",
            "min": 100,
            "max": 2000,
#             "values": [50, 125, 200]
        },
#         "scaler": {
#             "distribution": "constant",
#             "values": [MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler],
#         },
        "reg_alpha": {
            "distribution": "uniform",
            "min": 0,
            "max": 4,
        },
        "reg_lambda": {
            "distribution": "uniform",
            "min": 1,
            "max": 5,
        },
        "subsample": {
            "distribution": "uniform",
            "min": 0.6,
            "max": 1,
        }
    }
}

### Model Config

In [15]:
def model_configurator(library, gpu_available=True):#, config=universal_config):
    """
    Function that provide task-specific or general preference arguments for the various models. 
    
    At first, will rely largely on defaults for hyperparameters, but later this function 
    can be supplemented later with optimal values, as they're learned in sweeps.
    .
    
    Rationale: creating a helper function will allow more experimentation later, and also
    composite runs that cycle through a series of models.
    
    :param model: A model from [XGBClassifier, LGBMClassifier, CatBoostClassifier]
    :return config: A dict that supplements default hyperparameter values with 1) 
                    task-appropriate ones, and perhaps later 2) optimal hyperparameter values.
    """
    config = {}
#     if library == 'xgboost':
#         config['model'] = XGBClassifier()
#     elif library == 'lightgbm':
#         config['model'] = LGBMClassifier()
#     elif library == 'catboost':
#         config['model'] = CatBoostClassifier()
#     else:
#         print("Invalid library")
#         return None
    
    # library-specific config
#     if config['model'] in [XGBClassfier, LGBMClassifier]:
    if library in ['xgboost', 'lightgbm']:
#         config['reg_alpha'] = None
        config['n_jobs'] = -1
        

#     if config['model'] == XGBClassifier:
    if library == 'xgboost':
#         config['tree_method'] = 'auto'
#         config['booster'] = 'gbtree' # or 'dart'
#         config['model'] = XGBClassifier
        config['verbosity'] = 1
        config['objective'] = 'binary:logistic'
#         config['eval_metric'] = ['auc', 'logloss', 'aucpr'],
#         config['eval_metric'] = 'logloss',
        config['tree_method'] = 'gpu_hist' if (gpu_available and colab) else 'auto' 
#         config['reg_alpha'] = 
#         config['n_estimators'] = 300

#     if config['model'] == LGBMClassifier:
    if library == 'lightgbm':
#         config['model'] = LGBMClassifier
        config['objective'] = 'binary'
        config['eval_metric'] = ['auc', 'logloss']
        config['boosting_type'] = 'gbdt' # or 'dart'
        config['device_type'] = 'cuda' if (gpu_available and colab) else 'cpu' # 'gpu' also possible, 'cpu' is default
        config['n_estimators'] = 500

#     if config['model'] == CatBoostClassifier:
    if library == 'catboost':
#         config['model'] = CatBoostClassifier
        config['task_type'] = 'GPU' if gpu_available else 'CPU'
        config['custom_metrics'] = ['Logloss', 'AUC'] # objective (loss fn) must be singular, defaults to Logloss
        config['n_estimators'] = 2000 # logged as "iterations" otherwise

    return config

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [16]:
config_run = {
    # wandb config:
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['sweep'],
    'notes': "Sweep for XGBoost, with scaling but no imputation or feature generation/selection",
}

# Preprocessing
Scaling has already occurred -- used `StandardScaler` as a precursor to using `KNNImputer(n_neighbors=5)`, on the premise that imputation would proceed more quickly if things were already scaled. I may try different permutations of this later: using `IterativeImputer` instead, before or after scaling, potentially with different scalers. 

# Feature Creation and Selection

In [17]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [18]:
# X.columns

In [19]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [20]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [21]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [22]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [23]:
# X = X[features[1:]]

# Training

In [24]:
# def train(X_train, X_valid, y_train, y_valid, model_config, 
#                                               random_state=42,
#                                               exmodel_config=exmodel_config, 
#                                               config_run=config_run):#, scaler): # passed in via config dict for now
#     """
#     Basic training function. Note that some of the options passed via the argument are
#     in fact hard-coded in, to avoid inconveniences.
#     :param X_train: the training set features
#     :param X_valid: the validation set features
#     :param y_train: the training set targets
#     :param y_valid: the validation set targets
#     :param random_staKFold: for reproducibility
#     :param exmodel_config: dict containing configuration details including the library 
#                             (thus model) used, preprocessing, and cross-validation
#     :param model_config: dict containing hyperparameter specifications for the model
#     :param config_run: dict containing wandb run configuration (name, etc)
#     """
    
    
#     wandb.init(
#         project="202109_Kaggle_tabular_playground",
#         save_code=True,
#         tags=config_run['tags'],
#         name=config_run['name'],
#         notes=config_run['notes'],
#         config=exmodel_config)   
        
#     if exmodel_config['library'] == 'xgboost':
#         model = XGBClassifier(
# #             tree_method=config['tree_method'],
# #             booster=config['booster'],
# #             n_estimators=config['n_estimators'], 
# #             max_depth=config['max_depth'],
# #             learning_rate=config['learning_rate'], 
# #             subsample=config['subsample'],
# #             reg_alpha=config['reg_alpha'],
# #             reg_lambda=config['reg_lambda'],
#             random_state=random_state,
#             n_jobs=model_config['n_jobs'], 
#             verbosity=model_config['verbosity'], 
#             objective=model_config['objective'],
# #             eval_metric=model_config['eval_metric'],
#             tree_method=model_config['tree_method'],
#             n_estimators=model_config['n_estimators'],
#         )
        
#         model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
# #                                     eval_metric=model_config['eval_metric'],
#                  )


#     elif exmodel_config['library'] == 'lightgbm':
#         model = LGBMClassifier(
# #             boosting_type=model_config['boosting_type'],
# #             max_depth=model_config['max_depth']
#             # TODO
#             random_state=random_state,
#             n_jobs=model_config['n_jobs'],
#             objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
#             boosting_type=model_config['boosting_type'],
#             device_type=model_config['device_type'],
#             n_estimators=model_config['n_estimators'],

#         )
        
#         model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],
# #                                     eval_metric=model_config['eval_metric'],
#                  )
        
#     elif exmodel_config['library'] == 'catboost':
#         print("CatBoost, therefore no WandB callback.")
#         model = CatBoostClassifier(
# #             n_estimators=config['n_estimators'],
# #             learning_rate=config['learning_rate'],
# #             max_depth=config['max_depth'],
#             task_type=model_config['task_type'],
#     #         n_jobs=config['n_jobs'],
#     #         verbosity=config['verbosity'],
#     #         subsample=config['subsample'],
#             n_estimators=model_config['n_estimators'],
#             random_state=random_state,
#             # objective='Logloss', # default, accepts only one
# #             custom_metrics=model_config['custom_metrics'],
#     #         bootstrap_type=config['bootstrap_type'],
#     #         device:config['device']
#         ) 
#         model.fit(X_train, y_train)
#     y_train_pred = model.predict(X_train)
#     train_loss = log_loss(y_train, y_train_pred)
#     train_auc = roc_auc_score(y_train, y_train_pred)
#     wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

#     if exmodel_config['library'] == 'catboost':
#         print(model.get_all_params())
#         wandb.log(model.get_all_params())
#     else:
#         wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
#     y_pred = model.predict(X_valid)
# #     mse = mean_squared_error(y_valid, y_pred)
# #     rmse = math.sqrt(abs(mse))
#     valid_loss = log_loss(y_valid, y_pred)
#     valid_auc = roc_auc_score(y_valid, y_pred)
#     wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
#     print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
# #     wandb.finish()   
#     return model
    

In [25]:
def train_xgboost_sweep():
    """
    Sweep training function. No parameters.
    """
    
    wandb.init(
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=config_defaults)   
        
    config = wandb.config
    # putting all the data extraction, scaling, splitting here
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
    features = [x for x in df.columns if x != 'claim']
    X = df[features]
    y = df.claim
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X = pd.DataFrame(X_scaled, columns=X.columns)    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=config['test_size'], 
                                                      random_state=config['random_state'],
                                                     )
    # parallel this for other libraries
    model = XGBClassifier(
        tree_method=config['tree_method'],
        booster=config['booster'],
        n_estimators=config['n_estimators'], 
        max_depth=config['max_depth'],
        learning_rate=config['learning_rate'], 
        reg_alpha=config['reg_alpha'],
        reg_lambda=config['reg_lambda'],
        subsample=config['subsample'],
        random_state=config['random_state'],
        n_jobs=config['n_jobs'], 
        verbosity=config['verbosity'], 
    )
#             random_state=random_state,
#             n_jobs=model_config['n_jobs'], 
#             verbosity=model_config['verbosity'], 
#             objective=model_config['objective'],
#             tree_method=model_config['tree_method'],
#             n_estimators=model_config['n_estimators'],
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
             )

    y_train_pred = model.predict(X_train)
    train_loss = log_loss(y_train, y_train_pred)
    train_auc = roc_auc_score(y_train, y_train_pred)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
    y_pred = model.predict(X_valid)
#     mse = mean_squared_error(y_valid, y_pred)
#     rmse = math.sqrt(abs(mse))
    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#     wandb.finish()   
#     return model
    

In [26]:
# def cross_validation(model_config, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
#     """
#     Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
#     If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
#     :param kfolds: int specifying number of k-folds to use in cross-validation
#     :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
#     """
#     if exmodel_config['kfolds'] == 1:
#         print("Proceeding with holdout")
#         X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=exmodel_config['test_size'], 
#                                                       random_state=random_state,
#                                                      )
#         model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
#                                                     model_config=model_config,
#                                                     config_run=config_run)
#         wandb.finish()
        
#     else:
#         X, y = X.to_numpy(), y.to_numpy()
#         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
#         models = {}
#         model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
#         (model_path).mkdir(exist_ok=True)
#         for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#             if fold < start_fold:
#                 continue
#             else:
#                 print(f"FOLD {fold}")
#                 print("---------------------------------------------------")
#                 X_train, X_valid = X[train_ids], X[valid_ids]
#                 y_train, y_valid = y[train_ids], y[valid_ids]
#                 model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
#                                                     model_config=model_config,
#                                                     config_run=config_run)
#                 wandb.log({'fold': fold})
#                 models[fold] = model
#                 dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))
#                 wandb.finish()
#         return models
        

# Interface

## Hyperparameter Sweep

In [27]:
sweep_id = wandb.sweep(sweep_config, project="202109_Kaggle_tabular_playground")

Create sweep with ID: icac24c5
Sweep URL: https://wandb.ai/hushifang/202109_Kaggle_tabular_playground/sweeps/icac24c5


In [28]:
wandb.agent(sweep_id, train_xgboost_sweep)

[34m[1mwandb[0m: Agent Starting Run: svxz42t3 with config:
[34m[1mwandb[0m: 	learning_rate: 0.054284173494598666
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 714
[34m[1mwandb[0m: 	reg_alpha: 3.4843326638962218
[34m[1mwandb[0m: 	reg_lambda: 1.0174714614570193
[34m[1mwandb[0m: 	subsample: 0.6154838408254518
[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)




Valid log-loss is 8.86194006331488
Valid AUC is 0.7433300642939508


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.77983
train_auc,0.74573
_runtime,443
_timestamp,1630902541
_step,716
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: zrx9hygx with config:
[34m[1mwandb[0m: 	learning_rate: 0.2361499684339052
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 1682
[34m[1mwandb[0m: 	reg_alpha: 2.4237778778285004
[34m[1mwandb[0m: 	reg_lambda: 4.968625013908704
[34m[1mwandb[0m: 	subsample: 0.7317279671777888




Valid log-loss is 9.155074442571742
Valid AUC is 0.7348012992931079


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.59565
train_auc,0.78002
_runtime,2098
_timestamp,1630904652
_step,1684
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: wjynqmd3 with config:
[34m[1mwandb[0m: 	learning_rate: 0.32813060062109173
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 1851
[34m[1mwandb[0m: 	reg_alpha: 0.2865810367764605
[34m[1mwandb[0m: 	reg_lambda: 4.8976374064470605
[34m[1mwandb[0m: 	subsample: 0.972196985681438




Valid log-loss is 9.41053590546609
Valid AUC is 0.7274205204484459


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,5.17189
train_auc,0.85022
_runtime,2903
_timestamp,1630907565
_step,1853
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7rysirqc with config:
[34m[1mwandb[0m: 	learning_rate: 0.3963042246765197
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 1593
[34m[1mwandb[0m: 	reg_alpha: 1.4226667817075405
[34m[1mwandb[0m: 	reg_lambda: 4.467524161222173
[34m[1mwandb[0m: 	subsample: 0.9257881662151229




Valid log-loss is 9.178326748211552
Valid AUC is 0.734089381146345


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.75148
train_auc,0.7465
_runtime,951
_timestamp,1630908564
_step,1595
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: r06sdt8f with config:
[34m[1mwandb[0m: 	learning_rate: 0.05093653230927352
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 157
[34m[1mwandb[0m: 	reg_alpha: 1.1640911778526637
[34m[1mwandb[0m: 	reg_lambda: 4.175217740601201
[34m[1mwandb[0m: 	subsample: 0.8722605455451715




Valid log-loss is 9.941806305283466
Valid AUC is 0.7118390439050606


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,9.86349
train_auc,0.71419
_runtime,98
_timestamp,1630908666
_step,159
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: puv8ov08 with config:
[34m[1mwandb[0m: 	learning_rate: 0.3088524081306037
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 159
[34m[1mwandb[0m: 	reg_alpha: 3.1122737328270547
[34m[1mwandb[0m: 	reg_lambda: 3.8786792239255083
[34m[1mwandb[0m: 	subsample: 0.9427821772381894




Valid log-loss is 9.096841222107312
Valid AUC is 0.7364676165250474


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.95348
train_auc,0.74066
_runtime,146
_timestamp,1630908866
_step,161
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mly26js9 with config:
[34m[1mwandb[0m: 	learning_rate: 0.3533032295278778
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 390
[34m[1mwandb[0m: 	reg_alpha: 2.037994067057936
[34m[1mwandb[0m: 	reg_lambda: 3.094454681952513
[34m[1mwandb[0m: 	subsample: 0.946584495357773




Valid log-loss is 9.18013204887763
Valid AUC is 0.7340605530442644


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.99416
train_auc,0.76846
_runtime,622
_timestamp,1630909507
_step,392
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jmgc9c5y with config:
[34m[1mwandb[0m: 	learning_rate: 0.3803905186877355
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 363
[34m[1mwandb[0m: 	reg_alpha: 2.299948910834581
[34m[1mwandb[0m: 	reg_lambda: 3.7303933181260973
[34m[1mwandb[0m: 	subsample: 0.8429270405767494




Valid log-loss is 9.238904010216439
Valid AUC is 0.732358728693898


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.01395
train_auc,0.76789
_runtime,579
_timestamp,1630910151
_step,365
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6n4sl2yx with config:
[34m[1mwandb[0m: 	learning_rate: 0.3065750976770167
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 1633
[34m[1mwandb[0m: 	reg_alpha: 3.536937917345897
[34m[1mwandb[0m: 	reg_lambda: 1.7617361648791023
[34m[1mwandb[0m: 	subsample: 0.6937301311573739


[34m[1mwandb[0m: Network error resolved after 0:00:38.321479, resuming normal operation.


Valid log-loss is 9.238005264298154
Valid AUC is 0.7324099470549422


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.30107
train_auc,0.78856
_runtime,1957
_timestamp,1630912141
_step,1635
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: 01drscmo with config:
[34m[1mwandb[0m: 	learning_rate: 0.32585832668246173
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 1337
[34m[1mwandb[0m: 	reg_alpha: 2.3144112485264006
[34m[1mwandb[0m: 	reg_lambda: 2.342296339251564
[34m[1mwandb[0m: 	subsample: 0.6482597028814413




Valid log-loss is 9.517803814229822
Valid AUC is 0.7243150447397545


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,6.11616
train_auc,0.82288
_runtime,2030
_timestamp,1630914179
_step,1339
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: 4fyi4j26 with config:
[34m[1mwandb[0m: 	learning_rate: 0.35316397900511154
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 103
[34m[1mwandb[0m: 	reg_alpha: 1.029173494082722
[34m[1mwandb[0m: 	reg_lambda: 4.148316356838679
[34m[1mwandb[0m: 	subsample: 0.6039759332626654




Valid log-loss is 9.145336522622248
Valid AUC is 0.7350578671784266


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.94893
train_auc,0.74079
_runtime,126
_timestamp,1630914309
_step,105
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: ghkn9djm with config:
[34m[1mwandb[0m: 	learning_rate: 0.13120791869869255
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 885
[34m[1mwandb[0m: 	reg_alpha: 2.2028330190962158
[34m[1mwandb[0m: 	reg_lambda: 4.778152835793538
[34m[1mwandb[0m: 	subsample: 0.6962326584117161




Valid log-loss is 9.078993174086085
Valid AUC is 0.7369832865762466


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.58251
train_auc,0.75141
_runtime,1054
_timestamp,1630915369
_step,887
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: bxmelhd8 with config:
[34m[1mwandb[0m: 	learning_rate: 0.15706474020127317
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 1656
[34m[1mwandb[0m: 	reg_alpha: 1.8988206103660294
[34m[1mwandb[0m: 	reg_lambda: 1.6808116708666985
[34m[1mwandb[0m: 	subsample: 0.8098639252844305




Valid log-loss is 9.080616713052425
Valid AUC is 0.7369457315014964


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.99484
train_auc,0.76845
_runtime,2017
_timestamp,1630917390
_step,1658
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zo0qwc7y with config:
[34m[1mwandb[0m: 	learning_rate: 0.03040207355687562
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 902
[34m[1mwandb[0m: 	reg_alpha: 0.862950177271244
[34m[1mwandb[0m: 	reg_lambda: 2.4424372957068385
[34m[1mwandb[0m: 	subsample: 0.8627450955234386




Valid log-loss is 8.858154369716202
Valid AUC is 0.7434418618176775


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.74004
train_auc,0.74689
_runtime,752
_timestamp,1630918185
_step,904
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: e47xd2o7 with config:
[34m[1mwandb[0m: 	learning_rate: 0.045941643286900154
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 1361
[34m[1mwandb[0m: 	reg_alpha: 2.234281526777497
[34m[1mwandb[0m: 	reg_lambda: 1.8489652101483092
[34m[1mwandb[0m: 	subsample: 0.9786549014995823




Valid log-loss is 9.037529146775514
Valid AUC is 0.7381920045219102


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.8679
train_auc,0.74314
_runtime,1183
_timestamp,1630919374
_step,1363
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 87zm3y2o with config:
[34m[1mwandb[0m: 	learning_rate: 0.13642252588629208
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 1053
[34m[1mwandb[0m: 	reg_alpha: 1.364987513215619
[34m[1mwandb[0m: 	reg_lambda: 2.882655498152568
[34m[1mwandb[0m: 	subsample: 0.9270531170256034




Valid log-loss is 9.105854211279526
Valid AUC is 0.7361960870720553


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.79136
train_auc,0.74535
_runtime,940
_timestamp,1630920352
_step,1055
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: e7j3vsgi with config:
[34m[1mwandb[0m: 	learning_rate: 0.2044221322419874
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 1192
[34m[1mwandb[0m: 	reg_alpha: 3.744891529871603
[34m[1mwandb[0m: 	reg_lambda: 2.8462722189900296
[34m[1mwandb[0m: 	subsample: 0.6900749481805675




Valid log-loss is 9.144794108968625
Valid AUC is 0.7350587534600783


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.93108
train_auc,0.74129
_runtime,693
_timestamp,1630921049
_step,1194
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1gilnmf6 with config:
[34m[1mwandb[0m: 	learning_rate: 0.015836736957902272
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 224
[34m[1mwandb[0m: 	reg_alpha: 2.610291048933759
[34m[1mwandb[0m: 	reg_lambda: 4.588892561395895
[34m[1mwandb[0m: 	subsample: 0.8823207072261525




Valid log-loss is 9.653000842959786
Valid AUC is 0.7202627574213598


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,9.51695
train_auc,0.72427
_runtime,259
_timestamp,1630921323
_step,226
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: o2dgatsy with config:
[34m[1mwandb[0m: 	learning_rate: 0.13445691981721666
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 1241
[34m[1mwandb[0m: 	reg_alpha: 0.8722825544317314
[34m[1mwandb[0m: 	reg_lambda: 2.045551710296159
[34m[1mwandb[0m: 	subsample: 0.6043139633850757




Valid log-loss is 9.102789617081598
Valid AUC is 0.7362867303360006


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.75013
train_auc,0.74655
_runtime,1058
_timestamp,1630922385
_step,1243
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 55hxnnoj with config:
[34m[1mwandb[0m: 	learning_rate: 0.1470303387632113
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 816
[34m[1mwandb[0m: 	reg_alpha: 2.5564150663734404
[34m[1mwandb[0m: 	reg_lambda: 1.5033811322885717
[34m[1mwandb[0m: 	subsample: 0.8229624631988434




Valid log-loss is 9.085663553267027
Valid AUC is 0.7367895866017327


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.54587
train_auc,0.75247
_runtime,984
_timestamp,1630923475
_step,818
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: cxiqa8px with config:
[34m[1mwandb[0m: 	learning_rate: 0.1783361489938947
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 303
[34m[1mwandb[0m: 	reg_alpha: 1.5809170645698147
[34m[1mwandb[0m: 	reg_lambda: 4.3614313624631755
[34m[1mwandb[0m: 	subsample: 0.9306044167893874




Valid log-loss is 9.08025538497327
Valid AUC is 0.7369489713745023


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.81061
train_auc,0.7448
_runtime,366
_timestamp,1630923844
_step,305
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: rwij4v48 with config:
[34m[1mwandb[0m: 	learning_rate: 0.20895120115941623
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 745
[34m[1mwandb[0m: 	reg_alpha: 0.8361991259177839
[34m[1mwandb[0m: 	reg_lambda: 3.0744949163543778
[34m[1mwandb[0m: 	subsample: 0.867091715541134




Valid log-loss is 9.12766824966108
Valid AUC is 0.735563537476949


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.78605
train_auc,0.7455
_runtime,658
_timestamp,1630924507
_step,747
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3nzy8hoa with config:
[34m[1mwandb[0m: 	learning_rate: 0.017593348412406586
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 1680
[34m[1mwandb[0m: 	reg_alpha: 1.415004365510422
[34m[1mwandb[0m: 	reg_lambda: 4.029007427795565
[34m[1mwandb[0m: 	subsample: 0.9981807290097384




Valid log-loss is 8.965600242192046
Valid AUC is 0.7403093119890809


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.8683
train_auc,0.74316
_runtime,961
_timestamp,1630925481
_step,1682
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: 57vmh6ck with config:
[34m[1mwandb[0m: 	learning_rate: 0.08213348043762587
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 1147
[34m[1mwandb[0m: 	reg_alpha: 3.879096183417685
[34m[1mwandb[0m: 	reg_lambda: 1.8357314828851057
[34m[1mwandb[0m: 	subsample: 0.7850559665924




Valid log-loss is 9.079894611984615
Valid AUC is 0.7369574437148845


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.64412
train_auc,0.74962
_runtime,1363
_timestamp,1630926849
_step,1149
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ua9ojo7h with config:
[34m[1mwandb[0m: 	learning_rate: 0.174790379756762
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 653
[34m[1mwandb[0m: 	reg_alpha: 3.9234645968668036
[34m[1mwandb[0m: 	reg_lambda: 3.752427026991685
[34m[1mwandb[0m: 	subsample: 0.6472212272398394




Valid log-loss is 9.122982624838091
Valid AUC is 0.7357153802734877


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.17539
train_auc,0.76321
_runtime,982
_timestamp,1630927886
_step,655
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: h340hhh2 with config:
[34m[1mwandb[0m: 	learning_rate: 0.060710003964986124
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 120
[34m[1mwandb[0m: 	reg_alpha: 0.3670520489840903
[34m[1mwandb[0m: 	reg_lambda: 3.3660404716655914
[34m[1mwandb[0m: 	subsample: 0.9587125807190822




Valid log-loss is 9.232411540490647
Valid AUC is 0.7325248431043273


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,9.08712
train_auc,0.73678
_runtime,145
_timestamp,1630928095
_step,122
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: 5dk8g8kl with config:
[34m[1mwandb[0m: 	learning_rate: 0.02031641800605311
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 1615
[34m[1mwandb[0m: 	reg_alpha: 1.5388463332315672
[34m[1mwandb[0m: 	reg_lambda: 3.891020051602351
[34m[1mwandb[0m: 	subsample: 0.6716659117539477




Valid log-loss is 8.946849742084389
Valid AUC is 0.740841264829669


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.66639
train_auc,0.749
_runtime,2307
_timestamp,1630930406
_step,1617
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: i648zona with config:
[34m[1mwandb[0m: 	learning_rate: 0.3645326345124088
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 902
[34m[1mwandb[0m: 	reg_alpha: 0.8304421573507264
[34m[1mwandb[0m: 	reg_lambda: 3.533758299555173
[34m[1mwandb[0m: 	subsample: 0.9202003218325668




Valid log-loss is 9.298759638598145
Valid AUC is 0.7306441947431559


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,6.72655
train_auc,0.80519
_runtime,1446
_timestamp,1630931856
_step,904
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: q3fhh270 with config:
[34m[1mwandb[0m: 	learning_rate: 0.2302427339790394
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 196
[34m[1mwandb[0m: 	reg_alpha: 2.650952278729264
[34m[1mwandb[0m: 	reg_lambda: 2.0836562952271143
[34m[1mwandb[0m: 	subsample: 0.6282197071276847




Valid log-loss is 9.064391252413827
Valid AUC is 0.7374148765338717


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.93884
train_auc,0.74109
_runtime,178
_timestamp,1630932038
_step,198
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7wqrj0q2 with config:
[34m[1mwandb[0m: 	learning_rate: 0.06379912738531882
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 886
[34m[1mwandb[0m: 	reg_alpha: 0.7100447670978758
[34m[1mwandb[0m: 	reg_lambda: 1.4152743540480635
[34m[1mwandb[0m: 	subsample: 0.6376621097898615




Valid log-loss is 9.042036707719689
Valid AUC is 0.7380662916406372


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.54934
train_auc,0.75238
_runtime,1327
_timestamp,1630933398
_step,888
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9znenwmb with config:
[34m[1mwandb[0m: 	learning_rate: 0.051224843695380086
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 1414
[34m[1mwandb[0m: 	reg_alpha: 2.7785952952168897
[34m[1mwandb[0m: 	reg_lambda: 4.169965095621846
[34m[1mwandb[0m: 	subsample: 0.7641832369818607




Valid log-loss is 9.034644790908658
Valid AUC is 0.7382770149796353


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.4327
train_auc,0.75576
_runtime,2183
_timestamp,1630935598
_step,1416
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2xyi4yru with config:
[34m[1mwandb[0m: 	learning_rate: 0.3697742236194748
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 1854
[34m[1mwandb[0m: 	reg_alpha: 0.2917859034576362
[34m[1mwandb[0m: 	reg_lambda: 2.6844594953528103
[34m[1mwandb[0m: 	subsample: 0.6653445188157959




Valid log-loss is 9.764792134833725
Valid AUC is 0.7171807138225499


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,4.93302
train_auc,0.85714
_runtime,2821
_timestamp,1630938473
_step,1856
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: haq2vtz1 with config:
[34m[1mwandb[0m: 	learning_rate: 0.359156681922107
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 832
[34m[1mwandb[0m: 	reg_alpha: 2.1317397339768704
[34m[1mwandb[0m: 	reg_lambda: 3.7507714781034127
[34m[1mwandb[0m: 	subsample: 0.9685640263913229




Valid log-loss is 9.161921875617205
Valid AUC is 0.734571948673215


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.57742
train_auc,0.75155
_runtime,777
_timestamp,1630939314
_step,834
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Agent Starting Run: r266xje4 with config:
[34m[1mwandb[0m: 	learning_rate: 0.1667407406055106
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 1069
[34m[1mwandb[0m: 	reg_alpha: 1.7290080807388457
[34m[1mwandb[0m: 	reg_lambda: 3.3115577048940463
[34m[1mwandb[0m: 	subsample: 0.8076113870293917




Valid log-loss is 9.106035743430562
Valid AUC is 0.7362026502424268


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.33534
train_auc,0.75857
_runtime,1318
_timestamp,1630940636
_step,1071
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: loyoxhsd with config:
[34m[1mwandb[0m: 	learning_rate: 0.06977873612123872
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 828
[34m[1mwandb[0m: 	reg_alpha: 0.8846296964533171
[34m[1mwandb[0m: 	reg_lambda: 1.3184468140506094
[34m[1mwandb[0m: 	subsample: 0.9579382738698057




Valid log-loss is 9.02779403566742
Valid AUC is 0.7384750494789905


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.5186
train_auc,0.75327
_runtime,1423
_timestamp,1630942075
_step,830
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


## Runs

In [34]:
library = 'xgboost'
exmodel_config['library'] = library
model_config = model_configurator(library)
xgboost_models = cross_validation(model_config)

FOLD 0
---------------------------------------------------




Valid log-loss is 9.182656366311678
Valid AUC is 0.7339909390942767


VBox(children=(Label(value=' 0.27MB of 0.27MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.6866
train_auc,0.77737
_runtime,582
_timestamp,1630882406
_step,303
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 1
---------------------------------------------------




Valid log-loss is 9.060966569699733
Valid AUC is 0.7374493272406724


VBox(children=(Label(value=' 0.28MB of 0.28MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.69124
train_auc,0.77725
_runtime,582
_timestamp,1630883000
_step,303
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 2
---------------------------------------------------




Valid log-loss is 9.157236225752541
Valid AUC is 0.7348261237392092


VBox(children=(Label(value=' 0.29MB of 0.29MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.68308
train_auc,0.77745
_runtime,602
_timestamp,1630883609
_step,303
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 3
---------------------------------------------------




Valid log-loss is 9.160840850471198
Valid AUC is 0.7347094322023537


VBox(children=(Label(value=' 0.30MB of 0.30MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.69471
train_auc,0.77712
_runtime,659
_timestamp,1630884275
_step,303
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 4
---------------------------------------------------




Valid log-loss is 9.105002088717281
Valid AUC is 0.736265741386542


VBox(children=(Label(value=' 0.30MB of 0.30MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,7.68339
train_auc,0.77746
_runtime,589
_timestamp,1630884870
_step,303
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


{0: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=300, n_jobs=-1, num_parallel_tree=1, random_state=42,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='auto', validate_parameters=1, verbosity=1),
 1: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=300, n_jobs=-1, n

In [None]:
# for scaler in [StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler]:
#     exmodel_config['scaler'] = scaler
#     scaler = scaler()
#     X_scaled = scaler.fit_transform(X)
#     X = pd.DataFrame(X_scaled, columns=X.columns)
#     exmodel_config['library'] = 'lightgbm'
#     model_config = model_configurator('lightgbm')
#     cross_validation(model_config)

In [35]:
# library = 'xgboost'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# xgboost_models = cross_validation(model_config)

library = 'lightgbm'
exmodel_config['library'] = library
model_config = model_configurator(library)
lightgbm_models = cross_validation(model_config)

library = 'catboost'
gpu_available = True
exmodel_config['library'] = library
model_config = model_configurator(library)
catboost_models = cross_validation(model_config)

FOLD 0
---------------------------------------------------


Valid log-loss is 8.649753788149516
Valid AUC is 0.7495285330622696


VBox(children=(Label(value=' 0.32MB of 0.32MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.20492
train_auc,0.76243
_runtime,73
_timestamp,1630885210
_step,503
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 1
---------------------------------------------------


Valid log-loss is 8.571151211404626
Valid AUC is 0.7517893500774024


VBox(children=(Label(value=' 0.31MB of 0.31MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.21263
train_auc,0.76221
_runtime,71
_timestamp,1630885287
_step,503
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 2
---------------------------------------------------


Valid log-loss is 8.634069255358394
Valid AUC is 0.7500094127198139


VBox(children=(Label(value=' 0.32MB of 0.32MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.20744
train_auc,0.76235
_runtime,73
_timestamp,1630885366
_step,503
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 3
---------------------------------------------------


Valid log-loss is 8.629741515384016
Valid AUC is 0.7501290665135417


VBox(children=(Label(value=' 0.32MB of 0.32MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.22376
train_auc,0.76188
_runtime,69
_timestamp,1630885442
_step,503
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 4
---------------------------------------------------


Valid log-loss is 8.57912794289132
Valid AUC is 0.7515808857394004


VBox(children=(Label(value=' 0.33MB of 0.33MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.2265
train_auc,0.7618
_runtime,70
_timestamp,1630885518
_step,503
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 0
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6928148	total: 12.3ms	remaining: 24.5s
1:	learn: 0.6924862	total: 22.3ms	remaining: 22.2s
2:	learn: 0.6921545	total: 32.1ms	remaining: 21.4s
3:	learn: 0.6918300	total: 42.1ms	remaining: 21s
4:	learn: 0.6915174	total: 52.2ms	remaining: 20.8s
5:	learn: 0.6912019	total: 62.1ms	remaining: 20.6s
6:	learn: 0.6908800	total: 72.4ms	remaining: 20.6s
7:	learn: 0.6905571	total: 83.6ms	remaining: 20.8s
8:	learn: 0.6902453	total: 93.7ms	remaining: 20.7s
9:	learn: 0.6899316	total: 104ms	remaining: 20.6s
10:	learn: 0.6896187	total: 113ms	remaining: 20.5s
11:	learn: 0.6893081	total: 123ms	remaining: 20.4s
12:	learn: 0.6889979	total: 134ms	remaining: 20.5s
13:	learn: 0.6886846	total: 144ms	remaining: 20.4s
14:	learn: 0.6883737	total: 154ms	remaining: 20.3s
15:	learn: 0.6880665	total: 163ms	remaining: 20.2s
16:	learn: 0.6877606	total: 173ms	remaining: 20.2s
17:	learn: 0.6874561	total: 184ms	remaining: 20.3s
18:	learn: 0.687

VBox(children=(Label(value=' 0.46MB of 0.46MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.72075
train_auc,0.74745
_runtime,64
_timestamp,1630885588
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 1
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6928212	total: 10.3ms	remaining: 20.6s
1:	learn: 0.6924856	total: 20.1ms	remaining: 20.1s
2:	learn: 0.6921573	total: 29.7ms	remaining: 19.8s
3:	learn: 0.6918373	total: 39.4ms	remaining: 19.6s
4:	learn: 0.6915218	total: 50ms	remaining: 19.9s
5:	learn: 0.6912055	total: 59.7ms	remaining: 19.8s
6:	learn: 0.6908865	total: 70.3ms	remaining: 20s
7:	learn: 0.6905726	total: 80.8ms	remaining: 20.1s
8:	learn: 0.6902609	total: 90.6ms	remaining: 20s
9:	learn: 0.6899492	total: 109ms	remaining: 21.8s
10:	learn: 0.6896355	total: 125ms	remaining: 22.6s
11:	learn: 0.6893237	total: 135ms	remaining: 22.3s
12:	learn: 0.6890152	total: 145ms	remaining: 22.1s
13:	learn: 0.6887036	total: 154ms	remaining: 21.8s
14:	learn: 0.6883983	total: 164ms	remaining: 21.7s
15:	learn: 0.6880929	total: 173ms	remaining: 21.4s
16:	learn: 0.6877863	total: 183ms	remaining: 21.3s
17:	learn: 0.6874794	total: 193ms	remaining: 21.2s
18:	learn: 0.6871754

VBox(children=(Label(value=' 0.46MB of 0.46MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.73427
train_auc,0.74707
_runtime,63
_timestamp,1630885659
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 2
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6928163	total: 11.1ms	remaining: 22.3s
1:	learn: 0.6924804	total: 22ms	remaining: 21.9s
2:	learn: 0.6921627	total: 32.6ms	remaining: 21.7s
3:	learn: 0.6918395	total: 43.2ms	remaining: 21.6s
4:	learn: 0.6915186	total: 53.8ms	remaining: 21.5s
5:	learn: 0.6911996	total: 64.5ms	remaining: 21.4s
6:	learn: 0.6908838	total: 75.1ms	remaining: 21.4s
7:	learn: 0.6905652	total: 87ms	remaining: 21.7s
8:	learn: 0.6902511	total: 98ms	remaining: 21.7s
9:	learn: 0.6899327	total: 109ms	remaining: 21.6s
10:	learn: 0.6896204	total: 119ms	remaining: 21.6s
11:	learn: 0.6893110	total: 129ms	remaining: 21.5s
12:	learn: 0.6890027	total: 139ms	remaining: 21.3s
13:	learn: 0.6886929	total: 149ms	remaining: 21.2s
14:	learn: 0.6883792	total: 160ms	remaining: 21.2s
15:	learn: 0.6880698	total: 171ms	remaining: 21.2s
16:	learn: 0.6877616	total: 182ms	remaining: 21.2s
17:	learn: 0.6874522	total: 193ms	remaining: 21.2s
18:	learn: 0.6871480

VBox(children=(Label(value=' 0.69MB of 0.69MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.72201
train_auc,0.7474
_runtime,63
_timestamp,1630885729
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 3
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6928187	total: 10.6ms	remaining: 21.2s
1:	learn: 0.6924912	total: 20.3ms	remaining: 20.3s
2:	learn: 0.6921678	total: 29.9ms	remaining: 19.9s
3:	learn: 0.6918450	total: 39.7ms	remaining: 19.8s
4:	learn: 0.6915301	total: 50.7ms	remaining: 20.2s
5:	learn: 0.6912102	total: 61.5ms	remaining: 20.4s
6:	learn: 0.6908926	total: 71.2ms	remaining: 20.3s
7:	learn: 0.6905745	total: 80.9ms	remaining: 20.2s
8:	learn: 0.6902601	total: 91.7ms	remaining: 20.3s
9:	learn: 0.6899487	total: 102ms	remaining: 20.4s
10:	learn: 0.6896348	total: 113ms	remaining: 20.5s
11:	learn: 0.6893272	total: 132ms	remaining: 21.9s
12:	learn: 0.6890165	total: 152ms	remaining: 23.2s
13:	learn: 0.6887061	total: 166ms	remaining: 23.6s
14:	learn: 0.6884023	total: 176ms	remaining: 23.3s
15:	learn: 0.6880944	total: 186ms	remaining: 23s
16:	learn: 0.6877908	total: 195ms	remaining: 22.8s
17:	learn: 0.6874864	total: 205ms	remaining: 22.6s
18:	learn: 0.687

VBox(children=(Label(value=' 0.69MB of 0.69MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.7267
train_auc,0.74727
_runtime,62
_timestamp,1630885799
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 4
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6928226	total: 11.8ms	remaining: 23.7s
1:	learn: 0.6924947	total: 22.5ms	remaining: 22.5s
2:	learn: 0.6921660	total: 33.2ms	remaining: 22.1s
3:	learn: 0.6918403	total: 43.9ms	remaining: 21.9s
4:	learn: 0.6915252	total: 54.6ms	remaining: 21.8s
5:	learn: 0.6912066	total: 65.2ms	remaining: 21.7s
6:	learn: 0.6908902	total: 76ms	remaining: 21.6s
7:	learn: 0.6905727	total: 86.7ms	remaining: 21.6s
8:	learn: 0.6902607	total: 97.9ms	remaining: 21.7s
9:	learn: 0.6899501	total: 108ms	remaining: 21.5s
10:	learn: 0.6896347	total: 118ms	remaining: 21.3s
11:	learn: 0.6893226	total: 127ms	remaining: 21.1s
12:	learn: 0.6890121	total: 137ms	remaining: 21s
13:	learn: 0.6887001	total: 147ms	remaining: 20.8s
14:	learn: 0.6883899	total: 157ms	remaining: 20.7s
15:	learn: 0.6880826	total: 166ms	remaining: 20.6s
16:	learn: 0.6877757	total: 176ms	remaining: 20.5s
17:	learn: 0.6874688	total: 186ms	remaining: 20.5s
18:	learn: 0.68716

VBox(children=(Label(value=' 0.88MB of 0.88MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.73543
train_auc,0.74702
_runtime,63
_timestamp,1630885869
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


In [40]:
# # this loads models if you need to (or forgot to save them on training above)
# xgboost_models = {}
# xgboost_models_path = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/baseline_20210905a_152521_5folds/xgboost/')
# for fold in range(5):
#     xgboost_models[fold] = load(xgboost_models_path/f'xgboost_fold{fold}_model.joblib')

# Inference

In [44]:
test_df = pd.read_csv(datapath/'test.csv', index_col='id', low_memory=False)
# test_df.to_feather(datapath/'test.feather') # issue with index being non-default; fix later
# test_df = pd.read_feather(datapath/'test.feather')

ValueError: feather does not support serializing a non-default index for the index; you can .reset_index() to make the index into column(s)

In [45]:
test_df.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,6221.0,...,0.16253,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357
957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,934.43,...,0.81528,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125
957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,131.81,...,0.81831,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797
957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,-26.473,...,0.86559,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291
957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,5999.4,...,0.2519,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [46]:
features = [x for x in test_df.columns if x != 'claim']
X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [25]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [26]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [27]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [28]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [29]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

# Scaling
Now, going to scale using `MaxAbsScaler`

In [47]:
scaler = exmodel_config['scaler']()
X_test_scaled = scaler.fit_transform(X_test)
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [49]:
xgboost_preds = {}
for fold in xgboost_models.keys():
    xgboost_preds[fold] = xgboost_models[fold].predict(X_test_scaled)



In [50]:
lightgbm_preds = {}
for fold in lightgbm_models.keys():
    lightgbm_preds[fold] = lightgbm_models[fold].predict(X_test_scaled)

In [51]:
catboost_preds = {}
for fold in catboost_models.keys():
    catboost_preds[fold] = catboost_models[fold].predict(X_test_scaled)

In [54]:
preds_path = Path(datapath/f"preds/{config_run['name']}_{exmodel_config['kfolds']}folds/")
preds_path.mkdir(exist_ok=True)

for library in ['xgboost', 'lightgbm', 'catboost']:
    (preds_path/library).mkdir(exist_ok=True)

In [56]:
dump(xgboost_preds, Path(preds_path/'xgboost/xgboost_preds_dict.joblib'))

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/baseline_20210905a_152521_5folds/xgboost/xgboost_preds_dict.joblib']

In [57]:
dump(lightgbm_preds, Path(preds_path/'lightgbm/lightgbm_preds_dict.joblib'))

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/baseline_20210905a_152521_5folds/lightgbm/lightgbm_preds_dict.joblib']

In [58]:
dump(catboost_preds, Path(preds_path/'catboost/catboost_preds_dict.joblib'))

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/baseline_20210905a_152521_5folds/catboost/catboost_preds_dict.joblib']

In [60]:
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [61]:
final_xgboost_preds = (xgboost_preds[0] + xgboost_preds[1] + xgboost_preds[2] + xgboost_preds[3] + xgboost_preds[4]) / 5
final_lightgbm_preds = (lightgbm_preds[0] + lightgbm_preds[1] + lightgbm_preds[2] + lightgbm_preds[3] + lightgbm_preds[4]) / 5
final_catboost_preds = (catboost_preds[0] + catboost_preds[1] + catboost_preds[2] + catboost_preds[3] + catboost_preds[4]) / 5

In [63]:
print(final_xgboost_preds[:10])
print(final_lightgbm_preds[:10])
print(final_catboost_preds[:10])

[0.  0.  0.4 0.  0.  0.  1.  0.  0.  0.8]
[0.  0.  0.2 0.  0.  0.  1.  0.  0.  1. ]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 1.]


In [66]:
sample_df.loc[:, 'claim'] = final_xgboost_preds

In [67]:
sample_df.head()

Unnamed: 0,id,claim
0,957919,0.0
1,957920,0.0
2,957921,0.4
3,957922,0.0
4,957923,0.0


In [68]:
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [69]:
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_xgboost-mean.csv", index=False)

In [70]:
sample_df.loc[:, 'claim'] = final_lightgbm_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_lightgbm-mean.csv", index=False)

In [71]:
sample_df.loc[:, 'claim'] = final_catboost_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_catboost-mean.csv", index=False)

In [72]:
ensemble_preds = (final_xgboost_preds + final_lightgbm_preds + final_catboost_preds) / 3

In [73]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_ensemble-equal_model_and_fold_weight_mean.csv", index=False)

In [74]:
ensemble_preds = 0.4*final_xgboost_preds + 0.3*final_lightgbm_preds + 0.3*final_catboost_preds

In [75]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_ensemble-0.4xgboost_0.3lightgbm_0.3catboost-equal_fold_weight_mean.csv", index=False)

# Ensembling with CatBoost

In [46]:
catboost_models = {}
saved_models_path = Path('/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/models/CatBoost_ensemble_20210831_144245_5folds/')
for fold in range(5):
    catboost_models[fold] = load(filename=Path(saved_models_path/f'catboost_fold{fold}_model.joblib'))

In [47]:
catboost_models

{0: <catboost.core.CatBoostRegressor at 0x7f1b154ecfa0>,
 1: <catboost.core.CatBoostRegressor at 0x7f1b1548a880>,
 2: <catboost.core.CatBoostRegressor at 0x7f1b154ec0a0>,
 3: <catboost.core.CatBoostRegressor at 0x7f1b1548ac40>,
 4: <catboost.core.CatBoostRegressor at 0x7f1b154ecdf0>}

In [48]:
catboost_preds = {}
for fold in catboost_models.keys():
    catboost_preds[fold] = catboost_models[fold].predict(X_test_scaled)

In [50]:
final_catboost_preds = (catboost_preds[0] + catboost_preds[1] + catboost_preds[2] + catboost_preds[3] + catboost_preds[4]) / 5

In [51]:
ensemble_preds = 0.6 * final_catboost_preds + 0.4 * final_preds

In [54]:
ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.40583658, 4.58774964, 8.32465697, 7.18375788, 7.13135284,
        9.67367649, 9.96252577, 5.89393404, 7.22270917, 7.53612671]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [58]:
final_ensemble_preds = 0.65 * final_catboost_preds + 0.35 * final_preds

In [59]:
final_ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.4389943 , 4.5923435 , 8.36373235, 7.18870129, 7.10523255,
        9.67708804, 9.96419843, 5.87235472, 7.23656006, 7.52442615]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [60]:
sample_df.loc[:, 'loss'] = final_ensemble_preds

In [61]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.438994
1,250001,4.592343
2,250002,8.363732
3,250003,7.188701
4,250004,7.105233


In [62]:
sample_df.to_csv('XGBoost0.35-Catboost0.65_ensemble_20210831_no_feature_gen.csv', index=False)

# Experiment - fitting model on full training set

In [36]:
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# scaling (i.e. normalizing)
scaler = config['scaler']()
X_s = scaler.fit_transform(X)
X_test_s = scaler.fit_transform(X_test)

# selecting features
selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
                                      k=config['k_best'])
X_fs = selector.fit_transform(X_s, y)
X_test_fs = X_test_s[:, selector.get_support()]

model = XGBRegressor(
    tree_method=config['tree_method'],
    booster=config['booster'],
    n_estimators=config['n_estimators'], 
    max_depth=config['max_depth'],
    learning_rate=config['learning_rate'], 
    test_size=config['test_size'],
    subsample=config['subsample'],
    random_state=config['random_state'],
    n_jobs=config['n_jobs'], 
    verbosity=config['verbosity'], 
)
#     wandb.log({'params': model.get_params()}) # logging model parameters
model.fit(X_fs, y)#, callbacks=[wandb.xgboost.wandb_callback()])

Parameters: { "test_size" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1522, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             test_size=0.2, tree_method='auto', validate_parameters=1,
             verbosity=1)

In [37]:
y_test_preds = model.predict(X_test_fs)



In [38]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [39]:
sample_df.loc[:, 'loss'] = y_test_preds

In [40]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.027956
1,250001,4.305676
2,250002,7.300106
3,250003,6.988875
4,250004,7.316631


In [41]:
sample_df.to_csv('202108241211_XGBoost_fullset.csv', index=False)