# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# two manual flags (ex-config)
colab = False
gpu_available = False

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"baseline_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    !pip install --upgrade xgboost

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    !pip install category_encoders
    !pip install catboost

    # lighgbm gpu compatible
    !git clone --recursive https://github.com/Microsoft/LightGBM
    ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
    
    # this part is from https://github.com/rapidsai/gputreeshap/issues/24
    !pip install cmake --upgrade
    # !pip install sklearn --upgrade
    !git clone --recursive https://github.com/dmlc/xgboost
    %cd /content/xgboost
    !mkdir build
    %cd build
    !cmake .. -DUSE_CUDA=ON
    !make -j4
    %cd /content/xgboost/python-package
    !python setup.py install --use-cuda --use-nccl
    !/opt/bin/nvidia-smi
    !pip install shap
    

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [8]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    
else:
    # if on local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    
# load the version of the dataset with imputations; X and y were stored separately, as feather and joblib respectively
# X = pd.read_feather(datapath/'X_StandardScaled_KNNImputed_5NN.feather') 
# y = load(datapath/'y.joblib')    
# X.index.name = 'id'
# y.index.name = 'id'

## Configuration

### Ex-Model Config

In [11]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
    # model config
#     "model": XGBClassifier,
#     "n_estimators": 100, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "test_size": 0.2,
#     "reg_lambda": None, 
    "scaler": None, # TODO: experiment with others (but imputation may be slow)
    "scale_b4_impute": False,
    "imputer": None,
    "knn_imputer_n_neighbors": None, # None if a different imputer is used
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
#     'subsample': 1,
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

### Data 

In [8]:
# # here's how to load the original, unaltered dataset and separate features from targets
df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
features = [x for x in df.columns if x != 'claim']
X = df[features]
y = df.claim

# 

In [9]:
X.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,0.11093,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,0.97673,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,0.20102,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-0.01182,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,0.92739,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049


In [10]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: claim, dtype: int64

### Model Config

In [12]:
def model_configurator(library, gpu_available=True):#, config=universal_config):
    """
    Function that provide task-specific or general preference arguments for the various models. 
    
    At first, will rely largely on defaults for hyperparameters, but later this function 
    can be supplemented later with optimal values, as they're learned in sweeps.
    .
    
    Rationale: creating a helper function will allow more experimentation later, and also
    composite runs that cycle through a series of models.
    
    :param model: A model from [XGBClassifier, LGBMClassifier, CatBoostClassifier]
    :return config: A dict that supplements default hyperparameter values with 1) 
                    task-appropriate ones, and perhaps later 2) optimal hyperparameter values.
    """
    config = {}
#     if library == 'xgboost':
#         config['model'] = XGBClassifier()
#     elif library == 'lightgbm':
#         config['model'] = LGBMClassifier()
#     elif library == 'catboost':
#         config['model'] = CatBoostClassifier()
#     else:
#         print("Invalid library")
#         return None
    
    # library-specific config
#     if config['model'] in [XGBClassfier, LGBMClassifier]:
    if library in ['xgboost', 'lightgbm']:
#         config['reg_alpha'] = None
        config['n_jobs'] = -1
        config['n_estimators'] = 300

#     if config['model'] == XGBClassifier:
    if library == 'xgboost':
#         config['tree_method'] = 'auto'
#         config['booster'] = 'gbtree' # or 'dart'
#         config['model'] = XGBClassifier
        config['verbosity'] = 1
        config['objective'] = 'binary:logistic'
#         config['eval_metric'] = ['auc', 'logloss', 'aucpr'],
#         config['eval_metric'] = 'logloss',
        config['tree_method'] = 'gpu_hist' if (gpu_available and colab) else 'auto' 
#         config['reg_alpha'] = 

#     if config['model'] == LGBMClassifier:
    if library == 'lightgbm':
#         config['model'] = LGBMClassifier
        config['objective'] = 'binary'
        config['eval_metric'] = ['auc', 'logloss']
        config['boosting_type'] = 'gbdt' # or 'dart'
        config['device_type'] = 'cuda' if (gpu_available and colab) else 'cpu' # 'gpu' also possible, 'cpu' is default

#     if config['model'] == CatBoostClassifier:
    if library == 'catboost':
#         config['model'] = CatBoostClassifier
        config['task_type'] = 'GPU' if gpu_available else 'CPU'
        config['custom_metrics'] = ['Logloss', 'AUC'] # objective (loss fn) must be singular, defaults to Logloss
        config['n_estimators'] = 1000 # logged as "iterations" otherwise

    return config

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
config_run = {
    # wandb config:
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['baseline'],
    'notes': "Initial runs of each model-type, with sane defaults.",
}

# Preprocessing
Scaling has already occurred -- used `StandardScaler` as a precursor to using `KNNImputer(n_neighbors=5)`, on the premise that imputation would proceed more quickly if things were already scaled. I may try different permutations of this later: using `IterativeImputer` instead, before or after scaling, potentially with different scalers. 

# Feature Creation and Selection

In [None]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [None]:
# X.columns

In [None]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [None]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [None]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [None]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [None]:
# X = X[features[1:]]

# Training

In [16]:
def train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param random_state: for reproducibility
    :param exmodel_config: dict containing configuration details including the library 
                            (thus model) used, preprocessing, and cross-validation
    :param model_config: dict containing hyperparameter specifications for the model
    :param config_run: dict containing wandb run configuration (name, etc)
    """
    
    
    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   
        
    if exmodel_config['library'] == 'xgboost':
        model = XGBClassifier(
#             tree_method=config['tree_method'],
#             booster=config['booster'],
#             n_estimators=config['n_estimators'], 
#             max_depth=config['max_depth'],
#             learning_rate=config['learning_rate'], 
#             subsample=config['subsample'],
#             reg_alpha=config['reg_alpha'],
#             reg_lambda=config['reg_lambda'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
            tree_method=model_config['tree_method'],
            n_estimators=model_config['n_estimators'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )


    elif exmodel_config['library'] == 'lightgbm':
        model = LGBMClassifier(
#             boosting_type=model_config['boosting_type'],
#             max_depth=model_config['max_depth']
            # TODO
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
            eval_metric=model_config['eval_metric'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            n_estimators=model_config['n_estimators'],

        )
        
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )
        
    elif exmodel_config['library'] == 'catboost':
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
#             n_estimators=config['n_estimators'],
#             learning_rate=config['learning_rate'],
#             max_depth=config['max_depth'],
            task_type=model_config['task_type'],
    #         n_jobs=config['n_jobs'],
    #         verbosity=config['verbosity'],
    #         subsample=config['subsample'],
            n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
#             custom_metrics=model_config['custom_metrics'],
    #         bootstrap_type=config['bootstrap_type'],
    #         device:config['device']
        ) 
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        train_loss = log_loss(y_train, y_train_pred)
        train_auc = roc_auc_score(y_train, y_train_pred)
        wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    if exmodel_config['library'] == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
    y_pred = model.predict(X_valid)
#     mse = mean_squared_error(y_valid, y_pred)
#     rmse = math.sqrt(abs(mse))
    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#     wandb.finish()   
    return model
    

In [17]:
def cross_validation(model_config, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    if exmodel_config['kfolds'] == 1:
        print("Proceeding with holdout")
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=exmodel_config['test_size'], 
                                                      random_state=random_state,
                                                     )
        model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
        wandb.finish()
        
    else:
        kfold = config['kfold_strategy'](n_splits=kfolds, shuffle=True, random_state=random_state)
        models = {}
        model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
        (model_path).mkdir(exist_ok=True)
        for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
            if fold < start_fold:
                continue
            else:
                print(f"FOLD {fold}")
                print("---------------------------------------------------")
                X, y = X.to_numpy(), y.to_numpy()
                X_train, X_valid = X[train_ids], X[valid_ids]
                y_train, y_valid = y[train_ids], y[valid_ids]
                model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
                wandb.log({'fold': fold})
                models[fold] = model
                dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))
                wandb.finish()
                
        

# Interface

## Runs

Here, let's do the initial baseline for each of the Big Three libraries, largely using their own defaults.

In [18]:
for library in ['xgboost', 'lightgbm']:
    exmodel_config['library'] = library
    model_config = model_configurator(library)
    cross_validation(model_config)

Proceeding with holdout


[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)




Valid log-loss is 9.146419597012132
Valid AUC is 0.7350395710339351


VBox(children=(Label(value=' 0.22MB of 0.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1
colsample_bynode,1
colsample_bytree,1
gamma,0
gpu_id,-1
importance_type,gain


0,1
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁
gamma,▁
gpu_id,▁
learning_rate,▁
max_delta_step,▁
max_depth,▁


Proceeding with holdout


Valid log-loss is 8.648491798463809
Valid AUC is 0.7495649333825926


VBox(children=(Label(value=' 0.22MB of 0.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1
min_child_samples,20
min_child_weight,0.001
min_split_gain,0.0
n_estimators,300
n_jobs,-1


0,1
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁
min_split_gain,▁
n_estimators,▁
n_jobs,▁
num_leaves,▁
random_state,▁


In [19]:
gpu_available = True
exmodel_config['library'] = 'catboost'
model_config = model_configurator('catboost')
cross_validation(model_config)

Proceeding with holdout


CatBoost, therefore no WandB callback.
Learning rate set to 0.023395
0:	learn: 0.6925348	total: 12.1ms	remaining: 12.1s
1:	learn: 0.6919313	total: 22.1ms	remaining: 11s
2:	learn: 0.6913204	total: 31.9ms	remaining: 10.6s
3:	learn: 0.6907180	total: 41.5ms	remaining: 10.3s
4:	learn: 0.6901401	total: 51.2ms	remaining: 10.2s
5:	learn: 0.6895602	total: 61ms	remaining: 10.1s
6:	learn: 0.6889820	total: 70.9ms	remaining: 10.1s
7:	learn: 0.6884069	total: 80.6ms	remaining: 9.99s
8:	learn: 0.6878375	total: 90.6ms	remaining: 9.97s
9:	learn: 0.6872611	total: 100ms	remaining: 9.94s
10:	learn: 0.6866859	total: 110ms	remaining: 9.9s
11:	learn: 0.6861257	total: 120ms	remaining: 9.91s
12:	learn: 0.6855607	total: 130ms	remaining: 9.91s
13:	learn: 0.6850009	total: 141ms	remaining: 9.96s
14:	learn: 0.6844519	total: 151ms	remaining: 9.94s
15:	learn: 0.6838970	total: 161ms	remaining: 9.9s
16:	learn: 0.6833429	total: 171ms	remaining: 9.88s
17:	learn: 0.6827973	total: 181ms	remaining: 9.85s
18:	learn: 0.6822586

VBox(children=(Label(value=' 0.22MB of 0.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,8.71714
train_auc,0.74756
_runtime,19
_timestamp,1630870724
_step,2
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,1000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁█
_timestamp,▁▁█
_step,▁▅█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


# K-fold Cross-validation

In [15]:
# ACTUALLY probably better to save those as pickles or .npy files; I'll generate them later, regardless
# results = {} # for storing k-fold models' predictions

In [16]:
kfold = KFold(n_splits=config['k_folds'], shuffle=True, random_state=config['random_state'])

In [18]:
models = {}

In [19]:
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
(model_path).mkdir(exist_ok=True)

In [20]:
for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#     if fold == 0:
#         continue
#     else:
    print(f"FOLD {fold}")
    print("-----------------------------------------")
    X_train, X_valid = X_scaled[train_ids], X_scaled[valid_ids] # requires X to be a numpy.ndarray
    y_train, y_valid = y[train_ids], y[valid_ids]
    model = train(X_train, X_valid, y_train, y_valid, config)
    wandb.log({'fold': fold})
    models[fold] = model
    dump(model, Path(model_path/f"xgboost_fold{fold}_model.joblib"))
    wandb.finish()



FOLD 0
-----------------------------------------


[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




MSE is 61.33193748555472
RMSE is 7.83147096563313


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
mse,61.33194
rmse,7.83147
_runtime,1591.0
_timestamp,1630429873.0
_step,401.0
fold,0.0


0,1
mse,▁
rmse,▁
_runtime,▁▁
_timestamp,▁▁
_step,▁█
fold,▁


FOLD 1
-----------------------------------------


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




MSE is 62.24235031226011
RMSE is 7.889382124872651


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
mse,62.24235
rmse,7.88938
_runtime,1539.0
_timestamp,1630431417.0
_step,401.0
fold,1.0


0,1
mse,▁
rmse,▁
_runtime,▁▁
_timestamp,▁▁
_step,▁█
fold,▁


FOLD 2
-----------------------------------------


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




MSE is 61.81231376886642
RMSE is 7.862080753138218


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
mse,61.81231
rmse,7.86208
_runtime,1551.0
_timestamp,1630432972.0
_step,401.0
fold,2.0


0,1
mse,▁
rmse,▁
_runtime,▁▁
_timestamp,▁▁
_step,▁█
fold,▁


FOLD 3
-----------------------------------------


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




MSE is 61.666720656537805
RMSE is 7.852816097206008


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
mse,61.66672
rmse,7.85282
_runtime,1548.0
_timestamp,1630434524.0
_step,401.0
fold,3.0


0,1
mse,▁
rmse,▁
_runtime,▁▁
_timestamp,▁▁
_step,▁█
fold,▁


FOLD 4
-----------------------------------------


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




MSE is 61.61926325055153
RMSE is 7.849793834907484


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
mse,61.61926
rmse,7.84979
_runtime,1533.0
_timestamp,1630436061.0
_step,401.0
fold,4.0


0,1
mse,▁
rmse,▁
_runtime,▁▁
_timestamp,▁▁
_step,▁█
fold,▁


In [21]:

#     dump(preds, f"./preds/{config_rn['name']}/xgboost_fold{fold}_preds.joblib")

# Inference

In [22]:
test_df = pd.read_csv(datapath/'test.csv', index_col='id', low_memory=False)

In [23]:
test_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
250000,0.812665,15,-1.23912,-0.893251,295.577,15.8712,23.0436,0.942256,29.898,1.11394,...,0.446389,-422.332,-1.4463,1.69075,1.0593,-3.01057,1.94664,0.52947,1.38695,8.78767
250001,0.190344,131,-0.501361,0.801921,64.8866,3.09703,344.805,0.807194,38.4219,1.09695,...,0.377179,10352.2,21.0627,1.84351,0.251895,4.44057,1.90309,0.248534,0.863881,11.7939
250002,0.919671,19,-0.057382,0.901419,11961.2,16.3965,273.24,-0.0033,37.94,1.15222,...,0.99014,3224.02,-2.25287,1.551,-0.559157,17.8386,1.83385,0.931796,2.33687,9.054
250003,0.860985,19,-0.549509,0.471799,7501.6,2.80698,71.0817,0.792136,0.395235,1.20157,...,1.39688,9689.76,14.7715,1.4139,0.329272,0.802437,2.23251,0.893348,1.35947,4.84833
250004,0.313229,89,0.588509,0.167705,2931.26,4.34986,1.57187,1.1183,7.75463,1.16807,...,0.862502,2693.35,44.1805,1.5802,-0.191021,26.253,2.68238,0.361923,1.5328,3.7066


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [24]:
features = [x for x in test_df.columns if x != 'loss']
X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [25]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [26]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [27]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [28]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [29]:
# X_test_final = X_test_final[features[1:]]
X_test_final = X_test

# Scaling
Now, going to scale using `MaxAbsScaler`

In [30]:
scaler = config['scaler']()
X_test_scaled = scaler.fit_transform(X_test_final)
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [31]:
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# # scaling (i.e. normalizing)
# scaler = config['scaler']()
# X_train_s = scaler.fit_transform(X_train)
# X_test_s = scaler.fit_transform(X_test)

# # selecting features
# selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
#                                       k=config['k_best'])
# X_train_fs = selector.fit_transform(X_train_s, y_train)
# X_test_fs = X_test_s[:, selector.get_support()]

# model = XGBRegressor(
#     tree_method=config['tree_method'],
#     booster=config['booster'],
#     n_estimators=config['n_estimators'], 
#     max_depth=config['max_depth'],
#     learning_rate=config['learning_rate'], 
#     test_size=config['test_size'],
#     subsample=config['subsample'],
#     random_state=config['random_state'],
#     n_jobs=config['n_jobs'], 
#     verbosity=config['verbosity'], 
# )
# #     wandb.log({'params': model.get_params()}) # logging model parameters
# model.fit(X_train_fs, y_train)#, callbacks=[wandb.xgboost.wandb_callback()])

In [32]:
models

{0: XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1522, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='auto', validate_parameters=1, verbosity=1),
 1: XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1522, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_s

Now, iterate over the dict containing the models trained on the 5 folds, and store the predictions in a new dict `preds`
**OR**
load from a directory.

In [33]:
# loaded_models = {}
# saved_models_path = Path('/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/models/inference_ensemble_20210828_204126_5folds/')
# for fold in range(5):
#     loaded_models[fold] = load(filename=Path(saved_models_path/f'xgboost_fold{fold}_model.joblib'))

In [34]:
# models = loaded_models

In [35]:
preds = {}
for fold in models.keys():
    preds[fold] = models[fold].predict(X_test_scaled)



In [36]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [37]:
type(preds[0])

numpy.ndarray

In [38]:
final_preds = (preds[0] + preds[1] + preds[2] + preds[3] + preds[4]) / 5

In [39]:
final_preds[:10]

array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
       9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32)

In [40]:
sample_df.loc[:, 'loss'] = final_preds

In [41]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.00794
1,250001,4.532623
2,250002,7.85575
3,250003,7.124434
4,250004,7.444796


In [42]:
sample_df.to_csv('XGBoost_ensemble_20210831_no_feature_gen.csv', index=False)

# Ensembling with CatBoost

In [46]:
catboost_models = {}
saved_models_path = Path('/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/models/CatBoost_ensemble_20210831_144245_5folds/')
for fold in range(5):
    catboost_models[fold] = load(filename=Path(saved_models_path/f'catboost_fold{fold}_model.joblib'))

In [47]:
catboost_models

{0: <catboost.core.CatBoostRegressor at 0x7f1b154ecfa0>,
 1: <catboost.core.CatBoostRegressor at 0x7f1b1548a880>,
 2: <catboost.core.CatBoostRegressor at 0x7f1b154ec0a0>,
 3: <catboost.core.CatBoostRegressor at 0x7f1b1548ac40>,
 4: <catboost.core.CatBoostRegressor at 0x7f1b154ecdf0>}

In [48]:
catboost_preds = {}
for fold in catboost_models.keys():
    catboost_preds[fold] = catboost_models[fold].predict(X_test_scaled)

In [50]:
final_catboost_preds = (catboost_preds[0] + catboost_preds[1] + catboost_preds[2] + catboost_preds[3] + catboost_preds[4]) / 5

In [51]:
ensemble_preds = 0.6 * final_catboost_preds + 0.4 * final_preds

In [54]:
ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.40583658, 4.58774964, 8.32465697, 7.18375788, 7.13135284,
        9.67367649, 9.96252577, 5.89393404, 7.22270917, 7.53612671]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [58]:
final_ensemble_preds = 0.65 * final_catboost_preds + 0.35 * final_preds

In [59]:
final_ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.4389943 , 4.5923435 , 8.36373235, 7.18870129, 7.10523255,
        9.67708804, 9.96419843, 5.87235472, 7.23656006, 7.52442615]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [60]:
sample_df.loc[:, 'loss'] = final_ensemble_preds

In [61]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.438994
1,250001,4.592343
2,250002,8.363732
3,250003,7.188701
4,250004,7.105233


In [62]:
sample_df.to_csv('XGBoost0.35-Catboost0.65_ensemble_20210831_no_feature_gen.csv', index=False)

# Experiment - fitting model on full training set

In [36]:
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# scaling (i.e. normalizing)
scaler = config['scaler']()
X_s = scaler.fit_transform(X)
X_test_s = scaler.fit_transform(X_test)

# selecting features
selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
                                      k=config['k_best'])
X_fs = selector.fit_transform(X_s, y)
X_test_fs = X_test_s[:, selector.get_support()]

model = XGBRegressor(
    tree_method=config['tree_method'],
    booster=config['booster'],
    n_estimators=config['n_estimators'], 
    max_depth=config['max_depth'],
    learning_rate=config['learning_rate'], 
    test_size=config['test_size'],
    subsample=config['subsample'],
    random_state=config['random_state'],
    n_jobs=config['n_jobs'], 
    verbosity=config['verbosity'], 
)
#     wandb.log({'params': model.get_params()}) # logging model parameters
model.fit(X_fs, y)#, callbacks=[wandb.xgboost.wandb_callback()])

Parameters: { "test_size" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1522, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             test_size=0.2, tree_method='auto', validate_parameters=1,
             verbosity=1)

In [37]:
y_test_preds = model.predict(X_test_fs)



In [38]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [39]:
sample_df.loc[:, 'loss'] = y_test_preds

In [40]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.027956
1,250001,4.305676
2,250002,7.300106
3,250003,6.988875
4,250004,7.316631


In [41]:
sample_df.to_csv('202108241211_XGBoost_fullset.csv', index=False)