# Experiment
I want to see how a model fit on the full dataset performs relative to models fit on k-folds / their composition

# Setup

In [1]:
# two manual flags (ex-config)
colab = False
gpu_available = True
libraries = ['catboost']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"experimental_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if gpu_available: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if gpu_available:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import KNNImputer, SimpleImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    
else:
    # if on local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    


## Ex-Model Config

In [8]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
    # model config
#     "model": XGBClassifier,
#     "n_estimators": 100, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "test_size": 0.2,
#     "reg_lambda": None, 
    "scaler": StandardScaler, # TODO: experiment with others (but imputation may be slow)
    "scale_b4_impute": False,
    "imputer": SimpleImputer(strategy='median', add_indicator=True),
    "knn_imputer_n_neighbors": None, # None if a different imputer is used
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
#     'subsample': 1,
    'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [9]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [10]:
# # here's how to load the original, unaltered dataset and separate features from targets
# df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
# features = [x for x in df.columns if x != 'claim']
# X = df[features]
# y = df.claim



# load the version of the dataset with imputations; X and y were stored separately, as feather and joblib respectively
X = pd.read_feather(datapath/'X_NaNcounts_imputed-Median-wIndicators-StandardScaled.feather') 
y = load(datapath/'y.joblib')    
X.index.name = 'id'
y.index.name = 'id'

exmodel_config['feature_count'] = len(X.columns)
exmodel_config['feature_generator'] = None

In [11]:
# scaler = exmodel_config['scaler']()
# X_scaled = scaler.fit_transform(X)
# X = pd.DataFrame(X_scaled, columns=X.columns)

In [12]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [13]:
y.head()

id
0    1
1    0
2    1
3    1
4    1
Name: claim, dtype: int64

### Model Config

In [14]:
def model_configurator(library, gpu_available=True):#, config=universal_config):
    """
    Function that provide task-specific or general preference arguments for the various models. 
    
    At first, will rely largely on defaults for hyperparameters, but later this function 
    can be supplemented later with optimal values, as they're learned in sweeps.
    .
    
    Rationale: creating a helper function will allow more experimentation later, and also
    composite runs that cycle through a series of models.
    
    :param model: A model from [XGBClassifier, LGBMClassifier, CatBoostClassifier]
    :return config: A dict that supplements default hyperparameter values with 1) 
                    task-appropriate ones, and perhaps later 2) optimal hyperparameter values.
    """
    config = {}
    
    # library-specific config
    if library in ['xgboost', 'lightgbm']:
        config['n_jobs'] = -1
        
    # best params per sweep `icac24c5`, generated from notebook `sweep_20210905.ipynb`
    # runtime per fold should be around 12m 38s
    # should get auc of 0.7434 on the random_state=42 holdout
    # haven't yet tried dart
    if library == 'xgboost':
#         config['tree_method'] = 'auto'
#         config['booster'] = 'gbtree' # or 'dart'
#         config['model'] = XGBClassifier
        config['verbosity'] = 1
        config['objective'] = 'binary:logistic'
#         config['eval_metric'] = ['auc', 'logloss', 'aucpr'],
        config['tree_method'] = 'gpu_hist' if (gpu_available and colab) else 'auto' 
        
        # comment out the below to get defaults
        config['n_estimators'] = 902
        config['learning_rate'] = 0.0304
        config['max_depth'] = 3
        config['reg_alpha'] = 0.863
        config['reg_lambda'] = 2.442
        config['subsample'] = 0.8627

    # best params per sweep `sjghewf0`, generated from notebook `sweep_lightgbm_20210907`
    # run name `sweep_lightgbm_20210907_195641`
    # runtime per fold should be around 39s
    # should get an auc of 0.7435 on random_state=42 holdout
    if library == 'lightgbm':
#         config['model'] = LGBMClassifier
        config['objective'] = 'binary'
        config['eval_metric'] = ['auc', 'logloss']
        config['boosting_type'] = 'gbdt' # or 'dart'
        config['device_type'] = 'cuda' if (gpu_available and colab) else 'cpu' # 'gpu' also possible, 'cpu' is default
        
        # comment out the below for defaults
        config['n_estimators'] = 1286
        config['learning_rate'] = 0.03221
        config['max_depth'] = 2
        config['reg_alpha'] = 0.4687
        config['reg_lambda'] = 0.1763
        config['subsample'] = 0.6621
        

#     if config['model'] == CatBoostClassifier:
    if library == 'catboost':
#         config['model'] = CatBoostClassifier
        config['task_type'] = 'GPU' if gpu_available else 'CPU'
        config['custom_metrics'] = ['Logloss', 'AUC'] # objective (loss fn) must be singular, defaults to Logloss
        config['n_estimators'] = 2000 # logged as "iterations" otherwise

    return config

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
# wandb config:
config_run = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['experiment'],
    'notes': "I want to see how a model fit on the full dataset performs relative to models fit on k-folds / their composition",
}

# Preprocessing
Scaling has already occurred -- used `StandardScaler` as a precursor to using `KNNImputer(n_neighbors=5)`, on the premise that imputation would proceed more quickly if things were already scaled. I may try different permutations of this later: using `IterativeImputer` instead, before or after scaling, potentially with different scalers. 

# Feature Creation and Selection

In [16]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [17]:
# X.columns

In [18]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [19]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [20]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [21]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [22]:
# X = X[features[1:]]

# Training

In [23]:
def train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param random_staKFold: for reproducibility
    :param exmodel_config: dict containing configuration details including the library 
                            (thus model) used, preprocessing, and cross-validation
    :param model_config: dict containing hyperparameter specifications for the model
    :param config_run: dict containing wandb run configuration (name, etc)
    """
    
    
    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   
        
    if exmodel_config['library'] == 'xgboost':
        model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
            # #             eval_metric=model_config['eval_metric'],

            # comment out the below for a fairly default model
#             booster=model_config['booster'],
            max_depth=model_config['max_depth'],
            learning_rate=model_config['learning_rate'], 
            subsample=model_config['subsample'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            n_estimators=model_config['n_estimators'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )


    elif exmodel_config['library'] == 'lightgbm':
        model = LGBMClassifier(
#             boosting_type=model_config['boosting_type'],
#             max_depth=model_config['max_depth']
            # TODO
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            
            # comment out the below for a basically default model
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )
        
    elif exmodel_config['library'] == 'catboost':
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
#             n_estimators=config['n_estimators'],
#             learning_rate=config['learning_rate'],
#             max_depth=config['max_depth'],
            task_type=model_config['task_type'],
    #         n_jobs=config['n_jobs'],
    #         verbosity=config['verbosity'],
    #         subsample=config['subsample'],
            n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
#             custom_metrics=model_config['custom_metrics'],
    #         bootstrap_type=config['bootstrap_type'],
    #         device:config['device']
        ) 
        model.fit(X_train, y_train)
    y_train_pred = model.predict_proba(X_train)[:,1]
    train_loss = log_loss(y_train, y_train_pred)
    train_auc = roc_auc_score(y_train, y_train_pred)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    if exmodel_config['library'] == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
    y_pred = model.predict_proba(X_valid)[:,1]
#     mse = mean_squared_error(y_valid, y_pred)
#     rmse = math.sqrt(abs(mse))
    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#     wandb.finish()   
    return model
    

In [24]:
def cross_validation(model_config, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    if exmodel_config['kfolds'] == 1:
        print("Proceeding with holdout")
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=exmodel_config['test_size'], 
                                                      random_state=random_state,
                                                     )
        model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
        wandb.finish()
        
    else:
        X, y = X.to_numpy(), y.to_numpy()
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
        models = {}
        model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
        (model_path).mkdir(exist_ok=True)
        for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
            if fold < start_fold:
                continue
            else:
                print(f"FOLD {fold}")
                print("---------------------------------------------------")
                X_train, X_valid = X[train_ids], X[valid_ids]
                y_train, y_valid = y[train_ids], y[valid_ids]
                model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
                wandb.log({'fold': fold})
                models[fold] = model
                dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))
                wandb.finish()
        return models
        

# Interface

## Runs

In [25]:
# library = 'xgboost'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# xgboost_models = cross_validation(model_config)

In [26]:
# for scaler in [StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler]:
#     exmodel_config['scaler'] = scaler
#     scaler = scaler()
#     X_scaled = scaler.fit_transform(X)
#     X = pd.DataFrame(X_scaled, columns=X.columns)
#     exmodel_config['library'] = 'lightgbm'
#     model_config = model_configurator('lightgbm')
#     cross_validation(model_config)

In [27]:
# library = 'xgboost'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# xgboost_models = cross_validation(model_config)

# library = 'lightgbm'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# lightgbm_models = cross_validation(model_config)

In [28]:
library = 'catboost'
gpu_available = True
exmodel_config['library'] = library
model_config = model_configurator(library)
catboost_models = cross_validation(model_config)

exmodel_config['kfolds'] = 1
exmodel_config['cross_val_strategy'] = None


catboost_fulldataset_model = train(X_train=X, X_valid=X, y_train=y, y_valid=y, model_config=model_config)


FOLD 0
---------------------------------------------------


[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6881616	total: 12.5ms	remaining: 25.1s
1:	learn: 0.6833115	total: 30ms	remaining: 30s
2:	learn: 0.6785920	total: 81.6ms	remaining: 54.3s
3:	learn: 0.6740076	total: 111ms	remaining: 55.6s
4:	learn: 0.6695552	total: 132ms	remaining: 52.7s
5:	learn: 0.6652218	total: 147ms	remaining: 49s
6:	learn: 0.6610043	total: 163ms	remaining: 46.5s
7:	learn: 0.6569107	total: 179ms	remaining: 44.5s
8:	learn: 0.6529261	total: 196ms	remaining: 43.5s
9:	learn: 0.6490535	total: 214ms	remaining: 42.6s
10:	learn: 0.6452922	total: 231ms	remaining: 41.9s
11:	learn: 0.6416302	total: 254ms	remaining: 42.1s
12:	learn: 0.6380721	total: 270ms	remaining: 41.3s
13:	learn: 0.6346165	total: 288ms	remaining: 40.8s
14:	learn: 0.6312586	total: 324ms	remaining: 42.9s
15:	learn: 0.6279916	total: 363ms	remaining: 45s
16:	learn: 0.6248175	total: 390ms	remaining: 45.5s
17:	learn: 0.6217344	total: 417ms	remaining: 45.9s
18:	learn: 0.6187336	total: 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50549
train_auc,0.82184
_runtime,149
_timestamp,1631324365
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 1
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6881817	total: 63.2ms	remaining: 2m 6s
1:	learn: 0.6833509	total: 102ms	remaining: 1m 41s
2:	learn: 0.6786523	total: 157ms	remaining: 1m 44s
3:	learn: 0.6740871	total: 181ms	remaining: 1m 30s
4:	learn: 0.6696543	total: 202ms	remaining: 1m 20s
5:	learn: 0.6653416	total: 223ms	remaining: 1m 13s
6:	learn: 0.6611440	total: 249ms	remaining: 1m 10s
7:	learn: 0.6570663	total: 271ms	remaining: 1m 7s
8:	learn: 0.6530970	total: 294ms	remaining: 1m 5s
9:	learn: 0.6492383	total: 315ms	remaining: 1m 2s
10:	learn: 0.6454920	total: 336ms	remaining: 1m
11:	learn: 0.6418457	total: 360ms	remaining: 59.7s
12:	learn: 0.6382987	total: 378ms	remaining: 57.8s
13:	learn: 0.6348579	total: 393ms	remaining: 55.8s
14:	learn: 0.6315131	total: 414ms	remaining: 54.7s
15:	learn: 0.6282582	total: 433ms	remaining: 53.7s
16:	learn: 0.6250966	total: 446ms	remaining: 52s
17:	learn: 0.6220252	total: 460ms	remaining: 50.7s
18:	learn: 0.6190353	

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50615
train_auc,0.82136
_runtime,156
_timestamp,1631324528
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 2
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6881698	total: 40.8ms	remaining: 1m 21s
1:	learn: 0.6833273	total: 88.6ms	remaining: 1m 28s
2:	learn: 0.6786166	total: 110ms	remaining: 1m 13s
3:	learn: 0.6740414	total: 128ms	remaining: 1m 3s
4:	learn: 0.6695973	total: 144ms	remaining: 57.4s
5:	learn: 0.6652721	total: 167ms	remaining: 55.3s
6:	learn: 0.6610626	total: 188ms	remaining: 53.6s
7:	learn: 0.6569758	total: 205ms	remaining: 51.1s
8:	learn: 0.6529995	total: 221ms	remaining: 49s
9:	learn: 0.6491365	total: 249ms	remaining: 49.5s
10:	learn: 0.6453828	total: 263ms	remaining: 47.6s
11:	learn: 0.6417263	total: 284ms	remaining: 47s
12:	learn: 0.6381748	total: 301ms	remaining: 45.9s
13:	learn: 0.6347256	total: 320ms	remaining: 45.5s
14:	learn: 0.6313734	total: 338ms	remaining: 44.8s
15:	learn: 0.6281127	total: 357ms	remaining: 44.3s
16:	learn: 0.6249438	total: 370ms	remaining: 43.1s
17:	learn: 0.6218664	total: 383ms	remaining: 42.2s
18:	learn: 0.6188695	t

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50582
train_auc,0.82175
_runtime,150
_timestamp,1631324686
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 3
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6881803	total: 14.8ms	remaining: 29.5s
1:	learn: 0.6833473	total: 29.7ms	remaining: 29.7s
2:	learn: 0.6786455	total: 53.2ms	remaining: 35.4s
3:	learn: 0.6740773	total: 128ms	remaining: 1m 3s
4:	learn: 0.6696421	total: 152ms	remaining: 1m
5:	learn: 0.6653255	total: 168ms	remaining: 55.9s
6:	learn: 0.6611226	total: 190ms	remaining: 54.2s
7:	learn: 0.6570439	total: 207ms	remaining: 51.6s
8:	learn: 0.6530731	total: 228ms	remaining: 50.4s
9:	learn: 0.6492125	total: 249ms	remaining: 49.6s
10:	learn: 0.6454650	total: 267ms	remaining: 48.3s
11:	learn: 0.6418185	total: 283ms	remaining: 46.9s
12:	learn: 0.6382722	total: 301ms	remaining: 45.9s
13:	learn: 0.6348289	total: 315ms	remaining: 44.7s
14:	learn: 0.6314826	total: 329ms	remaining: 43.5s
15:	learn: 0.6282271	total: 345ms	remaining: 42.7s
16:	learn: 0.6250649	total: 362ms	remaining: 42.2s
17:	learn: 0.6219925	total: 375ms	remaining: 41.3s
18:	learn: 0.6190054	to

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50611
train_auc,0.82155
_runtime,149
_timestamp,1631324840
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


FOLD 4
---------------------------------------------------


CatBoost, therefore no WandB callback.
Learning rate set to 0.012572
0:	learn: 0.6881874	total: 58.8ms	remaining: 1m 57s
1:	learn: 0.6833631	total: 113ms	remaining: 1m 53s
2:	learn: 0.6786679	total: 136ms	remaining: 1m 30s
3:	learn: 0.6741080	total: 203ms	remaining: 1m 41s
4:	learn: 0.6696791	total: 222ms	remaining: 1m 28s
5:	learn: 0.6653677	total: 241ms	remaining: 1m 20s
6:	learn: 0.6611738	total: 261ms	remaining: 1m 14s
7:	learn: 0.6571024	total: 281ms	remaining: 1m 9s
8:	learn: 0.6531391	total: 296ms	remaining: 1m 5s
9:	learn: 0.6492860	total: 314ms	remaining: 1m 2s
10:	learn: 0.6455441	total: 330ms	remaining: 59.6s
11:	learn: 0.6419017	total: 344ms	remaining: 56.9s
12:	learn: 0.6383626	total: 359ms	remaining: 54.9s
13:	learn: 0.6349256	total: 376ms	remaining: 53.4s
14:	learn: 0.6315842	total: 392ms	remaining: 51.8s
15:	learn: 0.6283345	total: 418ms	remaining: 51.8s
16:	learn: 0.6251767	total: 430ms	remaining: 50.2s
17:	learn: 0.6221084	total: 451ms	remaining: 49.7s
18:	learn: 0.61

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50626
train_auc,0.82132
_runtime,152
_timestamp,1631324997
_step,3
nan_mode,Min
gpu_ram_part,0.95
eval_metric,Logloss
iterations,2000
leaf_estimation_method,Newton


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
gpu_ram_part,▁
iterations,▁
bayesian_matrix_reg,▁
l2_leaf_reg,▁
random_strength,▁


CatBoost, therefore no WandB callback.
Learning rate set to 0.012419
0:	learn: 0.6882358	total: 54.5ms	remaining: 1m 48s
1:	learn: 0.6834559	total: 87ms	remaining: 1m 26s
2:	learn: 0.6788047	total: 138ms	remaining: 1m 32s
3:	learn: 0.6742848	total: 204ms	remaining: 1m 41s
4:	learn: 0.6698927	total: 251ms	remaining: 1m 39s
5:	learn: 0.6656181	total: 269ms	remaining: 1m 29s
6:	learn: 0.6614562	total: 342ms	remaining: 1m 37s
7:	learn: 0.6574149	total: 421ms	remaining: 1m 44s
8:	learn: 0.6534790	total: 462ms	remaining: 1m 42s
9:	learn: 0.6496506	total: 486ms	remaining: 1m 36s
10:	learn: 0.6459331	total: 528ms	remaining: 1m 35s
11:	learn: 0.6423135	total: 554ms	remaining: 1m 31s
12:	learn: 0.6387952	total: 568ms	remaining: 1m 26s
13:	learn: 0.6353756	total: 618ms	remaining: 1m 27s
14:	learn: 0.6320511	total: 682ms	remaining: 1m 30s
15:	learn: 0.6288163	total: 737ms	remaining: 1m 31s
16:	learn: 0.6256722	total: 765ms	remaining: 1m 29s
17:	learn: 0.6226163	total: 821ms	remaining: 1m 30s
18:	l

In [29]:
# # this loads models if you need to (or forgot to save them on training above)
# xgboost_models = {}
# xgboost_models_path = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/baseline_20210905a_152521_5folds/xgboost/')
# for fold in range(5):
#     xgboost_models[fold] = load(xgboost_models_path/f'xgboost_fold{fold}_model.joblib')

# Inference

In [41]:
test_df = pd.read_csv(datapath/'test.csv', index_col='id', low_memory=False)
# test_df.to_feather(datapath/'test.feather') # issue with index being non-default; fix later
# test_df = pd.read_feather(datapath/'test.feather')

In [42]:
test_df.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,6221.0,...,0.16253,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357
957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,934.43,...,0.81528,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125
957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,131.81,...,0.81831,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797
957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,-26.473,...,0.86559,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291
957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,5999.4,...,0.2519,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [43]:
features = [x for x in test_df.columns if x != 'claim']
X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [33]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [34]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [35]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [36]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [37]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

# Test set preprocessing


In [45]:
X_test['nan_count'] = X_test.isnull().sum(axis=1)

In [46]:
imputer = SimpleImputer(strategy='median', add_indicator=True)
X_test_imputed_np = imputer.fit_transform(X_test)

In [50]:
X_test_imputed = pd.DataFrame(X_test_imputed, columns=[str(x) for x in range(X_test_imputed.shape[1])])
X_test_imputed.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators.feather')

In [52]:
scaler = exmodel_config['scaler']()
X_test_imputed_scaled_np = scaler.fit_transform(X_test_imputed)
X_test_imputed_scaled = pd.DataFrame(X_test_imputed_scaled_np, columns=X_test_imputed.columns)
X_test_imputed_scaled.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [54]:
xgboost_preds = {}
for fold in xgboost_models.keys():
    xgboost_preds[fold] = xgboost_models[fold].predict(X_test_imputed_scaled)



In [55]:
lightgbm_preds = {}
for fold in lightgbm_models.keys():
    lightgbm_preds[fold] = lightgbm_models[fold].predict(X_test_imputed_scaled)

In [56]:
catboost_preds = {}
for fold in catboost_models.keys():
    catboost_preds[fold] = catboost_models[fold].predict(X_test_imputed_scaled)

In [57]:
preds_path = Path(datapath/f"preds/{config_run['name']}_{exmodel_config['kfolds']}folds/")
preds_path.mkdir(exist_ok=True)

for library in ['xgboost', 'lightgbm', 'catboost']:
    (preds_path/library).mkdir(exist_ok=True)

In [58]:
dump(xgboost_preds, Path(preds_path/'xgboost/xgboost_preds_dict.joblib'))

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/baseline_20210909_114059_5folds/xgboost/xgboost_preds_dict.joblib']

In [59]:
dump(lightgbm_preds, Path(preds_path/'lightgbm/lightgbm_preds_dict.joblib'))

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/baseline_20210909_114059_5folds/lightgbm/lightgbm_preds_dict.joblib']

In [60]:
dump(catboost_preds, Path(preds_path/'catboost/catboost_preds_dict.joblib'))

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/baseline_20210909_114059_5folds/catboost/catboost_preds_dict.joblib']

In [61]:
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [62]:
final_xgboost_preds = (xgboost_preds[0] + xgboost_preds[1] + xgboost_preds[2] + xgboost_preds[3] + xgboost_preds[4]) / 5
final_lightgbm_preds = (lightgbm_preds[0] + lightgbm_preds[1] + lightgbm_preds[2] + lightgbm_preds[3] + lightgbm_preds[4]) / 5
final_catboost_preds = (catboost_preds[0] + catboost_preds[1] + catboost_preds[2] + catboost_preds[3] + catboost_preds[4]) / 5

In [None]:
print(final_xgboost_preds[:10])
print(final_lightgbm_preds[:10])
print(final_catboost_preds[:10])

In [63]:
sample_df.loc[:, 'claim'] = final_xgboost_preds

In [64]:
sample_df.head()

Unnamed: 0,id,claim
0,957919,1.0
1,957920,0.0
2,957921,1.0
3,957922,0.0
4,957923,0.0


In [65]:
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [66]:
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_xgboost-mean.csv", index=False)

In [67]:
sample_df.loc[:, 'claim'] = final_lightgbm_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_lightgbm-mean.csv", index=False)

In [68]:
sample_df.loc[:, 'claim'] = final_catboost_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_catboost-mean.csv", index=False)

In [69]:
ensemble_preds = (final_xgboost_preds + final_lightgbm_preds + final_catboost_preds) / 3

In [70]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_ensemble-equal_model_and_fold_weight_mean.csv", index=False)

In [71]:
ensemble_preds = 0.4*final_xgboost_preds + 0.3*final_lightgbm_preds + 0.3*final_catboost_preds

In [72]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_ensemble-0.4xgboost_0.3lightgbm_0.3catboost-equal_fold_weight_mean.csv", index=False)

# Ensembling with CatBoost

In [46]:
catboost_models = {}
saved_models_path = Path('/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/models/CatBoost_ensemble_20210831_144245_5folds/')
for fold in range(5):
    catboost_models[fold] = load(filename=Path(saved_models_path/f'catboost_fold{fold}_model.joblib'))

In [47]:
catboost_models

{0: <catboost.core.CatBoostRegressor at 0x7f1b154ecfa0>,
 1: <catboost.core.CatBoostRegressor at 0x7f1b1548a880>,
 2: <catboost.core.CatBoostRegressor at 0x7f1b154ec0a0>,
 3: <catboost.core.CatBoostRegressor at 0x7f1b1548ac40>,
 4: <catboost.core.CatBoostRegressor at 0x7f1b154ecdf0>}

In [48]:
catboost_preds = {}
for fold in catboost_models.keys():
    catboost_preds[fold] = catboost_models[fold].predict(X_test_scaled)

In [50]:
final_catboost_preds = (catboost_preds[0] + catboost_preds[1] + catboost_preds[2] + catboost_preds[3] + catboost_preds[4]) / 5

In [51]:
ensemble_preds = 0.6 * final_catboost_preds + 0.4 * final_preds

In [54]:
ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.40583658, 4.58774964, 8.32465697, 7.18375788, 7.13135284,
        9.67367649, 9.96252577, 5.89393404, 7.22270917, 7.53612671]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [58]:
final_ensemble_preds = 0.65 * final_catboost_preds + 0.35 * final_preds

In [59]:
final_ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.4389943 , 4.5923435 , 8.36373235, 7.18870129, 7.10523255,
        9.67708804, 9.96419843, 5.87235472, 7.23656006, 7.52442615]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [60]:
sample_df.loc[:, 'loss'] = final_ensemble_preds

In [61]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.438994
1,250001,4.592343
2,250002,8.363732
3,250003,7.188701
4,250004,7.105233


In [62]:
sample_df.to_csv('XGBoost0.35-Catboost0.65_ensemble_20210831_no_feature_gen.csv', index=False)

# Experiment - fitting model on full training set

In [36]:
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# scaling (i.e. normalizing)
scaler = config['scaler']()
X_s = scaler.fit_transform(X)
X_test_s = scaler.fit_transform(X_test)

# selecting features
selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
                                      k=config['k_best'])
X_fs = selector.fit_transform(X_s, y)
X_test_fs = X_test_s[:, selector.get_support()]

model = XGBRegressor(
    tree_method=config['tree_method'],
    booster=config['booster'],
    n_estimators=config['n_estimators'], 
    max_depth=config['max_depth'],
    learning_rate=config['learning_rate'], 
    test_size=config['test_size'],
    subsample=config['subsample'],
    random_state=config['random_state'],
    n_jobs=config['n_jobs'], 
    verbosity=config['verbosity'], 
)
#     wandb.log({'params': model.get_params()}) # logging model parameters
model.fit(X_fs, y)#, callbacks=[wandb.xgboost.wandb_callback()])

Parameters: { "test_size" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1522, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             test_size=0.2, tree_method='auto', validate_parameters=1,
             verbosity=1)

In [37]:
y_test_preds = model.predict(X_test_fs)



In [38]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [39]:
sample_df.loc[:, 'loss'] = y_test_preds

In [40]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.027956
1,250001,4.305676
2,250002,7.300106
3,250003,6.988875
4,250004,7.316631


In [41]:
sample_df.to_csv('202108241211_XGBoost_fullset.csv', index=False)