# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# two manual flags (ex-config)
# `colab` gates the Colab-only install cell and the Google Drive mount below;
# it is also read inside `model_configurator` when picking GPU settings.
colab = False
# Gates the GPU builds in the Colab install cell. NOTE: `model_configurator` has its own
# `gpu_available` parameter (default True) and does not read this global unless it is passed in.
gpu_available = False
# libraries the Colab install cell should set up
libraries = ['xgboost', 'lightgbm', 'catboost']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# turn off the jedi-based tab completer
%config Completer.use_jedi = False
# Date-stamped notebook name exposed to Weights & Biases through the environment;
# its stem is reused further down as the prefix of each run name (see `config_run`).
os.environ['WANDB_NOTEBOOK_NAME'] = f"ensemble_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
# Everything below is skipped unless the `colab` flag is True. The installs are unpinned,
# so the versions obtained depend on when the cell is run.
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    # category_encoders (not imported anywhere in the visible part of this notebook)
    !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if gpu_available: 
            # Build XGBoost from source with CUDA enabled.
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            # NOTE: `%cd` changes the notebook's working directory, and nothing changes it back;
            # after this branch the working directory is /content/xgboost/python-package.
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            # print GPU status as a sanity check
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if gpu_available:
            # GPU-enabled LightGBM build from source
            # NOTE(review): the clone lands in the current working directory, which is the XGBoost
            # python-package directory if the GPU XGBoost branch above ran first -- confirm this is intended.
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
# Resolve `datapath` (the directory holding the competition data) for the current environment.
if not colab:
    # running on the local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')
else:
    # running on Google Colab: mount Google Drive first, then point at the competition folder
    from google.colab import drive
    drive.mount('/content/drive')
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    


## Ex-Model Config

In [8]:
# Meta-configuration for preprocessing and cross-validation only.
# Model hyperparameters are NOT kept here; they come from `model_configurator`.
# This dict is also what gets passed to `wandb.init(config=...)` inside `train`.
exmodel_config = dict(
    # preprocessing
    scaler=MinMaxScaler,           # scaler class; TODO: experiment with others (but imputation may be slow)
    scale_b4_impute=None,
    imputer=None,
    knn_imputer_n_neighbors=None,  # None if a different imputer is used
    # reproducibility
    random_state=42,
    # cross-validation
    cross_val_strategy=KFold,      # None for holdout, or the relevant sklearn class
    kfolds=5,                      # if 1, that means just doing holdout
    test_size=0.2,                 # validation fraction used by the holdout split
)

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [9]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [10]:
# Load the original, unaltered training set and split it into features and target.
df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
features = [column for column in df.columns if column != 'claim']  # every column except the target
X, y = df[features], df.claim

# record dataset facts alongside the rest of the ex-model configuration
exmodel_config.update(feature_count=len(features), feature_generator=None)

# Alternative: load the version of the dataset with imputations; X and y were stored separately,
# as feather and joblib respectively
# X = pd.read_feather(datapath/'X_StandardScaled_KNNImputed_5NN.feather') 
# y = load(datapath/'y.joblib')    
# X.index.name = 'id'
# y.index.name = 'id'

In [11]:
# Instantiate the scaler class chosen in `exmodel_config` and fit it on the full feature set
# (this happens before any train/validation split is made).
scaler = exmodel_config['scaler']()
X_scaled = scaler.fit_transform(X)
# Re-wrap the scaled values as a DataFrame with the original column names. No index is passed,
# so the result gets a default integer index. NOTE: this rebinds `X` to the scaled frame.
X = pd.DataFrame(X_scaled, columns=X.columns)

In [12]:
X.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,0.457457,0.043413,0.191647,0.070959,0.527183,0.105864,0.063687,0.102258,0.134102,0.236467,...,0.131467,0.871211,0.343019,0.120557,0.261928,0.142026,0.310088,0.029364,0.040935,0.403801
1,0.443849,0.592256,0.433849,0.255789,0.830794,0.498215,0.344463,0.077,0.434246,0.324546,...,0.874035,0.456876,0.907318,0.114198,0.413351,0.136327,0.347585,0.049469,0.167787,0.150209
2,0.580343,0.022422,0.210943,0.252679,0.830402,0.540336,0.446403,0.200918,0.100675,0.376975,...,0.208734,0.931311,0.216179,0.113851,0.467008,0.20903,0.27626,0.027749,0.313531,0.192805
3,0.534915,0.048887,0.208346,0.076533,0.937195,0.845347,0.059686,0.148869,0.106339,0.235366,...,0.026189,0.660647,0.417911,0.116023,0.137909,0.14297,0.278298,0.026798,0.390996,0.230099
4,0.470978,0.970095,0.190185,0.27106,0.626428,0.516054,0.489029,0.04897,0.264354,0.236982,...,0.831717,0.858063,0.292096,0.117449,0.351838,0.141167,0.588661,0.026758,,0.131866


In [13]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: claim, dtype: int64

### Model Config

In [14]:
def model_configurator(library, gpu_available=True):
    """
    Build the dict of model settings for one of the supported boosting libraries.

    Hyperparameter values for XGBoost and LightGBM come from earlier sweeps (see the
    comments on each section); CatBoost still relies mostly on library defaults. Keeping
    this in a helper allows later experimentation and composite runs that cycle through
    a series of models.

    :param library: one of 'xgboost', 'lightgbm' or 'catboost'; any other value yields
                    an empty dict
    :param gpu_available: whether a GPU may be used. For XGBoost and LightGBM the GPU
                    setting is only chosen when the module-level `colab` flag is also
                    true; for CatBoost this flag alone decides. Note the default is True
                    and is independent of the notebook-level `gpu_available` variable.
    :return config: a dict of settings that `train` reads by key when constructing the model
    """
    config = {}

    # settings shared by the two libraries that accept `n_jobs`
    if library in ['xgboost', 'lightgbm']:
        config['n_jobs'] = -1

    # best params per sweep `icac24c5`, generated from notebook `sweep_20210905.ipynb`
    # runtime per fold should be around 12m 38s
    # should get auc of 0.7434 on the random_state=42 holdout
    # haven't yet tried dart
    if library == 'xgboost':
        # assigned once; the original set 'auto' first and then overwrote it with this expression
        config['tree_method'] = 'gpu_hist' if (gpu_available and colab) else 'auto'
        config['verbosity'] = 1
        config['objective'] = 'binary:logistic'
        config['n_estimators'] = 902
        config['learning_rate'] = 0.0304
        config['max_depth'] = 3
        config['reg_alpha'] = 0.863
        config['reg_lambda'] = 2.442
        config['subsample'] = 0.8627

    # best params per sweep `sjghewf0`, generated from notebook `sweep_lightgbm_20210907`
    # run name `sweep_lightgbm_20210907_195641`
    # runtime per fold should be around 39s
    # should get an auc of 0.7435 on random_state=42 holdout
    if library == 'lightgbm':
        config['objective'] = 'binary'
        config['eval_metric'] = ['auc', 'logloss']
        config['boosting_type'] = 'gbdt' # or 'dart'
        config['device_type'] = 'cuda' if (gpu_available and colab) else 'cpu' # 'gpu' also possible, 'cpu' is default
        config['n_estimators'] = 1286
        config['learning_rate'] = 0.03221
        config['max_depth'] = 2
        config['reg_alpha'] = 0.4687
        config['reg_lambda'] = 0.1763
        config['subsample'] = 0.6621

    if library == 'catboost':
        config['task_type'] = 'GPU' if gpu_available else 'CPU'
        # objective (loss fn) must be singular, defaults to Logloss
        # NOTE(review): `train` does not currently pass this key to CatBoost, and CatBoost's own
        # keyword appears to be `custom_metric` (singular) -- confirm before wiring it in.
        config['custom_metrics'] = ['Logloss', 'AUC']
        config['n_estimators'] = 2000 # logged as "iterations" otherwise

    return config

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
# Weights & Biases run metadata, read by `train` when it calls `wandb.init`.
config_run = dict(
    # notebook filename stem (the slice drops the 6-character ".ipynb" extension) plus a
    # time-of-day stamp taken when this cell runs
    name="{}_{}".format(os.environ['WANDB_NOTEBOOK_NAME'][:-6], datetime.now().strftime('%H%M%S')),
    tags=['baseline'],
    notes="Initial runs of each model-type, with sane defaults.",
)

# Preprocessing
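For the pre-imputed variant of the dataset (`X_StandardScaled_KNNImputed_5NN.feather`, whose load is currently commented out in the Data Setup section), scaling had already occurred -- `StandardScaler` was used as a precursor to `KNNImputer(n_neighbors=5)`, on the premise that imputation would proceed more quickly if things were already scaled. The current run instead loads the unaltered dataset and scales it with the scaler set in `exmodel_config['scaler']`, with no imputation step. I may try different permutations of this later: using `IterativeImputer` instead, before or after scaling, potentially with different scalers.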

# Feature Creation and Selection

In [16]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [17]:
# X.columns

In [18]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [19]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [20]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [21]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [22]:
# X = X[features[1:]]

# Training

In [27]:
def train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):#, scaler): # passed in via config dict for now
    """
    Fit one model on a train/validation split and log the run to Weights & Biases.

    A wandb run is started here but NOT finished; the caller is expected to call
    `wandb.finish()` (as `cross_validation` does).

    Log-loss and AUC are computed from the predicted probability of the positive class
    (`predict_proba(...)[:, 1]`), not from hard class labels.

    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param model_config: dict containing hyperparameter specifications for the model
                         (as produced by `model_configurator`)
    :param random_state: for reproducibility; passed to the model constructor
    :param exmodel_config: dict containing configuration details including the library 
                            (thus model) used, preprocessing, and cross-validation; must
                            contain a 'library' key. The default is the module-level dict
                            object bound when this function was defined.
    :param config_run: dict containing wandb run configuration (name, tags, notes)
    :return: the fitted model
    :raises ValueError: if exmodel_config['library'] is not a supported library
    """
    library = exmodel_config['library']
    # fail before a wandb run is opened, rather than with an unbound `model` afterwards
    if library not in ('xgboost', 'lightgbm', 'catboost'):
        raise ValueError(
            f"Unsupported library {library!r}; expected 'xgboost', 'lightgbm' or 'catboost'"
        )

    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)

    if library == 'xgboost':
        model = XGBClassifier(
            max_depth=model_config['max_depth'],
            learning_rate=model_config['learning_rate'],
            subsample=model_config['subsample'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            verbosity=model_config['verbosity'],
            objective=model_config['objective'],
            tree_method=model_config['tree_method'],
            n_estimators=model_config['n_estimators'],
        )
        # NOTE(review): no eval_set is passed to fit, so the callback presumably has no
        # per-iteration validation metrics to report -- confirm.
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])

    elif library == 'lightgbm':
        model = LGBMClassifier(
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
            eval_metric=model_config['eval_metric'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()])

    else:  # 'catboost'
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
            task_type=model_config['task_type'],
            n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
        )
        model.fit(X_train, y_train)

    # training-set metrics, scored on the positive-class probability
    y_train_proba = model.predict_proba(X_train)[:, 1]
    train_loss = log_loss(y_train, y_train_proba)
    train_auc = roc_auc_score(y_train, y_train_proba)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    # log the model's parameters
    if library == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params())

    # validation-set metrics, scored the same way
    y_valid_proba = model.predict_proba(X_valid)[:, 1]
    valid_loss = log_loss(y_valid, y_valid_proba)
    valid_auc = roc_auc_score(y_valid, y_valid_proba)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
    return model
    

In [28]:
def cross_validation(model_config, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
    """
    Train models under the cross-validation scheme described by `exmodel_config`.

    Hold-out is used when exmodel_config['cross_val_strategy'] is None (or absent) or when
    exmodel_config['kfolds'] == 1; otherwise the configured splitter class is instantiated
    with `kfolds` splits. Each fold is logged as its own wandb run, which is always finished
    before moving on, even if training or saving fails.

    :param model_config: dict of model hyperparameters, passed through to `train`
    :param X: features. NOTE: the default is whatever object `X` referred to when this
              function was defined; rebinding `X` later does not change it.
    :param y: targets (same caveat as `X`)
    :param start_fold: folds with an index below this are skipped (k-fold only)
    :param exmodel_config: dict containing general config including for cross-validation
                           ('kfolds', 'cross_val_strategy', 'test_size', 'library')
    :param random_state: seed for the split, and also forwarded to `train` for the model
    :return: dict mapping fold index to fitted model; for hold-out, a single entry under key 0
    """
    strategy = exmodel_config.get('cross_val_strategy')

    if strategy is None or exmodel_config['kfolds'] == 1:
        print("Proceeding with holdout")
        X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=exmodel_config['test_size'],
                                                      random_state=random_state,
                                                     )
        try:
            model = train(X_train, X_valid, y_train, y_valid,
                          model_config=model_config,
                          random_state=random_state,
                          exmodel_config=exmodel_config,
                          config_run=config_run)
        finally:
            wandb.finish()
        # same dict shape as the k-fold branch, so downstream code can iterate either result
        return {0: model}

    # positional indexing by the splitter's integer ids needs plain arrays
    X = X.to_numpy() if hasattr(X, 'to_numpy') else np.asarray(X)
    y = y.to_numpy() if hasattr(y, 'to_numpy') else np.asarray(y)
    kfold = strategy(n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
    models = {}
    model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
    # parents=True: the enclosing `models/` directory may not exist yet
    model_path.mkdir(parents=True, exist_ok=True)
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X, y)):
        if fold < start_fold:
            continue
        print(f"FOLD {fold}")
        print("---------------------------------------------------")
        X_train, X_valid = X[train_ids], X[valid_ids]
        y_train, y_valid = y[train_ids], y[valid_ids]
        try:
            model = train(X_train, X_valid, y_train, y_valid,
                          model_config=model_config,
                          random_state=random_state,
                          exmodel_config=exmodel_config,
                          config_run=config_run)
            wandb.log({'fold': fold})
            models[fold] = model
            dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))
        finally:
            # close this fold's wandb run even when training or saving raised
            wandb.finish()
    return models
        

# Interface

## Runs

In [None]:
# XGBoost run: record the library in the ex-model config (it is read by `train` and logged to wandb),
# build the hyperparameters, then cross-validate.
library = 'xgboost'
exmodel_config['library'] = library
# Pass the notebook-level GPU flag explicitly; without it `model_configurator` falls back to
# its own default of True and the flag set at the top of the notebook is ignored.
model_config = model_configurator(library, gpu_available=gpu_available)
xgboost_models = cross_validation(model_config)

FOLD 0
---------------------------------------------------


VBox(children=(Label(value=' 0.16MB of 0.16MB uploaded (0.06MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…



In [None]:
# for scaler in [StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler]:
#     exmodel_config['scaler'] = scaler
#     scaler = scaler()
#     X_scaled = scaler.fit_transform(X)
#     X = pd.DataFrame(X_scaled, columns=X.columns)
#     exmodel_config['library'] = 'lightgbm'
#     model_config = model_configurator('lightgbm')
#     cross_validation(model_config)

In [None]:
# LightGBM run. The notebook-level GPU flag is passed explicitly so that it is honoured;
# otherwise `model_configurator` uses its own default of True.
library = 'lightgbm'
exmodel_config['library'] = library
model_config = model_configurator(library, gpu_available=gpu_available)
lightgbm_models = cross_validation(model_config)

# CatBoost run. NOTE: this deliberately flips the notebook-level flag to True, so CatBoost is
# configured with task_type='GPU'; the flag stays True for any cell executed afterwards.
library = 'catboost'
gpu_available = True
exmodel_config['library'] = library
model_config = model_configurator(library, gpu_available=gpu_available)
catboost_models = cross_validation(model_config)

In [None]:
# # this loads models if you need to (or forgot to save them on training above)
# xgboost_models = {}
# xgboost_models_path = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/baseline_20210905a_152521_5folds/xgboost/')
# for fold in range(5):
#     xgboost_models[fold] = load(xgboost_models_path/f'xgboost_fold{fold}_model.joblib')

# Inference

In [None]:
# Load the competition test set from CSV, using its `id` column as the index.
test_df = pd.read_csv(datapath/'test.csv', index_col='id', low_memory=False)
# Feather caching of the test set is disabled for now:
# test_df.to_feather(datapath/'test.feather') # issue with index being non-default; fix later
# test_df = pd.read_feather(datapath/'test.feather')

In [None]:
test_df.head()

(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [None]:
# Keep every test column except `claim` (the target name used for the training set).
# NOTE: this rebinds the notebook-level `features` list that was built from the training frame.
features = [x for x in test_df.columns if x != 'claim']
X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [None]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [None]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [None]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [None]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [None]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

# Scaling
Now, going to scale the test features using the scaler configured in `exmodel_config['scaler']` (currently `MinMaxScaler`).

In [None]:
# Apply the scaler that was already fitted on the training features (the `scaler` object created
# in the Data Setup section). Fitting a fresh scaler on the test set, as this cell used to do,
# would map the test features with different statistics than the ones the models were trained on.
X_test_scaled = scaler.transform(X_test)
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [None]:
# Per-fold test predictions for XGBoost, keyed by fold index. Column 1 of `predict_proba` is the
# positive-class probability; hard labels from `predict` would make the fold averages and
# ensemble weights further down operate on 0/1 values instead of scores.
xgboost_preds = {
    fold: fold_model.predict_proba(X_test_scaled)[:, 1]
    for fold, fold_model in xgboost_models.items()
}

In [None]:
# Per-fold test predictions for LightGBM, keyed by fold index. Column 1 of `predict_proba` is the
# positive-class probability; hard labels from `predict` would make the fold averages and
# ensemble weights further down operate on 0/1 values instead of scores.
lightgbm_preds = {
    fold: fold_model.predict_proba(X_test_scaled)[:, 1]
    for fold, fold_model in lightgbm_models.items()
}

In [None]:
# Per-fold test predictions for CatBoost, keyed by fold index. Column 1 of `predict_proba` is the
# positive-class probability; hard labels from `predict` would make the fold averages and
# ensemble weights further down operate on 0/1 values instead of scores.
catboost_preds = {
    fold: fold_model.predict_proba(X_test_scaled)[:, 1]
    for fold, fold_model in catboost_models.items()
}

In [None]:
# Directory for this run's saved predictions, with one sub-directory per library.
preds_path = Path(datapath/f"preds/{config_run['name']}_{exmodel_config['kfolds']}folds/")
# parents=True: the enclosing `preds/` directory may not exist yet
preds_path.mkdir(parents=True, exist_ok=True)

# NOTE: this loop rebinds the notebook-level `library` variable
for library in ['xgboost', 'lightgbm', 'catboost']:
    (preds_path/library).mkdir(parents=True, exist_ok=True)

In [None]:
dump(xgboost_preds, Path(preds_path/'xgboost/xgboost_preds_dict.joblib'))

In [None]:
dump(lightgbm_preds, Path(preds_path/'lightgbm/lightgbm_preds_dict.joblib'))

In [None]:
dump(catboost_preds, Path(preds_path/'catboost/catboost_preds_dict.joblib'))

In [None]:
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [None]:
# Equal-weight mean over however many folds are present in each dict, instead of hard-coding
# folds 0-4 and a divisor of 5 (which breaks when `kfolds` or `start_fold` changes).
final_xgboost_preds = np.mean(list(xgboost_preds.values()), axis=0)
final_lightgbm_preds = np.mean(list(lightgbm_preds.values()), axis=0)
final_catboost_preds = np.mean(list(catboost_preds.values()), axis=0)

In [None]:
# print(final_xgboost_preds[:10])
# print(final_lightgbm_preds[:10])
# print(final_catboost_preds[:10])

In [None]:
sample_df.loc[:, 'claim'] = final_xgboost_preds

In [None]:
# sample_df.head()

In [None]:
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_xgboost-mean.csv", index=False)

In [None]:
sample_df.loc[:, 'claim'] = final_lightgbm_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_lightgbm-mean.csv", index=False)

In [None]:
sample_df.loc[:, 'claim'] = final_catboost_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_catboost-mean.csv", index=False)

In [None]:
ensemble_preds = (final_xgboost_preds + final_lightgbm_preds + final_catboost_preds) / 3

In [None]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_ensemble-equal_model_and_fold_weight_mean.csv", index=False)

In [None]:
ensemble_preds = 0.4*final_xgboost_preds + 0.3*final_lightgbm_preds + 0.3*final_catboost_preds

In [None]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_ensemble-0.4xgboost_0.3lightgbm_0.3catboost-equal_fold_weight_mean.csv", index=False)

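# Ensembling with CatBoost

**Note:** the cells in this section carry out-of-order execution counts (`In [46]`–`In [62]`), load models from an `aug2021` directory, write a `loss` column rather than `claim`, and reference `final_preds`, which is not defined earlier in this notebook. They appear to be left over from an earlier regression notebook and will not run as-is after *Restart & Run All*.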

In [46]:
# Load five previously saved per-fold CatBoost models from disk.
# NOTE(review): this rebinds `catboost_models`, replacing any models trained earlier in the notebook,
# and the hard-coded path points at an `aug2021` models directory; the recorded output below shows
# the loaded objects are CatBoostRegressor instances.
# `load` here is joblib's loader, which unpickles the file -- only use it on files you trust.
catboost_models = {}
saved_models_path = Path('/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/models/CatBoost_ensemble_20210831_144245_5folds/')
for fold in range(5):
    catboost_models[fold] = load(filename=Path(saved_models_path/f'catboost_fold{fold}_model.joblib'))

In [47]:
catboost_models

{0: <catboost.core.CatBoostRegressor at 0x7f1b154ecfa0>,
 1: <catboost.core.CatBoostRegressor at 0x7f1b1548a880>,
 2: <catboost.core.CatBoostRegressor at 0x7f1b154ec0a0>,
 3: <catboost.core.CatBoostRegressor at 0x7f1b1548ac40>,
 4: <catboost.core.CatBoostRegressor at 0x7f1b154ecdf0>}

In [48]:
catboost_preds = {}
for fold in catboost_models.keys():
    catboost_preds[fold] = catboost_models[fold].predict(X_test_scaled)

In [50]:
final_catboost_preds = (catboost_preds[0] + catboost_preds[1] + catboost_preds[2] + catboost_preds[3] + catboost_preds[4]) / 5

In [51]:
ensemble_preds = 0.6 * final_catboost_preds + 0.4 * final_preds

In [54]:
ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.40583658, 4.58774964, 8.32465697, 7.18375788, 7.13135284,
        9.67367649, 9.96252577, 5.89393404, 7.22270917, 7.53612671]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [58]:
final_ensemble_preds = 0.65 * final_catboost_preds + 0.35 * final_preds

In [59]:
final_ensemble_preds[:10], final_catboost_preds[:10], final_preds[:10]

(array([8.4389943 , 4.5923435 , 8.36373235, 7.18870129, 7.10523255,
        9.67708804, 9.96419843, 5.87235472, 7.23656006, 7.52442615]),
 array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
        9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341]),
 array([8.00794  , 4.5326233, 7.85575  , 7.1244345, 7.444796 , 9.632734 ,
        9.9424515, 6.1528835, 7.0564985, 7.6765313], dtype=float32))

In [60]:
sample_df.loc[:, 'loss'] = final_ensemble_preds

In [61]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.438994
1,250001,4.592343
2,250002,8.363732
3,250003,7.188701
4,250004,7.105233


In [62]:
sample_df.to_csv('XGBoost0.35-Catboost0.65_ensemble_20210831_no_feature_gen.csv', index=False)

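# Experiment - fitting model on full training set

**Note:** the cells in this section carry earlier execution counts (`In [36]`–`In [41]`) and rely on names that are not defined or imported anywhere above in this notebook (`config`, `XGBRegressor`, and `SelectKBest`/`f_regression`, whose import is commented out). They appear to be left over from an earlier regression notebook and will not run as-is after *Restart & Run All*.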

In [36]:
# Experiment: scale, select features, and fit a single XGBoost regressor on the whole training set.
# NOTE(review): `config` is not defined anywhere in the visible notebook (only `exmodel_config` and
# `model_config` are), and `XGBRegressor` is not imported by the import cell -- this cell depends on
# state from an earlier session.
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# scaling (i.e. normalizing)
scaler = config['scaler']()
X_s = scaler.fit_transform(X)
# NOTE(review): this re-fits the same scaler on the test set, so train and test are scaled with
# different statistics; `scaler.transform(X_test)` would reuse the training fit.
X_test_s = scaler.fit_transform(X_test)

# selecting features
# (the selector class and scoring function come from `config`; the matching sklearn imports are
# commented out in the import cell)
selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
                                      k=config['k_best'])
X_fs = selector.fit_transform(X_s, y)
# apply the same column mask to the (array-valued) scaled test features
X_test_fs = X_test_s[:, selector.get_support()]

# NOTE: `test_size` is passed to the model constructor here; the recorded output below shows
# XGBoost warning that this parameter might not be used.
model = XGBRegressor(
    tree_method=config['tree_method'],
    booster=config['booster'],
    n_estimators=config['n_estimators'], 
    max_depth=config['max_depth'],
    learning_rate=config['learning_rate'], 
    test_size=config['test_size'],
    subsample=config['subsample'],
    random_state=config['random_state'],
    n_jobs=config['n_jobs'], 
    verbosity=config['verbosity'], 
)
#     wandb.log({'params': model.get_params()}) # logging model parameters
model.fit(X_fs, y)#, callbacks=[wandb.xgboost.wandb_callback()])

Parameters: { "test_size" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1522, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             test_size=0.2, tree_method='auto', validate_parameters=1,
             verbosity=1)

In [37]:
y_test_preds = model.predict(X_test_fs)



In [38]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [39]:
sample_df.loc[:, 'loss'] = y_test_preds

In [40]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.027956
1,250001,4.305676
2,250002,7.300106
3,250003,6.988875
4,250004,7.316631


In [41]:
sample_df.to_csv('202108241211_XGBoost_fullset.csv', index=False)