Notebook for doing inference only with an ensemble -- experiments are conducted elsewhere. Model is as stated; `MaxAbsScaler` and `SelectKBest(k=80)` seem to be the best options as of 20210824. (Though model hyperparams haven't been fine-tuned with the scaler and the feature selector as of yet.) **And they should be, since performance is down!**


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math
import seaborn as sns
from datetime import datetime

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


In [2]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = 'XGBoost_ensemble_20210830.ipynb'
config = {
    # model config
    "model":XGBRegressor,
    "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
    "booster": 'dart', # dart may be marginally better, but will opt for this quicker approach as a default
    "n_estimators": 3000, 
    "max_depth": 3,
    "learning_rate": 0.1522,
    "test_size": 0.2,
    "scaler": MaxAbsScaler,
    "task_type": "GPU", # for CatBoost only
#     "reg_alpha": 2.8,
#     "reg_lambda": 3.987,
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
    'subsample': 1,
    'n_jobs': -1,
    'verbosity': 1,
    'k_folds': 5,
    'features_created': False,
    'feature_creator': None,
}

config_run = {
    # wandb config:
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['XGBoost', 'kfold', 'scaling', 'feature-creation', 'feature-selection'],
    'notes': "Running most of the parameters from the run j7tlo010 from the sweep b2zv3fsy, the most successful yet; just increasing n_estimators from 400 to 1000 and adding k-fold, feature generation/selection, etc",
}

In [3]:
datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/')

In [4]:
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

# load unaltered dataset
df = pd.read_feather(path='dataset_df.feather')
df.index.name = 'id'

In [5]:
y = df.loss

# Feature Creation and Selection

In [6]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [7]:
# X.columns

In [8]:
# prep features from unaltered dataset
features = [x for x in df.columns if x != 'loss']
X = df[features]

In [9]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [10]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [11]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [12]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [13]:
# X = X[features[1:]]

# Scaling
Now, going to scale using `MaxAbsScaler`

In [14]:
scaler = config['scaler']()
X_scaled = scaler.fit_transform(X)
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

# K-fold Cross-validation

In [15]:
# ACTUALLY probably better to save those as pickles or .npy files; I'll generate them later, regardless
# results = {} # for storing k-fold models' predictions

In [16]:
kfold = KFold(n_splits=config['k_folds'], shuffle=True, random_state=config['random_state'])

In [17]:
def train(X_train, X_valid, y_train, y_valid, config):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param config: dict with things to be logged in WandB, some to be used in function
    """
    
    
#     wandb.init(
#         project="202108_Kaggle_tabular_playground",
#         save_code=True,
#         tags=config_run['tags'],
#         name=config_run['name'],
#         notes=config_run['notes'],
#         config=config)   
        
    # applying hold-out before scaling
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                           test_size=config['test_size'], 
#                                                           random_state=config['random_state']
#                                                          )
    
    # strictly speaking should do the below, but doing beforehand faster and fine in this context
    # scaling (i.e. normalizing)
#     scaler = config['scaler']()
#     X_train_s = scaler.fit_transform(X_train)
#     X_valid_s = scaler.fit_transform(X_valid)
    
    # selecting features
#     selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
#                                           k=config['k_best'])
#     X_train_fs = selector.fit_transform(X_train_s, y_train)
#     X_valid_fs = X_valid_s[:, selector.get_support()] # ensures same features are used in validation

    # split the dataset
    model = CatBoostRegressor(
        n_estimators=config['n_estimators'],
        learning_rate=config['learning_rate'],
        max_depth=config['max_depth'],
        task_type=config['task_type'],
#         n_jobs=config['n_jobs'],
#         verbosity=config['verbosity'],
#         subsample=config['subsample'],
        random_state=config['random_state'],
#         bootstrap_type=config['bootstrap_type'],
#         device:config['device']
    ) 

#     model = XGBRegressor(
#         tree_method=config['tree_method'],
#         booster=config['booster'],
#         n_estimators=config['n_estimators'], 
#         max_depth=config['max_depth'],
#         learning_rate=config['learning_rate'], 
# #         test_size=config['test_size'],
#         subsample=config['subsample'],
#         reg_alpha=config['reg_alpha'],
#         reg_lambda=config['reg_lambda'],
#         random_state=config['random_state'],
#         n_jobs=config['n_jobs'], 
#         verbosity=config['verbosity'], 
#     )
#     wandb.log({'params': model.get_params()}) # logging model parameters
    model.fit(X_train, y_train)#, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(abs(mse))
#     wandb.log({'mse':mse, 'rmse':rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")   
#     wandb.finish()   
    return model
    

In [18]:
models = {}

In [19]:
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
(model_path).mkdir(exist_ok=True)

In [20]:
for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#     if fold == 0:
#         continue
#     else:
    print(f"FOLD {fold}")
    print("-----------------------------------------")
    X_train, X_valid = X_scaled[train_ids], X_scaled[valid_ids] # requires X to be a numpy.ndarray
    y_train, y_valid = y[train_ids], y[valid_ids]
    model = train(X_train, X_valid, y_train, y_valid, config)
#     wandb.log({'fold': fold})
    models[fold] = model
    dump(model, Path(model_path/f"catboost_fold{fold}_model.joblib"))
#     wandb.finish()

FOLD 0
-----------------------------------------
0:	learn: 7.9418049	total: 8.18ms	remaining: 24.5s
1:	learn: 7.9386857	total: 10.4ms	remaining: 15.6s
2:	learn: 7.9364381	total: 12.9ms	remaining: 12.8s
3:	learn: 7.9343308	total: 20.3ms	remaining: 15.2s
4:	learn: 7.9327322	total: 24.3ms	remaining: 14.5s
5:	learn: 7.9309148	total: 27.3ms	remaining: 13.6s
6:	learn: 7.9294186	total: 30ms	remaining: 12.8s
7:	learn: 7.9284283	total: 32.3ms	remaining: 12.1s
8:	learn: 7.9271795	total: 34.3ms	remaining: 11.4s
9:	learn: 7.9257485	total: 37.4ms	remaining: 11.2s
10:	learn: 7.9246691	total: 39.7ms	remaining: 10.8s
11:	learn: 7.9237980	total: 44.5ms	remaining: 11.1s
12:	learn: 7.9227729	total: 51.8ms	remaining: 11.9s
13:	learn: 7.9217160	total: 60.4ms	remaining: 12.9s
14:	learn: 7.9207026	total: 67.6ms	remaining: 13.5s
15:	learn: 7.9198879	total: 74.9ms	remaining: 14s
16:	learn: 7.9187613	total: 82.1ms	remaining: 14.4s
17:	learn: 7.9177017	total: 89.7ms	remaining: 14.9s
18:	learn: 7.9167916	total: 9

In [21]:

#     dump(preds, f"./preds/{config_rn['name']}/xgboost_fold{fold}_preds.joblib")

# Inference

In [22]:
test_df = pd.read_csv(datapath/'test.csv', index_col='id', low_memory=False)

In [23]:
test_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
250000,0.812665,15,-1.23912,-0.893251,295.577,15.8712,23.0436,0.942256,29.898,1.11394,...,0.446389,-422.332,-1.4463,1.69075,1.0593,-3.01057,1.94664,0.52947,1.38695,8.78767
250001,0.190344,131,-0.501361,0.801921,64.8866,3.09703,344.805,0.807194,38.4219,1.09695,...,0.377179,10352.2,21.0627,1.84351,0.251895,4.44057,1.90309,0.248534,0.863881,11.7939
250002,0.919671,19,-0.057382,0.901419,11961.2,16.3965,273.24,-0.0033,37.94,1.15222,...,0.99014,3224.02,-2.25287,1.551,-0.559157,17.8386,1.83385,0.931796,2.33687,9.054
250003,0.860985,19,-0.549509,0.471799,7501.6,2.80698,71.0817,0.792136,0.395235,1.20157,...,1.39688,9689.76,14.7715,1.4139,0.329272,0.802437,2.23251,0.893348,1.35947,4.84833
250004,0.313229,89,0.588509,0.167705,2931.26,4.34986,1.57187,1.1183,7.75463,1.16807,...,0.862502,2693.35,44.1805,1.5802,-0.191021,26.253,2.68238,0.361923,1.5328,3.7066


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [24]:
features = [x for x in test_df.columns if x != 'loss']
X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [25]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [26]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [27]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [28]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [29]:
# X_test_final = X_test_final[features[1:]]

# Scaling
Now, going to scale using `MaxAbsScaler`

In [30]:
X_test_final = X_test

In [31]:
scaler = config['scaler']()
X_test_scaled = scaler.fit_transform(X_test_final)
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [32]:
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# # scaling (i.e. normalizing)
# scaler = config['scaler']()
# X_train_s = scaler.fit_transform(X_train)
# X_test_s = scaler.fit_transform(X_test)

# # selecting features
# selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
#                                       k=config['k_best'])
# X_train_fs = selector.fit_transform(X_train_s, y_train)
# X_test_fs = X_test_s[:, selector.get_support()]

# model = XGBRegressor(
#     tree_method=config['tree_method'],
#     booster=config['booster'],
#     n_estimators=config['n_estimators'], 
#     max_depth=config['max_depth'],
#     learning_rate=config['learning_rate'], 
#     test_size=config['test_size'],
#     subsample=config['subsample'],
#     random_state=config['random_state'],
#     n_jobs=config['n_jobs'], 
#     verbosity=config['verbosity'], 
# )
# #     wandb.log({'params': model.get_params()}) # logging model parameters
# model.fit(X_train_fs, y_train)#, callbacks=[wandb.xgboost.wandb_callback()])

In [33]:
models

{0: <catboost.core.CatBoostRegressor at 0x7fa47854b940>,
 1: <catboost.core.CatBoostRegressor at 0x7fa47854b7f0>,
 2: <catboost.core.CatBoostRegressor at 0x7fa47854bb80>,
 3: <catboost.core.CatBoostRegressor at 0x7fa47854b970>,
 4: <catboost.core.CatBoostRegressor at 0x7fa47856ee80>}

Now, iterate over the dict containing the models trained on the 5 folds, and store the predictions in a new dict `preds`
**OR**
load from a directory.

In [34]:
# loaded_models = {}
# saved_models_path = Path('/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/models/inference_ensemble_20210828_204126_5folds/')
# for fold in range(5):
#     loaded_models[fold] = load(filename=Path(saved_models_path/f'xgboost_fold{fold}_model.joblib'))

In [35]:
# models = loaded_models

In [36]:
preds = {}
for fold in models.keys():
    preds[fold] = models[fold].predict(X_test_scaled)

In [37]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [38]:
type(preds[0])

numpy.ndarray

In [40]:
final_preds = (preds[0] + preds[1] + preds[2] + preds[3] + preds[4]) / 5

In [41]:
dump(final_preds, '/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/preds/catboost_5fold_no_feature_gen.joblib')

['/home/sf/Dropbox/code_cloud/python_code/kaggle/tabular_playgrounds/aug2021/preds/catboost_5fold_no_feature_gen.joblib']

In [42]:
final_preds[:10]

array([8.67110053, 4.62450053, 8.6372614 , 7.22330665, 6.92239076,
       9.70097104, 9.97590847, 5.72130089, 7.33351626, 7.44252341])

In [43]:
sample_df.loc[:, 'loss'] = final_preds

In [44]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.671101
1,250001,4.624501
2,250002,8.637261
3,250003,7.223307
4,250004,6.922391


In [45]:
sample_df.to_csv('202108311445_CatBoost_n-est1000per_no-feature-gen.csv', index=False)

# Experiment - fitting model on full training set

In [36]:
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# scaling (i.e. normalizing)
scaler = config['scaler']()
X_s = scaler.fit_transform(X)
X_test_s = scaler.fit_transform(X_test)

# selecting features
selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
                                      k=config['k_best'])
X_fs = selector.fit_transform(X_s, y)
X_test_fs = X_test_s[:, selector.get_support()]

model = XGBRegressor(
    tree_method=config['tree_method'],
    booster=config['booster'],
    n_estimators=config['n_estimators'], 
    max_depth=config['max_depth'],
    learning_rate=config['learning_rate'], 
    test_size=config['test_size'],
    subsample=config['subsample'],
    random_state=config['random_state'],
    n_jobs=config['n_jobs'], 
    verbosity=config['verbosity'], 
)
#     wandb.log({'params': model.get_params()}) # logging model parameters
model.fit(X_fs, y)#, callbacks=[wandb.xgboost.wandb_callback()])

Parameters: { "test_size" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1522, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             test_size=0.2, tree_method='auto', validate_parameters=1,
             verbosity=1)

In [37]:
y_test_preds = model.predict(X_test_fs)



In [38]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [39]:
sample_df.loc[:, 'loss'] = y_test_preds

In [40]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.027956
1,250001,4.305676
2,250002,7.300106
3,250003,6.988875
4,250004,7.316631


In [41]:
sample_df.to_csv('202108241211_XGBoost_fullset.csv', index=False)