Notebook for doing inference only with an ensemble -- experiments are conducted elsewhere. Model is as stated; `MaxAbsScaler` and `SelectKBest(k=80)` seem to be the best options as of 20210824. (Though model hyperparams haven't been fine-tuned with the scaler and the feature selector as of yet.) **And they should be, since performance is down!**


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math
import seaborn as sns
from datetime import datetime

from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


In [2]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = 'feature_selection_test_20210827.ipynb'
config = {
    # model config
    "wandb": False,
    "model":XGBRegressor,
    "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
    "booster": 'gbtree', # dart may be marginally better, but will opt for this quicker approach as a default
    "n_estimators": 50, 
    "max_depth": 3,
    "learning_rate": 0.1522,
    "test_size": 0.2,
    "scaler": MaxAbsScaler,
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
    'subsample': 1,
    'n_jobs': -1,
    'verbosity': 1,
    'k_folds': 1,
    'features_created': False,
    'feature_creator': None,
}

config_run = {
    # wandb config:
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['XGBoost', 'kfold', 'scaling', 'baseline'],
    'notes': "Control run for 50-estimator run with features created with PolynomialFeatures",
}

In [3]:
datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/')

In [4]:
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

# load unaltered dataset
df = pd.read_feather(path='dataset_df.feather')
df.index.name = 'id'

In [5]:
y = df.loss

In [6]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [7]:
# X.columns

In [8]:
# prep features from unaltered dataset
features = [x for x in df.columns if x != 'loss']
X = df[features]

# Feature Creation

In [9]:
poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
X_poly = poly.fit_transform(X)

In [10]:
X_poly_names = poly.get_feature_names(X.columns)

In [11]:
X = pd.DataFrame(X_poly, columns=X_poly_names)

# Scaling
Now, going to scale using `MaxAbsScaler`

In [12]:
scaler = config['scaler']()
X = scaler.fit_transform(X)

# Training Function

In [13]:
def train(X_train, X_valid, y_train, y_valid, config):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param config: dict with things to be logged in WandB, some to be used in function
    """
    
    
    if config['wandb']:
        wandb.init(
            project="202108_Kaggle_tabular_playground",
            save_code=True,
            tags=config_run['tags'],
            name=config_run['name'],
            notes=config_run['notes'],
            config=config)   
        
    # applying hold-out before scaling
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                           test_size=config['test_size'], 
#                                                           random_state=config['random_state']
#                                                          )
    
    # strictly speaking should do the below, but doing beforehand faster and fine in this context
    # scaling (i.e. normalizing)
#     scaler = config['scaler']()
#     X_train_s = scaler.fit_transform(X_train)
#     X_valid_s = scaler.fit_transform(X_valid)
    
    # selecting features
#     selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
#                                           k=config['k_best'])
#     X_train_fs = selector.fit_transform(X_train_s, y_train)
#     X_valid_fs = X_valid_s[:, selector.get_support()] # ensures same features are used in validation

    # split the dataset
    model = XGBRegressor(
        tree_method=config['tree_method'],
        booster=config['booster'],
        n_estimators=config['n_estimators'], 
        max_depth=config['max_depth'],
        learning_rate=config['learning_rate'], 
#         test_size=config['test_size'],
        subsample=config['subsample'],
        random_state=config['random_state'],
        n_jobs=config['n_jobs'], 
        verbosity=config['verbosity'], 
    )
#     wandb.log({'params': model.get_params()}) # logging model parameters
    if config['wandb']:
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    else:
        model.fit(X_train, y_train)
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(abs(mse))
    print(f"MSE is {mse}\nRMSE is {rmse}")   
    if config['wandb']:
        wandb.log({'mse':mse, 'rmse':rmse})
        wandb.finish()   
    return model
    

# Holdout

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=config['test_size'], 
                                                      random_state=config['random_state']
                                                     )

model = train(X_train, X_valid, y_train, y_valid, config)
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
(model_path).mkdir(exist_ok=True)
dump(model, Path(model_path/f"xgboost_holdout_model.joblib"))

MSE is 61.858915233825
RMSE is 7.865043879968185


['models/feature_selection_test_20210827_123539_1folds/xgboost_holdout_model.joblib']

# Feature Selection - By Importance

In [18]:
fi = pd.Series(model.feature_importances_, index=X_poly_names)

In [19]:
fi_sorted = fi.sort_values(ascending=False)

In [20]:
pd.options.display.max_rows = None

In [21]:
fi_sorted

f25 f52    0.020431
f77^2      0.012733
f25        0.011736
f13 f52    0.011625
f25 f69    0.010858
f58 f66    0.010317
f41 f58    0.010305
f13 f46    0.009665
f3 f13     0.008993
f13 f53    0.008776
f31 f74    0.008750
f2 f41     0.008423
f9 f81     0.008403
f12 f81    0.008257
f70 f74    0.007701
f81^2      0.007576
f66 f73    0.007076
f66 f69    0.006965
f25 f96    0.006532
f28 f46    0.006239
f69 f73    0.006145
f58 f81    0.006046
f3 f46     0.006023
f66 f96    0.005936
f64 f79    0.005798
f21 f52    0.005718
f25 f34    0.005686
f27 f52    0.005661
f51 f78    0.005654
f13 f65    0.005638
f25 f58    0.005546
f39 f74    0.005534
f3 f96     0.005395
f55 f96    0.005393
f58 f69    0.005390
f44 f52    0.005263
f31 f81    0.005193
f21 f25    0.005116
f93        0.005018
f80        0.004830
f50 f58    0.004812
f1 f69     0.004751
f46 f48    0.004747
f1 f76     0.004693
f43 f69    0.004682
f31 f84    0.004563
f46 f84    0.004561
f28 f64    0.004485
f2^2       0.004465
f1 f53     0.004449


In [22]:
fi_sorted_candidates = fi_sorted[fi_sorted > 0.001]

In [23]:
len(fi_sorted_candidates)

288

In [24]:
fi_sorted_candidates.to_csv('features_deg2_fi_gt_0-001.csv')

So, let's try running a baseline model on just this subset of features.

In [26]:
feature_candidates = fi_sorted_candidates.index
feature_candidates

Index(['f25 f52', 'f77^2', 'f25', 'f13 f52', 'f25 f69', 'f58 f66', 'f41 f58',
       'f13 f46', 'f3 f13', 'f13 f53',
       ...
       'f29 f93', 'f61 f91', 'f27 f48', 'f32 f74', 'f51 f64', 'f3 f59',
       'f21 f67', 'f15 f94', 'f84 f89', 'f26 f30'],
      dtype='object', length=288)

In [28]:
X_df = pd.DataFrame(X, columns=X_poly_names)

In [31]:
X_candidates = X_df[feature_candidates]
X_candidates.head()

Unnamed: 0,f25 f52,f77^2,f25,f13 f52,f25 f69,f58 f66,f41 f58,f13 f46,f3 f13,f13 f53,...,f29 f93,f61 f91,f27 f48,f32 f74,f51 f64,f3 f59,f21 f67,f15 f94,f84 f89,f26 f30
0,-0.013632,0.000648,0.211775,-0.011511,0.014628,0.164931,0.177589,0.049392,-0.178317,0.140309,...,0.090836,-0.000353,-0.031811,-0.007742,0.389339,-0.056545,0.132593,0.093647,0.137825,0.404422
1,0.30052,0.000591,0.35911,0.015517,0.033837,0.082567,0.05316,0.013522,-0.007262,0.010177,...,0.032727,-0.001632,-0.212345,-0.00574,0.265136,-0.148204,-0.017949,-0.010874,0.034847,0.624719
2,0.576829,0.001288,0.56452,0.209392,0.044458,0.257001,0.14687,0.047144,-0.057899,0.157483,...,0.246486,8.7e-05,0.101054,0.069513,0.360654,-0.190969,0.152634,0.018086,0.001217,0.523216
3,0.019243,0.000241,0.187012,0.020118,0.017051,0.331025,0.095483,0.048812,0.136626,0.140518,...,0.065283,6.9e-05,-0.171226,0.114613,0.254374,-0.011757,0.08362,0.051047,0.035232,0.468144
4,-0.078783,0.009632,0.319205,-0.114307,-0.051615,0.179558,0.130038,0.018112,-0.031648,0.300555,...,0.150962,0.003546,-0.022264,-0.060327,0.24892,0.000855,0.091393,-0.063559,0.002237,0.814026


In [33]:
X_candidates.to_csv('X_candidates_20210827.csv')

In [34]:
X_candidates.shape

(250000, 288)

# Test of Feature Selection
Now, let's try running the model with the same hyperparameters and see the result. I'll enable WandB tracking.

In [38]:
config = {
    # model config
    "wandb": True,
    "model":XGBRegressor,
    "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
    "booster": 'gbtree', # dart may be marginally better, but will opt for this quicker approach as a default
    "n_estimators": 50, 
    "max_depth": 3,
    "learning_rate": 0.1522,
    "test_size": 0.2,
    "scaler": StandardScaler,
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
    'subsample': 1,
    'n_jobs': -1,
    'verbosity': 1,
    'k_folds': 1,
    'features_created': True,
    'feature_creator': PolynomialFeatures,
    'feature_selection': 'manual by feature importance'
}

config_run = {
    # wandb config:
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['XGBoost', 'scaling', 'experimental', 'features-created', 'features-selected'],
    'notes': "Trying the 288 feature dataset with StandardScaler",
}

In [39]:
X_train, X_valid, y_train, y_valid = train_test_split(X_candidates, y, 
                                                      test_size=config['test_size'], 
                                                      random_state=config['random_state']
                                                     )

model = train(X_train, X_valid, y_train, y_valid, config)
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
(model_path).mkdir(exist_ok=True)
dump(model, Path(model_path/f"xgboost_holdout_model.joblib"))

[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




MSE is 61.85831786782834
RMSE is 7.8650059038648115


VBox(children=(Label(value=' 0.40MB of 0.40MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
mse,61.85832
rmse,7.86501
_runtime,31.0
_timestamp,1630096939.0
_step,50.0


0,1
mse,▁
rmse,▁
_runtime,▁
_timestamp,▁
_step,▁


['models/feature_selection_test_20210827_134141_1folds/xgboost_holdout_model.joblib']

**The 288-feature dataset performs solidly better than the control group -- AND actually better than using *all* the 5000+ PolynomialFeatures.**

---------------------------

# K-fold Cross-validation

In [13]:
# ACTUALLY probably better to save those as pickles or .npy files; I'll generate them later, regardless
# results = {} # for storing k-fold models' predictions

In [14]:
kfold = KFold(n_splits=config['k_folds'], shuffle=True, random_state=config['random_state'])

In [16]:
models = {}

In [17]:
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
(model_path).mkdir(exist_ok=True)

In [18]:
for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#     if fold == 0:
#         continue
#     else:
    print(f"FOLD {fold}")
    print("-----------------------------------------")
    X_train, X_valid = X[train_ids], X[valid_ids]
    y_train, y_valid = y[train_ids], y[valid_ids]
    model = train(X_train, X_valid, y_train, y_valid, config)
    wandb.log({'fold': fold})
    models[fold] = model
    dump(model, Path(model_path/f"xgboost_fold{fold}_model.joblib"))
    wandb.finish()

FOLD 0
-----------------------------------------


[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




KeyboardInterrupt: 

In [30]:

#     dump(preds, f"./preds/{config_rn['name']}/xgboost_fold{fold}_preds.joblib")

# Inference

In [24]:
test_df = pd.read_csv(datapath/'test.csv', index_col='id', low_memory=False)

In [25]:
test_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
250000,0.812665,15,-1.23912,-0.893251,295.577,15.8712,23.0436,0.942256,29.898,1.11394,...,0.446389,-422.332,-1.4463,1.69075,1.0593,-3.01057,1.94664,0.52947,1.38695,8.78767
250001,0.190344,131,-0.501361,0.801921,64.8866,3.09703,344.805,0.807194,38.4219,1.09695,...,0.377179,10352.2,21.0627,1.84351,0.251895,4.44057,1.90309,0.248534,0.863881,11.7939
250002,0.919671,19,-0.057382,0.901419,11961.2,16.3965,273.24,-0.0033,37.94,1.15222,...,0.99014,3224.02,-2.25287,1.551,-0.559157,17.8386,1.83385,0.931796,2.33687,9.054
250003,0.860985,19,-0.549509,0.471799,7501.6,2.80698,71.0817,0.792136,0.395235,1.20157,...,1.39688,9689.76,14.7715,1.4139,0.329272,0.802437,2.23251,0.893348,1.35947,4.84833
250004,0.313229,89,0.588509,0.167705,2931.26,4.34986,1.57187,1.1183,7.75463,1.16807,...,0.862502,2693.35,44.1805,1.5802,-0.191021,26.253,2.68238,0.361923,1.5328,3.7066


In [26]:
X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [29]:
# applying hold-out before scaling
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=config['test_size'], 
                                                      random_state=config['random_state']
                                                     )
# scaling (i.e. normalizing)
scaler = config['scaler']()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.fit_transform(X_test)

# selecting features
selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
                                      k=config['k_best'])
X_train_fs = selector.fit_transform(X_train_s, y_train)
X_test_fs = X_test_s[:, selector.get_support()]

model = XGBRegressor(
    tree_method=config['tree_method'],
    booster=config['booster'],
    n_estimators=config['n_estimators'], 
    max_depth=config['max_depth'],
    learning_rate=config['learning_rate'], 
    test_size=config['test_size'],
    subsample=config['subsample'],
    random_state=config['random_state'],
    n_jobs=config['n_jobs'], 
    verbosity=config['verbosity'], 
)
#     wandb.log({'params': model.get_params()}) # logging model parameters
model.fit(X_train_fs, y_train)#, callbacks=[wandb.xgboost.wandb_callback()])

Parameters: { "test_size" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1522, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             test_size=0.2, tree_method='auto', validate_parameters=1,
             verbosity=1)

In [30]:
y_test_preds = model.predict(X_test_fs)



In [31]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [32]:
sample_df.loc[:, 'loss'] = y_test_preds

In [33]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.235917
1,250001,4.625789
2,250002,7.081776
3,250003,6.641549
4,250004,7.322997


In [34]:
sample_df.to_csv('202108241140_XGBoost.csv', index=False)

This got 7.90537 on the LB -- worse than before feature selection.

# Experiment - fitting model on full training set

In [36]:
# applying hold-out before scaling
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=config['test_size'], 
#                                                       random_state=config['random_state']
#                                                      )
# scaling (i.e. normalizing)
scaler = config['scaler']()
X_s = scaler.fit_transform(X)
X_test_s = scaler.fit_transform(X_test)

# selecting features
selector = config['feature_selector'](score_func=config["feature_selection_scoring"], 
                                      k=config['k_best'])
X_fs = selector.fit_transform(X_s, y)
X_test_fs = X_test_s[:, selector.get_support()]

model = XGBRegressor(
    tree_method=config['tree_method'],
    booster=config['booster'],
    n_estimators=config['n_estimators'], 
    max_depth=config['max_depth'],
    learning_rate=config['learning_rate'], 
    test_size=config['test_size'],
    subsample=config['subsample'],
    random_state=config['random_state'],
    n_jobs=config['n_jobs'], 
    verbosity=config['verbosity'], 
)
#     wandb.log({'params': model.get_params()}) # logging model parameters
model.fit(X_fs, y)#, callbacks=[wandb.xgboost.wandb_callback()])

Parameters: { "test_size" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1522, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=-1, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             test_size=0.2, tree_method='auto', validate_parameters=1,
             verbosity=1)

In [37]:
y_test_preds = model.predict(X_test_fs)



In [38]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [39]:
sample_df.loc[:, 'loss'] = y_test_preds

In [40]:
sample_df.head()

Unnamed: 0,id,loss
0,250000,8.027956
1,250001,4.305676
2,250002,7.300106
3,250003,6.988875
4,250004,7.316631


In [41]:
sample_df.to_csv('202108241211_XGBoost_fullset.csv', index=False)