In [1]:
# Reference RMSE that this notebook's runs are compared against (see the
# `rmse < baseline_rmse` check further down).
# NOTE(review): presumably the score of an earlier baseline run -- its origin
# is not recorded in this notebook; confirm before relying on it.
baseline_rmse = 7.8619006924521

In [2]:
import math
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wandb
from tqdm.notebook import tqdm
from wandb.xgboost import wandb_callback
# import timm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold#, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MaxAbsScaler

# feature engineering tools
from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

In [3]:
%matplotlib inline

In [4]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torchinfo import summary
# # from fastai.callback.wandb import *
# from torch.cuda.amp import autocast, GradScaler

In [5]:
# from fastai.vision.all import *
from fastai.tabular.all import *
# from fastai.callback.wandb import WandbCallback

In [6]:
# import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
# # from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier

In [7]:
# Record the notebook's filename in the environment (the `%env` magic would do
# the same); `config_run` below derives the W&B run name from this value.
os.environ.update(WANDB_NOTEBOOK_NAME='20210823_XGBClassifier_feature_selection.ipynb')

In [8]:
# Run-level metadata passed to every `wandb.init` call in this notebook.
config_run = {
    # Run name = notebook filename without its extension. `Path.stem` strips
    # whatever suffix is present instead of assuming it is exactly 6 characters.
    'name': Path(os.environ['WANDB_NOTEBOOK_NAME']).stem,
    'tags': ['XGBoost', 'experimental', 'feature_selection'],
    'notes': "Going to try doing some feature selection now using techniques from Abishek Thakur's book.",
}

In [9]:
%config Completer.use_jedi = False

In [10]:
# set_seed(42, reproducible=True) # fastai only

In [11]:
# Root folder of the competition data. Defaults to the original local mount;
# set the KAGGLE_DATA_DIR environment variable to point somewhere else without
# editing the notebook.
datapath = Path(os.environ.get(
    'KAGGLE_DATA_DIR',
    '/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/',
))

In [12]:
# Load the cached feather copy of the training data from the working directory.
# One-off recipe that produced the cache (kept for reference, not executed):
#   df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
#   df.index.name = None
#   df.to_feather(path='./dataset_df.feather')
df = pd.read_feather('dataset_df.feather')
# Label the index 'id' in place.
df.index.rename('id', inplace=True)

In [13]:
# df.isnull().sum().any() # False

In [14]:
# df.info(verbose=True, null_counts=True)

In [15]:
# df.describe()

In [16]:
# Regression target: the 'loss' column.
y = df['loss']

In [17]:
# Every column except the target is used as a model input.
features = [col for col in df.columns if not col == 'loss']

In [18]:
# Feature matrix: all rows, input columns only.
X = df.loc[:, features]

In [19]:
# for f in df.columns:
#     print(f + '\n-----------')
#     print(f"{f} max is {max(df[f])}")
#     print(f"{f} min is {min(df[f])}")

In [20]:
# for f in df.columns[:5]:
#     sns.scatterplot(data=df, x=f, y=y)

In [21]:
len(X)

250000

In [22]:
df.head()

          f0   f1        f2        f3          f4        f5        f6  \
id                                                                      
0  -0.002350   59  0.766739 -1.350460     42.2727  16.68570   30.3599   
1   0.784462  145 -0.463845 -0.530421  27324.9000   3.47545  160.4980   
2   0.317816   19 -0.432571 -0.382644   1383.2600  19.71290   31.1026   
3   0.210753   17 -0.616454  0.946362   -119.2530   4.08235  185.2570   
4   0.439671   20  0.968126 -0.092546     74.3020  12.30650   72.1860   

          f7         f8       f9  ...        f91        f92      f93  \
id                                ...                                  
0   1.267300   0.392007  1.09101  ...  -42.43990  26.854000  1.45751   
1   0.828007   3.735860  1.28138  ... -184.13200   7.901370  1.70644   
2  -0.515354  34.430800  1.24210  ...    7.43721  37.218100  3.25339   
3   1.383310 -47.521400  1.09130  ...    9.66778   0.626942  1.49425   
4  -0.233964  24.399100  1.10151  ...  290.65700  15.604

In [23]:
len(y), len(y.value_counts())

(250000, 43)

In [24]:
# Number of distinct values in feature 'f1'. Uses the full feature matrix `X`:
# `X_train` is only created by the hold-out split cells further down, so
# referencing it here fails on a fresh kernel.
len(X['f1'].unique())

In [25]:
# Default hyperparameters / run settings shared by the training cells.
config_defaults = {
    "library": "xgboost",
    "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
    "booster": 'gbtree', # dart may be marginally better, but will opt for this quicker approach as a default
    "n_estimators": 100, # a very low number -- optimal is probably 300ish -- but this will be quicker
    "max_depth": 3,
    "learning_rate": 0.1,
    "test_size": 0.2, # fraction of rows held out for validation
    "scaler": MaxAbsScaler,
    # seed read by the hold-out split cells via config_defaults['random_state']
    "random_state": 42,
}

In [26]:
# import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler
# # from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier

In [27]:
# Point the environment at the current notebook filename (equivalent to using
# the `%env` magic). Note that `config_run` was already built from the previous
# value, so its 'name' entry is not changed by this reassignment.
os.environ.update(WANDB_NOTEBOOK_NAME='20210824_XGBClassifier_feature_selection.ipynb')

In [28]:
# Default hyperparameters / run settings shared by the training cells.
config_defaults = {
    "library": "xgboost",
    "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
    "booster": 'gbtree', # dart may be marginally better, but will opt for this quicker approach as a default
    "n_estimators": 100, # a very low number -- optimal is probably 300ish -- but this will be quicker
    "max_depth": 3,
    "learning_rate": 0.1,
    "test_size": 0.2, # fraction of rows held out for validation
    "scaler": MaxAbsScaler,
    # seed read by the hold-out split cells via config_defaults['random_state'];
    # without this key those cells raise KeyError
    "random_state": 42,
}

In [29]:
def train(wandb_config):
    """Train an XGBRegressor on a hold-out split and log the run to W&B.

    Starts a W&B run with `wandb_config`, splits the module-level `X` / `y`
    into train and validation parts, scales the features with MaxAbsScaler,
    fits the model, logs MSE / RMSE on the validation part and closes the run.

    Args:
        wandb_config: configuration handed to `wandb.init`; must provide
            `test_size`, `tree_method`, `booster`, `n_estimators`,
            `max_depth` and `learning_rate`.

    Returns:
        The validation RMSE as a float (previously nothing was returned).
    """
    wandb.init(
        project="202108_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=wandb_config)

    config = wandb.config

    # applying hold-out before scaling
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=config['test_size'], random_state=42)

    # Log the scaler's name rather than the class object, which is not a
    # JSON-serialisable value.
    wandb.log({'scaler': MaxAbsScaler.__name__})
    s = MaxAbsScaler()
    X_train = s.fit_transform(X_train)
    # Reuse the scale factors learned on the training split. Re-fitting on the
    # validation split would put the two splits on different scales.
    X_valid = s.transform(X_valid)

    # `test_size` belongs to the split above, not to the model, so it is no
    # longer forwarded to XGBRegressor.
    model = XGBRegressor(
        tree_method=config.tree_method,
        booster=config.booster,
        n_estimators=config.n_estimators,
        max_depth=config.max_depth,
        learning_rate=config.learning_rate,
        subsample=1,
        random_state=42,
        n_jobs=-1,
        verbosity=1,
    )
    wandb.log({'params': model.get_params()})
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)  # MSE is non-negative, so no abs() is needed
    wandb.log({'mse':mse, 'rmse':rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")
    wandb.finish()
    return rmse
    

In [30]:
# Hold-out split. `config` only exists as a local inside `train`, so read the
# settings from the module-level `config_defaults`; fall back to seed 42 (the
# value used elsewhere in this notebook) if no 'random_state' is configured.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=config_defaults['test_size'],
    random_state=config_defaults.get('random_state', 42),
)

In [31]:
# Hold-out split. The `config_defaults` defined above this cell has no
# 'random_state' key, so fall back to seed 42 (the value used elsewhere in this
# notebook) instead of raising KeyError.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=config_defaults['test_size'],
    random_state=config_defaults.get('random_state', 42),
)

In [32]:
# Shared defaults for the experiments below: model hyperparameters plus the
# hold-out fraction, scaler class and random seed.
config_defaults = dict(
    library="xgboost",
    tree_method="auto",  # set to 'gpu_hist' to try GPU if available
    booster='gbtree',  # dart may be marginally better; gbtree is the quicker default
    n_estimators=100,  # deliberately low for speed -- optimal is probably 300ish
    max_depth=3,
    learning_rate=0.1,
    test_size=0.2,
    scaler=MaxAbsScaler,
    random_state=42,
)

In [33]:
# Hold-out split using the fraction and seed from the shared defaults.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=config_defaults['test_size'],
    random_state=config_defaults['random_state'],
)

In [34]:
from sklearn.feature_selection import VarianceThreshold

In [35]:
from sklearn.feature_selection import chi2, f_classif, f_regression, mutual_info_regression, SelectKBest, SelectPercentile

In [36]:
# `model_config` was used here before being defined; bind it to the defaults.
model_config = config_defaults

# Start a W&B run for this feature-selection experiment.
wandb.init(
    project="202108_Kaggle_tabular_playground",
    save_code=True,
    tags=config_run['tags'],
    name=config_run['name'],
    notes=config_run['notes'],
    config=model_config)


# applying hold-out before scaling
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=model_config['test_size'], random_state=42)

# Fit the scaler on the training split only and reuse it for the validation
# split, so both splits share the same scale factors.
s = MaxAbsScaler()
X_train_s = s.fit_transform(X_train)
X_valid_s = s.transform(X_valid)

# (A VarianceThreshold pre-filter was tried at this point and left disabled.)

# Keep the 90 best-scoring features. `score_func` must be the callable (not
# its name as a string) and the scorer needs the target, so pass `y_train`.
selector = SelectKBest(score_func=f_regression, k=90)
X_train_fs = selector.fit_transform(X_train_s, y_train)

In [37]:
model_config = config_defaults

# Start a W&B run for this feature-selection experiment.
wandb.init(
    project="202108_Kaggle_tabular_playground",
    save_code=True,
    tags=config_run['tags'],
    name=config_run['name'],
    notes=config_run['notes'],
    config=model_config)


# applying hold-out before scaling
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=model_config['test_size'], random_state=42)

# Fit the scaler on the training split only and reuse it for the validation
# split, so both splits share the same scale factors.
s = MaxAbsScaler()
X_train_s = s.fit_transform(X_train)
X_valid_s = s.transform(X_valid)

# (A VarianceThreshold pre-filter was tried at this point and left disabled.)

# Keep the 90 best-scoring features. `score_func` must be the callable (not
# its name as a string), the method is `fit_transform`, and the scorer needs
# the target, so pass `y_train`.
selector = SelectKBest(score_func=f_regression, k=90)
X_train_fs = selector.fit_transform(X_train_s, y_train)

In [38]:
model_config = config_defaults

# Start a W&B run for this feature-selection experiment.
wandb.init(
    project="202108_Kaggle_tabular_playground",
    save_code=True,
    tags=config_run['tags'],
    name=config_run['name'],
    notes=config_run['notes'],
    config=model_config)


# applying hold-out before scaling
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=model_config['test_size'], random_state=42)

# Fit the scaler on the training split only and reuse it for the validation
# split, so both splits share the same scale factors.
s = MaxAbsScaler()
X_train_s = s.fit_transform(X_train)
X_valid_s = s.transform(X_valid)

# (A VarianceThreshold pre-filter was tried at this point and left disabled.)

# Keep the 90 best-scoring features. `score_func` must be the callable (not
# its name as a string) and the scorer needs the target, so pass `y_train`.
selector = SelectKBest(score_func=f_regression, k=90)
X_train_fs = selector.fit_transform(X_train_s, y_train)

In [39]:
model_config = config_defaults

# Start a W&B run for this feature-selection experiment.
wandb.init(
    project="202108_Kaggle_tabular_playground",
    save_code=True,
    tags=config_run['tags'],
    name=config_run['name'],
    notes=config_run['notes'],
    config=model_config)


# applying hold-out before scaling
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=model_config['test_size'], random_state=42)

# Fit the scaler on the training split only and reuse it for the validation
# split, so both splits share the same scale factors.
s = MaxAbsScaler()
X_train_s = s.fit_transform(X_train)
X_valid_s = s.transform(X_valid)

# (A VarianceThreshold pre-filter was tried at this point and left disabled.)

# Keep the 90 best-scoring features. `score_func` must be the callable
# `f_regression`, not its name as a string.
selector = SelectKBest(score_func=f_regression, k=90)
X_train_fs = selector.fit_transform(X_train_s, y_train)

In [40]:
model_config = config_defaults

# Start a W&B run; the model-training cells below log to it.
wandb.init(
    project="202108_Kaggle_tabular_playground",
    save_code=True,
    tags=config_run['tags'],
    name=config_run['name'],
    notes=config_run['notes'],
    config=model_config)


# applying hold-out before scaling
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=model_config['test_size'], random_state=42)

# Fit the scaler on the training split only and reuse it for the validation
# split. Re-fitting on the validation rows would scale the two splits with
# different factors.
s = MaxAbsScaler()
X_train_s = s.fit_transform(X_train)
X_valid_s = s.transform(X_valid)

# (A VarianceThreshold pre-filter was tried at this point and left disabled.)

# Keep the 90 features that score best with f_regression on the training split.
selector = SelectKBest(score_func=f_regression, k=90)
X_train_fs = selector.fit_transform(X_train_s, y_train)

In [41]:
selector?

In [42]:
# Apply the feature mask learned on the training split to the validation split.
X_valid_fs = selector.transform(X_valid_s)

# `config_defaults` is a plain dict, so its entries must be read with
# subscripts (attribute access raises AttributeError). `test_size` belongs to
# the hold-out split, not to the model, so it is not forwarded to XGBRegressor.
model = XGBRegressor(
    tree_method=config_defaults['tree_method'],
    booster=config_defaults['booster'],
    n_estimators=config_defaults['n_estimators'],
    max_depth=config_defaults['max_depth'],
    learning_rate=config_defaults['learning_rate'],
    subsample=1,
    random_state=config_defaults['random_state'],
    n_jobs=-1,
    verbosity=1,
)
wandb.log({'params': model.get_params()})
model.fit(X_train_fs, y_train, callbacks=[wandb.xgboost.wandb_callback()])
y_preds = model.predict(X_valid_fs)
mse = mean_squared_error(y_valid, y_preds)
rmse = math.sqrt(mse)  # MSE is non-negative, so no abs() is needed
wandb.log({'mse':mse, 'rmse':rmse})
print(f"MSE is {mse}\nRMSE is {rmse}")
# The W&B run is intentionally left open here: the next cell logs to the same
# run and closes it with wandb.finish(). (In the original this cell raised
# before reaching its own finish() call, so the run stayed open as well.)

In [43]:
# Apply the feature mask learned on the training split to the validation split.
X_valid_fs = selector.transform(X_valid_s)

# `test_size` belongs to the hold-out split, not to the model, so it is not
# forwarded to XGBRegressor.
model = XGBRegressor(
    tree_method=config_defaults['tree_method'],
    booster=config_defaults['booster'],
    n_estimators=config_defaults['n_estimators'],
    max_depth=config_defaults['max_depth'],
    learning_rate=config_defaults['learning_rate'],
    subsample=1,
    random_state=config_defaults['random_state'],
    n_jobs=-1,
    verbosity=1,
)
wandb.log({'params': model.get_params()})
model.fit(X_train_fs, y_train, callbacks=[wandb.xgboost.wandb_callback()])
y_preds = model.predict(X_valid_fs)
mse = mean_squared_error(y_valid, y_preds)
rmse = math.sqrt(mse)  # MSE is non-negative, so no abs() is needed
wandb.log({'mse':mse, 'rmse':rmse})
print(f"MSE is {mse}\nRMSE is {rmse}")
# Close the W&B run opened by the preceding feature-selection cell.
wandb.finish()

In [44]:
rmse < baseline_rmse

True

In [45]:
def train_fs(k, model_config=config_defaults):
    """Train an XGBRegressor on the `k` best features and log the run to W&B.

    Splits the module-level `X` / `y` into train and validation parts, scales
    with MaxAbsScaler, keeps the `k` features ranked highest by `f_regression`
    on the training part, fits the model and logs MSE / RMSE / k.

    Args:
        k: number of features for SelectKBest to keep.
        model_config: dict with `test_size`, `tree_method`, `booster`,
            `n_estimators`, `max_depth`, `learning_rate` and, optionally,
            `random_state` (seed for the model; 42 if absent).

    Returns:
        The validation RMSE as a float (previously nothing was returned).
    """
    wandb.init(
        project="202108_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=model_config)

    # applying hold-out before scaling; the split seed stays fixed so every
    # call is scored on the same validation rows
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=model_config['test_size'], random_state=42)

    # Fit the scaler on the training split only and reuse it for the validation
    # split, so both splits share the same scale factors.
    s = MaxAbsScaler()
    X_train_s = s.fit_transform(X_train)
    X_valid_s = s.transform(X_valid)

    # (A VarianceThreshold pre-filter was tried at this point and left disabled.)

    selector = SelectKBest(score_func=f_regression, k=k)
    X_train_fs = selector.fit_transform(X_train_s, y_train)
    X_valid_fs = selector.transform(X_valid_s)

    # Hyperparameters come from the `model_config` argument (the original read
    # the global `config_defaults`, silently ignoring a caller-supplied config).
    # `test_size` is a split setting and is not forwarded to the model.
    model = XGBRegressor(
        tree_method=model_config['tree_method'],
        booster=model_config['booster'],
        n_estimators=model_config['n_estimators'],
        max_depth=model_config['max_depth'],
        learning_rate=model_config['learning_rate'],
        subsample=1,
        random_state=model_config.get('random_state', 42),
        n_jobs=-1,
        verbosity=1,
    )
    wandb.log({'params': model.get_params()})
    model.fit(X_train_fs, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid_fs)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)  # MSE is non-negative, so no abs() is needed
    wandb.log({'mse':mse, 'rmse':rmse, 'kbest':k})
    print(f"MSE is {mse}\nRMSE is {rmse} with {k}-best features.")
    wandb.finish()
    return rmse

In [46]:
# Sweep k = 80 .. 100 inclusive; each call starts and finishes its own W&B run.
for k in range(80, 101):
    train_fs(k=k)

In [47]:
# Sweep k = 60 .. 79 inclusive; each call starts and finishes its own W&B run.
for k in range(60, 80):
    train_fs(k=k)