In [1]:
%matplotlib inline
%config Completer.use_jedi = False
import os  # imported here because this cell runs before the notebook's main import cell

os.environ['WANDB_NOTEBOOK_NAME'] = 'inference_single_20210824.ipynb'
# Run metadata (name, tags, notes) for the W&B runs started from this notebook.
config_run = {
    'name': os.path.splitext(os.environ['WANDB_NOTEBOOK_NAME'])[0], # notebook filename without its extension (the stem)
    'tags': ['XGBoost', 'baseline', 'feature_selection', 'scaling'],
    'notes': 
        """Notebook for doing inference only -- experiments are conducted elsewhere. Model is as stated; `MaxAbsScaler` and `SelectKBest(k=80)` seem to be the best options as of 20210824. (Though model hyperparams haven't been fine-tuned with the scaler and the feature selector as of yet.)
- Not doing an ensemble just yet.""",
}

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold#, KFold
from sklearn.metrics import mean_squared_error
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math
import seaborn as sns

from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest, f_regression

# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
# NOTE(review): this cell repeats the notebook's first cell; it re-declares the same run metadata.
os.environ['WANDB_NOTEBOOK_NAME'] = 'inference_single_20210824.ipynb'
# Run metadata (name, tags, notes) for the W&B runs started from this notebook.
config_run = {
    'name': Path(os.environ['WANDB_NOTEBOOK_NAME']).stem, # notebook filename without its extension
    'tags': ['XGBoost', 'baseline', 'feature_selection', 'scaling'],
    'notes': 
        """Notebook for doing inference only -- experiments are conducted elsewhere. Model is as stated; `MaxAbsScaler` and `SelectKBest(k=80)` seem to be the best options as of 20210824. (Though model hyperparams haven't been fine-tuned with the scaler and the feature selector as of yet.)
- Not doing an ensemble just yet.""",
}

In [4]:
# Root folder of the competition data. Set the KAGGLE_DATA_DIR environment variable to
# point elsewhere; the fallback is the original hard-coded local path.
datapath = Path(os.environ.get('KAGGLE_DATA_DIR', '/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/'))

In [5]:
# Presumably the one-off conversion that produced the feather file (kept for reference):
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

# Load the feather copy of the training table and name its index 'id'.
df = pd.read_feather('dataset_df.feather')
df.index = df.index.rename('id')

In [6]:
# Regression target: the 'loss' column.
y = df['loss']

In [7]:
# Every column except the target is used as a model input.
features = [column for column in df.columns if column != 'loss']

In [8]:
# Feature matrix: all rows, input columns only.
X = df.loc[:, features]

In [9]:
# Experiment settings: XGBoost hyperparameters plus the preprocessing choices
# (scaler class, feature-selector class, scoring function, number of features kept).
config = dict(
    library="xgboost",
    tree_method="auto",  # set to 'gpu_hist' to try GPU if available
    booster='gbtree',  # dart may be marginally better, but will opt for this quicker approach as a default
    n_estimators=100,  # a very low number -- optimal is probably 300ish -- but this will be quicker
    max_depth=3,
    learning_rate=0.1,
    test_size=0.2,
    scaler=MaxAbsScaler,
    feature_selector=SelectKBest,
    k_best=80,
    feature_selection_scoring=f_regression,
    random_state=42,
)

In [10]:
# Peek at the configured scaler: the config stores the scaler *class* itself, not an instance.
config['scaler']

sklearn.preprocessing._data.MaxAbsScaler

In [11]:
# IPython help lookup (`?`) on the configured scaler entry -- interactive exploration only.
?config['scaler']

In [12]:
# Bind the configured scaler class to a plain name, then show its IPython help (`?`).
s = config['scaler']
?s

In [13]:
# Full experiment configuration: logged to WandB and read key-by-key by `train`.
config = {
    # --- model (XGBoost) ---
    'library': 'xgboost',
    'tree_method': 'auto',   # set to 'gpu_hist' to try GPU if available
    'booster': 'gbtree',     # dart may be marginally better, but will opt for this quicker approach as a default
    'n_estimators': 100,     # a very low number -- optimal is probably 300ish -- but this will be quicker
    'max_depth': 3,
    'learning_rate': 0.1,
    # --- hold-out split ---
    'test_size': 0.2,
    # --- preprocessing: classes / scoring function, instantiated later ---
    'scaler': MaxAbsScaler,
    'feature_selector': SelectKBest,
    'k_best': 80,
    'feature_selection_scoring': f_regression,
    # --- reproducibility and runtime ---
    'random_state': 42,
    'subsample': 1,
    'n_jobs': -1,
    'verbosity': 1,
}

In [14]:
def train(config):#, scaler): # passed in via config dict for now
    """
    Train and validate one XGBoost regressor on a hold-out split, logging to WandB.

    Steps: hold-out split of the notebook-level `X` / `y` -> scaling -> univariate
    feature selection -> `XGBRegressor` fit -> MSE / RMSE on the validation split.
    The scaler and the selector are fitted on the training split only and are then
    merely applied to the validation split.

    Note that some options are hard-coded rather than taken from the argument: the
    WandB project name, and the run tags / name / notes, which come from the
    notebook-level `config_run` dict.

    :param config: dict with things to be logged in WandB, some to be used in function.
        Keys read here: 'test_size', 'random_state', 'scaler' (a scaler *class*,
        instantiated with no arguments), 'feature_selector' (a selector *class*),
        'feature_selection_scoring', 'k_best', 'tree_method', 'booster',
        'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'n_jobs',
        'verbosity'.
    :return: None; the metrics are printed and logged to WandB.
    """
    wandb.init(
        project="202108_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=config)

    try:
        # applying hold-out before scaling
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y,
            test_size=config['test_size'],
            random_state=config['random_state'],
        )

        # scaling (i.e. normalizing): the config holds the scaler *class*, so instantiate it;
        # fit on the training split only and reuse those statistics for validation
        scaler = config['scaler']()
        X_train_s = scaler.fit_transform(X_train)
        X_valid_s = scaler.transform(X_valid)

        # selecting features
        selector = config['feature_selector'](score_func=config["feature_selection_scoring"],
                                              k=config['k_best'])
        X_train_fs = selector.fit_transform(X_train_s, y_train)
        X_valid_fs = selector.transform(X_valid_s) # ensures same features are used in validation

        # 'test_size' describes the hold-out split, not the model, so it is not passed on
        model = XGBRegressor(
            tree_method=config['tree_method'],
            booster=config['booster'],
            n_estimators=config['n_estimators'],
            max_depth=config['max_depth'],
            learning_rate=config['learning_rate'],
            subsample=config['subsample'],
            random_state=config['random_state'],
            n_jobs=config['n_jobs'],
            verbosity=config['verbosity'],
        )
        model.fit(X_train_fs, y_train, callbacks=[wandb.xgboost.wandb_callback()])
        y_preds = model.predict(X_valid_fs)
        mse = mean_squared_error(y_valid, y_preds)
        rmse = math.sqrt(mse)
        wandb.log({'mse':mse, 'rmse':rmse})
        print(f"MSE is {mse}\nRMSE is {rmse}")
    finally:
        # always close the WandB run, even when training fails part-way
        wandb.finish()
    

In [15]:
# Run one train/validation pass with the settings in `config` (metrics are printed and logged to WandB).
train(config)

In [16]:
# Pull the configured scaler out of the config for inspection (this is the class, not an instance).
scaler = config['scaler']

In [17]:
# Display what `scaler` currently refers to.
scaler

sklearn.preprocessing._data.MaxAbsScaler

In [18]:
def train(config):#, scaler): # passed in via config dict for now
    """
    Train and validate one XGBoost regressor on a hold-out split, logging to WandB.

    Steps: hold-out split of the notebook-level `X` / `y` -> scaling -> univariate
    feature selection -> `XGBRegressor` fit -> MSE / RMSE on the validation split.
    The scaler and the selector are fitted on the training split only and are then
    merely applied to the validation split.

    Note that some options are hard-coded rather than taken from the argument: the
    WandB project name, and the run tags / name / notes, which come from the
    notebook-level `config_run` dict.

    :param config: dict with things to be logged in WandB, some to be used in function.
        Keys read here: 'test_size', 'random_state', 'scaler' (a scaler *class*,
        instantiated with no arguments), 'feature_selector' (a selector *class*),
        'feature_selection_scoring', 'k_best', 'tree_method', 'booster',
        'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'n_jobs',
        'verbosity'.
    :return: None; the metrics are printed and logged to WandB.
    """
    wandb.init(
        project="202108_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=config)

    try:
        # applying hold-out before scaling
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y,
            test_size=config['test_size'],
            random_state=config['random_state'],
        )

        # scaling (i.e. normalizing): fit on the training split only and reuse
        # those statistics for validation
        scaler = config['scaler']()
        X_train_s = scaler.fit_transform(X_train)
        X_valid_s = scaler.transform(X_valid)

        # selecting features
        selector = config['feature_selector'](score_func=config["feature_selection_scoring"],
                                              k=config['k_best'])
        X_train_fs = selector.fit_transform(X_train_s, y_train)
        X_valid_fs = selector.transform(X_valid_s) # ensures same features are used in validation

        # 'test_size' describes the hold-out split, not the model, so it is not passed on
        model = XGBRegressor(
            tree_method=config['tree_method'],
            booster=config['booster'],
            n_estimators=config['n_estimators'],
            max_depth=config['max_depth'],
            learning_rate=config['learning_rate'],
            subsample=config['subsample'],
            random_state=config['random_state'],
            n_jobs=config['n_jobs'],
            verbosity=config['verbosity'],
        )
        model.fit(X_train_fs, y_train, callbacks=[wandb.xgboost.wandb_callback()])
        y_preds = model.predict(X_valid_fs)
        mse = mean_squared_error(y_valid, y_preds)
        rmse = math.sqrt(mse)
        wandb.log({'mse':mse, 'rmse':rmse})
        print(f"MSE is {mse}\nRMSE is {rmse}")
    finally:
        # always close the WandB run, even when training fails part-way
        wandb.finish()
    

In [19]:
# Run one train/validation pass with the settings in `config` (metrics are printed and logged to WandB).
train(config)

In [20]:
def train(config):#, scaler): # passed in via config dict for now
    """
    Train and validate one XGBoost regressor on a hold-out split, logging to WandB.

    Steps: hold-out split of the notebook-level `X` / `y` -> scaling -> univariate
    feature selection -> `XGBRegressor` fit -> MSE / RMSE on the validation split.
    The scaler and the selector are fitted on the training split only and are then
    merely applied to the validation split.

    Note that some options are hard-coded rather than taken from the argument: the
    WandB project name, and the run tags / name / notes, which come from the
    notebook-level `config_run` dict.

    :param config: dict with things to be logged in WandB, some to be used in function.
        Keys read here: 'test_size', 'random_state', 'scaler' (a scaler *class*,
        instantiated with no arguments), 'feature_selector' (a selector *class*),
        'feature_selection_scoring', 'k_best', 'tree_method', 'booster',
        'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'n_jobs',
        'verbosity'.
    :return: None; the metrics are printed and logged to WandB.
    """
    wandb.init(
        project="202108_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=config)

    try:
        # applying hold-out before scaling
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y,
            test_size=config['test_size'],
            random_state=config['random_state'],
        )

        # scaling (i.e. normalizing): fit on the training split only and reuse
        # those statistics for validation (re-fitting on the validation split
        # would scale the two splits inconsistently)
        scaler = config['scaler']()
        X_train_s = scaler.fit_transform(X_train)
        X_valid_s = scaler.transform(X_valid)

        # selecting features
        selector = config['feature_selector'](score_func=config["feature_selection_scoring"],
                                              k=config['k_best'])
        X_train_fs = selector.fit_transform(X_train_s, y_train)
        X_valid_fs = selector.transform(X_valid_s) # ensures same features are used in validation

        # 'test_size' describes the hold-out split, not the model, so it is not passed on
        model = XGBRegressor(
            tree_method=config['tree_method'],
            booster=config['booster'],
            n_estimators=config['n_estimators'],
            max_depth=config['max_depth'],
            learning_rate=config['learning_rate'],
            subsample=config['subsample'],
            random_state=config['random_state'],
            n_jobs=config['n_jobs'],
            verbosity=config['verbosity'],
        )
        model.fit(X_train_fs, y_train, callbacks=[wandb.xgboost.wandb_callback()])
        y_preds = model.predict(X_valid_fs)
        mse = mean_squared_error(y_valid, y_preds)
        rmse = math.sqrt(mse)
        wandb.log({'mse':mse, 'rmse':rmse})
        print(f"MSE is {mse}\nRMSE is {rmse}")
    finally:
        # always close the WandB run, even when training fails part-way
        wandb.finish()
    

In [21]:
# Run one train/validation pass with the settings in `config` (metrics are printed and logged to WandB).
train(config)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…