In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math
import seaborn as sns

from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

In [2]:
%matplotlib inline
%config Completer.use_jedi = False
# Record this notebook's filename in the WANDB_NOTEBOOK_NAME environment variable.
os.environ['WANDB_NOTEBOOK_NAME'] = 'inference_ensemble_20210826a.ipynb'

# Run-level metadata (name, tags, notes) that train() forwards to wandb.init().
config_run = {
    # Run name = notebook filename without its extension. Path.stem replaces the
    # former "[:-6]" slice, which only worked for a suffix of exactly six characters.
    'name': Path(os.environ['WANDB_NOTEBOOK_NAME']).stem,
    'tags': ['XGBoost', 'kfold', 'scaling'],
    'notes': 
        """Trying k-fold sans feature selection on a lower number of estimators, just to see what happens.""",
}

In [3]:
# Root folder of the competition data. Set the KAGGLE_TPS_DATA_DIR environment variable
# to point somewhere else; when it is unset, the original local mount point is used.
datapath = Path(os.environ.get(
    'KAGGLE_TPS_DATA_DIR',
    '/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/',
))

In [4]:
# One-off conversion from the competition CSV to feather, kept here for reference:
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

# Load the feather copy of the training table and name its index 'id'.
df = pd.read_feather('dataset_df.feather')
df.index = df.index.rename('id')

In [5]:
# def create_folds(data):
#     # placeholder value for the new column
#     data['kfold'] = -1 

#     # randomize the rows of the data
#     data = data.sample(frac=1).reset_index(drop=True)

#     # calculate number of bins by Sturges' rule (with floor)
#     num_bins = np.floor(1 + np.log2(len(data)))
    
#     # bin targets
#     data.loc[:, "bins"] = pd.cut(
#         data['loss'], bins=num_bins, labels=False
#     )
    
#     # initialize kfold class
#     kfold = StratifiedKFold(n_splits=5)
    
#     # fill the new kfold column, using bins
#     for f, (t_, v_) in enumerate(kfold.split(X=data, y=data.bins.values)):
#         data.loc[v_, 'kfold'] = f

#     # drop the bins column
#     data = data.drop("bins", axis=1)
    
#     # return dataframe with folds
#     return data

In [6]:
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)

# df_folds = create_folds(df)

# # save a new csv with a kfold column
# df_folds.to_csv(datapath/"train_folds.csv", index=False)

# # save a feather
# df_folds.to_feather(path='./dataset_df_kfold.feather')

In [7]:
# df = pd.read_feather(path='dataset_df_kfold.feather')
# df.index.name = 'id'

In [8]:
# Regression target: the 'loss' column of the training table.
y = df['loss']

In [9]:
# Every column except the target is used as a feature.
features = list(df.columns[df.columns != 'loss'])

In [10]:
# Feature table: all rows, feature columns only.
X = df.loc[:, features]

In [11]:
# Hyperparameters and experiment settings. The whole mapping is handed to train(),
# which passes it to wandb.init() and reads the model options from it.
config = dict(
    model=XGBRegressor,
    tree_method="auto",  # set to 'gpu_hist' to try GPU if available
    booster='gbtree',  # dart may be marginally better, but will opt for this quicker approach as a default
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.1522,
    test_size=0.2,
    scaler=MaxAbsScaler,
    # Feature selection is switched off for this run:
    # feature_selector=SelectKBest,
    # k_best=80,
    # feature_selection_scoring=f_regression,
    random_state=42,
    subsample=1,
    n_jobs=-1,
    verbosity=1,
    k_folds=5,
)

In [12]:
# Instantiate the scaler class named in config, fit it on the full feature table,
# and replace X with the scaled values.
# NOTE: the fit happens before the fold split, so validation rows also influence
# the scaling parameters.
scaler = config['scaler']()
scaler.fit(X)
X = scaler.transform(X)

In [13]:
# ACTUALLY probably better to save those as pickles or .npy files; I'll generate them later, regardless
# results = {} # for storing k-fold models' predictions

In [14]:
# Shuffled K-fold splitter; the fold count and the seed both come from config.
kfold = KFold(
    n_splits=config['k_folds'],
    random_state=config['random_state'],
    shuffle=True,
)

In [15]:
def train(X_train, X_valid, y_train, y_valid, config):
    """
    Fit one XGBRegressor on a training split, score it on a validation split,
    and record the whole thing as a single Weights & Biases run.

    Some choices are hard-coded rather than read from ``config``: the estimator
    is always ``XGBRegressor`` and the W&B project name is fixed. The run's
    name, tags and notes come from the module-level ``config_run`` dict.

    Scaling is expected to have been applied to the features before calling
    this function; no scaling or feature selection happens here.

    :param X_train: features passed to ``model.fit``
    :param X_valid: features passed to ``model.predict`` for scoring
    :param y_train: targets passed to ``model.fit``
    :param y_valid: targets compared against the validation predictions
    :param config: dict logged to W&B as the run config; the keys
        ``tree_method``, ``booster``, ``n_estimators``, ``max_depth``,
        ``learning_rate``, ``subsample``, ``random_state``, ``n_jobs`` and
        ``verbosity`` are read to build the model
    :return: the fitted ``XGBRegressor``
    """
    wandb.init(
        project="202108_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=config)

    # The finally clause closes the W&B run even when fitting or scoring raises,
    # so a failed fold does not leave a run open for the next wandb.init() call.
    try:
        model = XGBRegressor(
            tree_method=config['tree_method'],
            booster=config['booster'],
            n_estimators=config['n_estimators'],
            max_depth=config['max_depth'],
            learning_rate=config['learning_rate'],
            subsample=config['subsample'],
            random_state=config['random_state'],
            n_jobs=config['n_jobs'],
            verbosity=config['verbosity'],
        )
        model.fit(X_train, y_train, callbacks=[wandb_callback()])

        y_preds = model.predict(X_valid)
        mse = mean_squared_error(y_valid, y_preds)
        rmse = math.sqrt(mse)  # MSE is never negative, so no abs() is needed
        wandb.log({'mse':mse, 'rmse':rmse})
        print(f"MSE is {mse}\nRMSE is {rmse}")
    finally:
        wandb.finish()
    return model
    

In [16]:
# Fitted models keyed by fold number; filled in by the training loop.
models = dict()

In [17]:
# Output folder for this run's per-fold models: ./models/<run name>_<k>folds/
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
# parents=True also creates ./models when it is missing, so this cell works on a
# fresh checkout instead of raising FileNotFoundError.
model_path.mkdir(parents=True, exist_ok=True)

In [18]:
# NOTE(review): this cell repeats the previous one. Re-running it is harmless
# because mkdir(..., exist_ok=True) is idempotent; consider deleting one copy.
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
# parents=True also creates ./models when it is missing, so this cell works on a
# fresh checkout instead of raising FileNotFoundError.
model_path.mkdir(parents=True, exist_ok=True)

In [19]:
# Train one model per fold, keeping each both in memory (models) and on disk (model_path).
for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
    print(f"FOLD {fold}")
    print("-----------------------------------------")
    X_train, X_valid = X[train_ids], X[valid_ids]
    # kfold.split yields positional row numbers, so select target rows by position
    # (.iloc); plain y[...] would look the numbers up as index labels instead.
    y_train, y_valid = y.iloc[train_ids], y.iloc[valid_ids]
    # train() opens and closes its own W&B run, so no wandb.finish() is needed here.
    model = train(X_train, X_valid, y_train, y_valid, config)
    models[fold] = model
    dump(model, model_path/f"xgboost_fold{fold}_model.joblib")