In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math
import seaborn as sns
from datetime import datetime

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

In [2]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = 'XGBoost_ensemble_20210830.ipynb'
config = {
    # model config
    "model":XGBRegressor,
    "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
    "booster": 'dart', # dart may be marginally better, but will opt for this quicker approach as a default
    "n_estimators": 400, 
    "max_depth": 3,
    "learning_rate": 0.1522,
    "test_size": 0.2,
    "scaler": MaxAbsScaler,
#     "task_type": "GPU", # for CatBoost only
#     "reg_alpha": 2.8,
#     "reg_lambda": 3.987,
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
    'subsample': 1,
    'n_jobs': -1,
    'verbosity': 1,
    'k_folds': 5,
    'features_created': True,
    'feature_creator': PolynomialFeatures,
}

config_run = {
    # wandb config:
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['XGBoost', 'kfold', 'scaling', 'feature-creation', 'feature-selection'],
    'notes': "Running most of the parameters from the run j7tlo010 from the sweep b2zv3fsy, the most successful yet; just increasing n_estimators from 400 to 1000 and adding k-fold, feature generation/selection, etc",
}

In [3]:
# Root folder of the competition data. Set the TPS_202108_DATA_DIR environment variable to
# point elsewhere on another machine; without it the original local path is used.
datapath = Path(os.environ.get('TPS_202108_DATA_DIR', '/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/'))

In [4]:
# Presumably the one-off conversion that created the feather file read below (kept for provenance):
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

# Read the cached, unaltered dataset and label its index as 'id'
df = pd.read_feather('dataset_df.feather').rename_axis('id')

In [5]:
# regression target: the 'loss' column of the dataset
y = df['loss']

In [6]:
# (disabled) load the precomputed polynomial features that were generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [7]:
# X.columns

In [8]:
# Feature matrix from the unaltered dataset: every column except the 'loss' target
features = [column for column in df.columns if column != 'loss']
X = df.loc[:, features]

In [9]:
# Generate polynomial features up to degree 2 (bias column and non-interaction terms included)
poly = PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=True,
)
X_poly = poly.fit_transform(X)

In [10]:
# Names of the generated polynomial columns, in the column order of X_poly.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `get_feature_names_out`; use the new method when the installed version provides it.
if hasattr(poly, 'get_feature_names_out'):
    X_poly_names = poly.get_feature_names_out(X.columns).tolist()
else:
    X_poly_names = poly.get_feature_names(X.columns)
# X_poly_names[100:150]
# Candidate feature names are the CSV header; nrows=0 reads only that header row
# instead of parsing the whole file.
features = pd.read_csv('X_candidates_20210827.csv', nrows=0).columns

In [11]:
# Sanity check: for each candidate feature, is it among the generated polynomial columns?
# A set gives constant-time membership tests instead of scanning the name list per feature.
_poly_name_set = set(X_poly_names)
checks = [feature in _poly_name_set for feature in features]
# checks

In [12]:
# wrap the polynomial feature array in a DataFrame so columns can be selected by name
X = pd.DataFrame(data=X_poly, columns=X_poly_names)

In [13]:
# Keep only the candidate columns; the first entry of `features` is skipped
# (presumably a non-feature column such as a saved index -- TODO confirm)
X = X.loc[:, features[1:]]

In [14]:
# Instantiate the scaler class named in the config, then fit it on and apply it to all of X
scaler_cls = config['scaler']
scaler = scaler_cls()
X_scaled = scaler.fit_transform(X)
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [15]:
# ACTUALLY probably better to save those as pickles or .npy files; I'll generate them later, regardless
# results = {} # for storing k-fold models' predictions

In [16]:
# K-fold splitter used by the training loop: shuffled, seeded from the config
kfold = KFold(
    n_splits=config['k_folds'],
    shuffle=True,
    random_state=config['random_state'],
)

In [17]:
def train(X_train, X_valid, y_train, y_valid, config, run_config=None):
    """
    Fit one XGBRegressor on a train/validation split and log the run to Weights & Biases.

    A new wandb run is started on every call. On success the run is deliberately left
    open so that the caller can log extra values (e.g. the fold number) before calling
    `wandb.finish()` itself. If fitting or evaluation fails, the run is closed here
    with a non-zero exit code before the error is re-raised.

    Notes:
    - The model class is hard-coded to XGBRegressor; `config['model']` is only logged,
      not used in this function.
    - No scaling or feature selection happens here; the inputs are used as given.
      (The notebook scales the full dataset once before splitting, which the author
      noted is not strictly correct but faster and acceptable in this context.)

    :param X_train: feature matrix used to fit the model
    :param X_valid: feature matrix used to compute the validation error
    :param y_train: targets matching `X_train`
    :param y_valid: targets matching `X_valid`
    :param config: dict logged as the wandb run config; the keys 'tree_method', 'booster',
        'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'random_state',
        'n_jobs' and 'verbosity' are forwarded to XGBRegressor
    :param run_config: optional dict with the wandb run metadata ('tags', 'name', 'notes');
        defaults to the notebook-level `config_run`
    :return: the fitted XGBRegressor
    """
    if run_config is None:
        # fall back to the notebook-level run metadata, as earlier versions always did
        run_config = config_run

    wandb.init(
        project="202108_Kaggle_tabular_playground",
        save_code=True,
        tags=run_config['tags'],
        name=run_config['name'],
        notes=run_config['notes'],
        config=config)

    try:
        model = XGBRegressor(
            tree_method=config['tree_method'],
            booster=config['booster'],
            n_estimators=config['n_estimators'],
            max_depth=config['max_depth'],
            learning_rate=config['learning_rate'],
            subsample=config['subsample'],
            random_state=config['random_state'],
            n_jobs=config['n_jobs'],
            verbosity=config['verbosity'],
        )
        # NOTE(review): no eval_set is passed to fit(), so the wandb callback presumably has
        # no per-iteration metrics to report -- confirm whether that is intended.
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
        y_preds = model.predict(X_valid)
        mse = mean_squared_error(y_valid, y_preds)
        rmse = math.sqrt(mse)  # MSE is never negative, so no abs() is needed
        wandb.log({'mse':mse, 'rmse':rmse})
        print(f"MSE is {mse}\nRMSE is {rmse}")
    except BaseException:
        # Do not leave a dangling run behind on failure (including an interrupted cell);
        # a non-zero exit code marks the run as failed.
        wandb.finish(exit_code=1)
        raise
    return model
    

In [18]:
# fold number -> fitted model, filled in by the k-fold training loop
models = dict()

In [19]:
# Directory that receives the per-fold model files of this run
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
# parents=True also creates ./models/ itself when it does not exist yet,
# instead of failing with FileNotFoundError on a fresh checkout
model_path.mkdir(parents=True, exist_ok=True)

In [20]:
# Train one model per fold. Each fold is logged as its own wandb run, kept in `models`
# and saved to `model_path`.
for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
    print(f"FOLD {fold}")
    print("-----------------------------------------")
    # X_scaled is expected to be a numpy.ndarray, so the fold positions index it directly
    X_train, X_valid = X_scaled[train_ids], X_scaled[valid_ids]
    # KFold yields row positions, not index labels: .iloc keeps this correct for any index on y
    y_train, y_valid = y.iloc[train_ids], y.iloc[valid_ids]
    try:
        model = train(X_train, X_valid, y_train, y_valid, config)
        wandb.log({'fold': fold})
        models[fold] = model
        dump(model, model_path/f"xgboost_fold{fold}_model.joblib")
    except BaseException:
        # close the fold's run as failed so it does not linger into the next attempt
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()