In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math
import seaborn as sns
from datetime import datetime

from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

In [2]:
%matplotlib inline
%config Completer.use_jedi = False
# Notebook filename; also reused below to build the WandB run name.
os.environ['WANDB_NOTEBOOK_NAME'] = 'feature_selection_test_20210827.ipynb'

# Hyperparameters and experiment switches for this notebook.
config = dict(
    # model config
    wandb=False,
    model=XGBRegressor,
    tree_method="auto",  # set to 'gpu_hist' to try GPU if available
    booster='gbtree',  # dart may be marginally better, but will opt for this quicker approach as a default
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1522,
    test_size=0.2,
    scaler=MaxAbsScaler,
    # feature_selector=SelectKBest,
    # k_best=80,
    # feature_selection_scoring=f_regression,
    random_state=42,
    subsample=1,
    n_jobs=-1,
    verbosity=1,
    k_folds=1,
    features_created=False,
    feature_creator=None,
)

# Metadata attached to the WandB run.
config_run = dict(
    # wandb config:
    # name = notebook filename without its 6-character ".ipynb" suffix, plus an HHMMSS stamp
    name="{}_{}".format(
        os.environ['WANDB_NOTEBOOK_NAME'][:-6],
        datetime.now().strftime('%H%M%S'),
    ),
    tags=['XGBoost', 'kfold', 'scaling', 'baseline'],
    notes="Control run for 50-estimator run with features created with PolynomialFeatures",
)

In [3]:
# Root directory of the competition data. The default is the original author's
# mount point; set KAGGLE_TPS_DATA_DIR to point at the data on another machine.
datapath = Path(os.environ.get(
    'KAGGLE_TPS_DATA_DIR',
    '/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/',
))

In [4]:
# Earlier CSV -> feather conversion, kept for reference
# (presumably how 'dataset_df.feather' was produced; verify before relying on it):
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

# Read the unaltered dataset from the feather file and name its index "id".
df = pd.read_feather('dataset_df.feather').rename_axis(index='id')

In [5]:
# Regression target: the `loss` column.
y = df['loss']

In [6]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [7]:
# X.columns

In [8]:
# prep features from unaltered dataset
features = [x for x in df.columns if x != 'loss']
X = df[features]

In [9]:
# Degree-2 expansion of the raw inputs: bias column, squares and pairwise products.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
poly.fit(X)
X_poly = poly.transform(X)

In [10]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations.
# The result is a plain list of strings in both branches.
if hasattr(poly, 'get_feature_names_out'):
    X_poly_names = list(poly.get_feature_names_out(X.columns))
else:
    X_poly_names = poly.get_feature_names(X.columns)

In [11]:
# Wrap the expanded matrix in a frame so each generated column carries its name.
X = pd.DataFrame(data=X_poly, columns=X_poly_names)

In [12]:
scaler = config['scaler']()
# `fit_transform` returns a bare ndarray; re-wrap it so X keeps its column
# labels and index, which later cells use (`X.columns`, `X[feature_candidates]`).
# NOTE: the scaler is fitted on all rows before the hold-out split, so the
# validation rows influence the scaling. `train` documents this as a
# deliberate speed trade-off.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [13]:
def train(X_train, X_valid, y_train, y_valid, config, run_config=None):
    """
    Fit an XGBRegressor on the training split and report hold-out MSE / RMSE.

    The estimator class is hard-coded to ``XGBRegressor`` (``config['model']``
    is not consulted); only the hyperparameters listed below are read from
    ``config``. No splitting, scaling or feature selection happens here:
    callers pass in splits that are already prepared. Scaling before the split
    is, strictly speaking, leaky, but was chosen because it is faster and
    considered fine in this context.

    :param X_train: training features
    :param X_valid: validation features
    :param y_train: training targets
    :param y_valid: validation targets
    :param config: dict of settings. Keys read here: 'wandb', 'tree_method',
        'booster', 'n_estimators', 'max_depth', 'learning_rate', 'subsample',
        'random_state', 'n_jobs', 'verbosity'. When 'wandb' is truthy the whole
        dict is also passed to ``wandb.init`` as the run config.
    :param run_config: optional dict with 'tags', 'name' and 'notes' for the
        WandB run. Defaults to the module-level ``config_run``.
    :return: the fitted XGBRegressor
    """
    use_wandb = config['wandb']

    if use_wandb:
        if run_config is None:
            # Backward-compatible default: the notebook-level run metadata.
            run_config = config_run
        wandb.init(
            project="202108_Kaggle_tabular_playground",
            save_code=True,
            tags=run_config['tags'],
            name=run_config['name'],
            notes=run_config['notes'],
            config=config)

    try:
        # Build the model from the hyperparameters in `config`.
        model = XGBRegressor(
            tree_method=config['tree_method'],
            booster=config['booster'],
            n_estimators=config['n_estimators'],
            max_depth=config['max_depth'],
            learning_rate=config['learning_rate'],
            subsample=config['subsample'],
            random_state=config['random_state'],
            n_jobs=config['n_jobs'],
            verbosity=config['verbosity'],
        )

        if use_wandb:
            model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
        else:
            model.fit(X_train, y_train)

        y_preds = model.predict(X_valid)
        mse = mean_squared_error(y_valid, y_preds)
        rmse = math.sqrt(mse)  # MSE is non-negative, so no abs() is needed
        print(f"MSE is {mse}\nRMSE is {rmse}")

        if use_wandb:
            wandb.log({'mse': mse, 'rmse': rmse})
    except BaseException:
        # Do not leave a dangling run open if training or evaluation fails;
        # a non-zero exit code marks the run as failed.
        if use_wandb:
            wandb.finish(exit_code=1)
        raise

    if use_wandb:
        wandb.finish()
    return model
    

In [14]:
# Hold-out split; size and seed come from `config`.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=config['test_size'],
    random_state=config['random_state'],
)

model = train(X_train, X_valid, y_train, y_valid, config)

# parents=True: without it mkdir raises FileNotFoundError when ./models
# itself does not exist yet (e.g. on a fresh checkout).
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
model_path.mkdir(parents=True, exist_ok=True)
dump(model, model_path / "xgboost_holdout_model.joblib")

['models/feature_selection_test_20210827_123539_1folds/xgboost_holdout_model.joblib']

In [15]:
model.feature_importances_

array([0.0000000e+00, 1.5566899e-05, 0.0000000e+00, ..., 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00], dtype=float32)

In [16]:
# Label each importance with the name produced by PolynomialFeatures.
# Using X_poly_names avoids depending on X still being a DataFrame here.
fi = pd.Series(model.feature_importances_, index=X_poly_names)

In [17]:
# Importance of each generated feature, indexed by feature name.
fi = pd.Series(model.feature_importances_, index=X_poly_names)

In [18]:
# Importance of each generated feature, indexed by feature name.
fi = pd.Series(data=model.feature_importances_, index=X_poly_names)

In [19]:
# Rank the features from most to least important.
fi_sorted = fi.sort_values(ascending=False)

In [20]:
# Lift the row limit so long Series/DataFrames are printed in full.
pd.set_option('display.max_rows', None)

In [21]:
fi_sorted

f25 f52    0.020431
f77^2      0.012733
f25        0.011736
f13 f52    0.011625
f25 f69    0.010858
f58 f66    0.010317
f41 f58    0.010305
f13 f46    0.009665
f3 f13     0.008993
f13 f53    0.008776
f31 f74    0.008750
f2 f41     0.008423
f9 f81     0.008403
f12 f81    0.008257
f70 f74    0.007701
f81^2      0.007576
f66 f73    0.007076
f66 f69    0.006965
f25 f96    0.006532
f28 f46    0.006239
f69 f73    0.006145
f58 f81    0.006046
f3 f46     0.006023
f66 f96    0.005936
f64 f79    0.005798
f21 f52    0.005718
f25 f34    0.005686
f27 f52    0.005661
f51 f78    0.005654
f13 f65    0.005638
f25 f58    0.005546
f39 f74    0.005534
f3 f96     0.005395
f55 f96    0.005393
f58 f69    0.005390
f44 f52    0.005263
f31 f81    0.005193
f21 f25    0.005116
f93        0.005018
f80        0.004830
f50 f58    0.004812
f1 f69     0.004751
f46 f48    0.004747
f1 f76     0.004693
f43 f69    0.004682
f31 f84    0.004563
f46 f84    0.004561
f28 f64    0.004485
f2^2       0.004465
f1 f53     0.004449


In [22]:
# Keep only the features whose importance is strictly above 0.001.
fi_sorted_candidates = fi_sorted.loc[fi_sorted.gt(0.001)]

In [23]:
len(fi_sorted_candidates)

288

In [24]:
# Persist the shortlisted features together with their importances.
fi_sorted_candidates.to_csv(path_or_buf='features_deg2_fi_gt_0-001.csv')

In [25]:
# Names of the shortlisted features (Series.keys() is the Series' index).
feature_candidates = fi_sorted_candidates.keys()

In [26]:
# Names of the shortlisted features, displayed for inspection.
feature_candidates = fi_sorted_candidates.keys()
feature_candidates

Index(['f25 f52', 'f77^2', 'f25', 'f13 f52', 'f25 f69', 'f58 f66', 'f41 f58',
       'f13 f46', 'f3 f13', 'f13 f53',
       ...
       'f29 f93', 'f61 f91', 'f27 f48', 'f32 f74', 'f51 f64', 'f3 f59',
       'f21 f67', 'f15 f94', 'f84 f89', 'f26 f30'],
      dtype='object', length=288)

In [27]:
# Select the shortlisted columns through a labelled frame: a bare ndarray
# cannot be indexed by feature name.
X_candidates = pd.DataFrame(X, columns=X_poly_names)[feature_candidates]

In [28]:
# Labelled view of the scaled feature matrix.
X_df = pd.DataFrame(data=X, columns=X_poly_names)

In [29]:
X_df.head()

     1        f0        f1        f2        f3        f4        f5        f6  \
0  1.0 -0.002192  0.216117  0.078490 -0.803754  0.001117  0.475674  0.032054   
1  1.0  0.731726  0.531136 -0.047483 -0.315691  0.721974  0.099078  0.169455   
2  1.0  0.296451  0.069597 -0.044282 -0.227739  0.036548  0.561973  0.032838   
3  1.0  0.196585  0.062271 -0.063106  0.563247 -0.003151  0.116379  0.195596   
4  1.0  0.410114  0.073260  0.099106 -0.055080  0.001963  0.350832  0.076214   

         f7        f8  ...     f96^2   f96 f97   f96 f98   f96 f99     f97^2  \
0  0.305884  0.000780  ...  0.107529  0.311124  0.177084  0.112140  0.698859   
1  0.199853  0.007430  ...  0.021583  0.066241  0.081690  0.005478  0.157830   
2 -0.124389  0.068476  ...  0.158047  0.304203  0.158620  0.159486  0.454555   
3  0.333884 -0.094511  ...  0.222009  0.298620  0.161368  0.154051  0.311825   
4 -0.056471  0.048525  ...  0.155064  0.334358  0.165270  0.090787  0.559704   

    f97 f98   f97 f99     f98^2   f98 

In [30]:
# Index the labelled frame X_df rather than X, which cannot be indexed by
# feature name when it is a bare ndarray.
X_candidates = X_df[feature_candidates]
X_candidates.head()

In [31]:
# Keep only the shortlisted feature columns, then preview the first rows.
X_candidates = X_df.loc[:, feature_candidates]
X_candidates.head(5)

    f25 f52     f77^2       f25   f13 f52   f25 f69   f58 f66   f41 f58  \
0 -0.013632  0.000648  0.211775 -0.011511  0.014628  0.164931  0.177589   
1  0.300520  0.000591  0.359110  0.015517  0.033837  0.082567  0.053160   
2  0.576829  0.001288  0.564520  0.209392  0.044458  0.257001  0.146870   
3  0.019243  0.000241  0.187012  0.020118  0.017051  0.331025  0.095483   
4 -0.078783  0.009632  0.319205 -0.114307 -0.051615  0.179558  0.130038   

    f13 f46    f3 f13   f13 f53  ...   f29 f93   f61 f91   f27 f48   f32 f74  \
0  0.049392 -0.178317  0.140309  ...  0.090836 -0.000353 -0.031811 -0.007742   
1  0.013522 -0.007262  0.010177  ...  0.032727 -0.001632 -0.212345 -0.005740   
2  0.047144 -0.057899  0.157483  ...  0.246486  0.000087  0.101054  0.069513   
3  0.048812  0.136626  0.140518  ...  0.065283  0.000069 -0.171226  0.114613   
4  0.018112 -0.031648  0.300555  ...  0.150962  0.003546 -0.022264 -0.060327   

    f51 f64    f3 f59   f21 f67   f15 f94   f84 f89   f26 f30  
0  0

In [32]:
X_df['f25'].head()

0    0.211775
1    0.359110
2    0.564520
3    0.187012
4    0.319205
Name: f25, dtype: float64

In [33]:
# Persist the reduced feature matrix for reuse outside this notebook.
X_candidates.to_csv(path_or_buf='X_candidates_20210827.csv')

In [34]:
X_candidates.shape

(250000, 288)

In [35]:
# Switch WandB logging on for subsequent training runs.
config.update(wandb=True)

In [36]:
# Settings for the run on the selected polynomial features (WandB logging on).
config = dict(
    # model config
    wandb=True,
    model=XGBRegressor,
    tree_method="auto",  # set to 'gpu_hist' to try GPU if available
    booster='gbtree',  # dart may be marginally better, but will opt for this quicker approach as a default
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1522,
    test_size=0.2,
    scaler=MaxAbsScaler,
    # feature_selector=SelectKBest,
    # k_best=80,
    # feature_selection_scoring=f_regression,
    random_state=42,
    subsample=1,
    n_jobs=-1,
    verbosity=1,
    k_folds=1,
    features_created=True,
    feature_creator=PolynomialFeatures,
    feature_selection='manual by feature importance',
)

# Metadata attached to the WandB run; the name gets a fresh HHMMSS stamp.
config_run = dict(
    # wandb config:
    # name = notebook filename without its 6-character ".ipynb" suffix, plus an HHMMSS stamp
    name="{}_{}".format(
        os.environ['WANDB_NOTEBOOK_NAME'][:-6],
        datetime.now().strftime('%H%M%S'),
    ),
    tags=['XGBoost', 'scaling', 'baseline', 'features-created', 'features-selected'],
    notes="Picked top 288 features by feature importance from the degree-2 PolynomialFeatures created bunch",
)

In [37]:
# Hold-out split on the selected features; size and seed come from `config`.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_candidates, y,
    test_size=config['test_size'],
    random_state=config['random_state'],
)

model = train(X_train, X_valid, y_train, y_valid, config)

# parents=True: without it mkdir raises FileNotFoundError when ./models
# itself does not exist yet (e.g. on a fresh checkout).
model_path = Path(f"./models/{config_run['name']}_{config['k_folds']}folds/")
model_path.mkdir(parents=True, exist_ok=True)
dump(model, model_path / "xgboost_holdout_model.joblib")