In [1]:
# Data processing
%matplotlib inline
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# Machine Learning
import optuna
import xgboost as xgb
from optuna.samplers import TPESampler
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

In [2]:
# All three competition tables are read from the same relative input directory.
input_dir = Path('../input/tabular-playground-series-aug-2021/')
train_df, test_df, sample_submission = (
    pd.read_csv(input_dir / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Preview the first five training rows (features plus the `loss` target).
train_df.head(n=5)

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


In [4]:
# Preview the first five test rows.
test_df.head(n=5)

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,250000,0.812665,15,-1.23912,-0.893251,295.577,15.8712,23.0436,0.942256,29.898,...,0.446389,-422.332,-1.4463,1.69075,1.0593,-3.01057,1.94664,0.52947,1.38695,8.78767
1,250001,0.190344,131,-0.501361,0.801921,64.8866,3.09703,344.805,0.807194,38.4219,...,0.377179,10352.2,21.0627,1.84351,0.251895,4.44057,1.90309,0.248534,0.863881,11.7939
2,250002,0.919671,19,-0.057382,0.901419,11961.2,16.3965,273.24,-0.0033,37.94,...,0.99014,3224.02,-2.25287,1.551,-0.559157,17.8386,1.83385,0.931796,2.33687,9.054
3,250003,0.860985,19,-0.549509,0.471799,7501.6,2.80698,71.0817,0.792136,0.395235,...,1.39688,9689.76,14.7715,1.4139,0.329272,0.802437,2.23251,0.893348,1.35947,4.84833
4,250004,0.313229,89,0.588509,0.167705,2931.26,4.34986,1.57187,1.1183,7.75463,...,0.862502,2693.35,44.1805,1.5802,-0.191021,26.253,2.68238,0.361923,1.5328,3.7066


In [5]:
# Preview the first five rows of the submission template.
sample_submission.head(n=5)

Unnamed: 0,id,loss
0,250000,0
1,250001,0
2,250002,0
3,250003,0
4,250004,0


In [6]:
# Convert the frames to plain NumPy arrays: every column except the row id
# (and, for training, the target) is a feature; `loss` is the target.
X = train_df.drop(columns=['id', 'loss']).to_numpy()
y = train_df['loss'].to_numpy()
X_test = test_df.drop(columns=['id']).to_numpy()

In [7]:
# Many people use MinMaxScaler here; the author reports better results with
# StandardScaler. The scaler is fitted on the training features only and the
# same fitted transform is then applied to both train and test.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [8]:
# Observed range of the training target; used later as clipping bounds.
y_min, y_max = y.min(), y.max()

# Values outside the observed target range should be rare, but both inputs are
# clamped to [y_min, y_max] before scoring anyway.
def my_rmse(y_true, y_hat, y_lo=None, y_hi=None):
    """Return the RMSE between range-clipped targets and predictions.

    Both arrays are clipped to ``[y_lo, y_hi]`` on copies, so neither input is
    modified. The earlier version wrote the bounds into ``y_true`` at the
    positions where ``y_hat`` was out of range, which corrupted the targets
    and left the predictions unclipped.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_hat : array-like
        Predicted values, same length as ``y_true``.
    y_lo, y_hi : scalar, optional
        Clipping bounds. When omitted they fall back to the notebook-level
        ``y_min`` / ``y_max``.

    Returns
    -------
    float
        Root mean squared error of the clipped predictions against the
        clipped targets.
    """
    lower = y_min if y_lo is None else y_lo
    upper = y_max if y_hi is None else y_hi

    # np.clip returns new arrays, leaving the caller's data untouched.
    truth = np.clip(np.asarray(y_true, dtype=np.float64), lower, upper)
    preds = np.clip(np.asarray(y_hat, dtype=np.float64), lower, upper)

    # Computed directly with NumPy so the result does not depend on the
    # `squared=False` keyword of sklearn's mean_squared_error.
    return float(np.sqrt(np.mean(np.square(truth - preds))))

In [9]:
def objective(trial):
    """Optuna objective: fit one XGBRegressor and score it on a hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters below.

    Returns
    -------
    float
        ``my_rmse`` of the fitted model on the validation split; the study
        minimises this value.
    """
    # Draw a fresh 60/40 split (stratified on the target) for each trial.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.4)

    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 6, 10), # Extremely prone to overfitting!
        # `step` is passed by keyword; newer Optuna releases deprecate passing it positionally.
        'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400), # Extremely prone to overfitting!
        'eta': trial.suggest_float('eta', 0.007, 0.013), # Most important parameter - the learning rate!
        # suggest_float(..., step=q) is the documented replacement for the
        # deprecated suggest_discrete_uniform(name, low, high, q).
        'subsample': trial.suggest_float('subsample', 0.5, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9, step=0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20), # I've had trouble with LB score until tuning this.
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 50), # L2 regularization
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 50), # L1 regularization
    }

    reg = xgb.XGBRegressor(
        # These parameters should help with trial speed.
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        n_jobs=4,
        # Set on the estimator rather than in fit(): newer XGBoost releases no
        # longer accept `eval_metric` as a fit() argument.
        eval_metric='rmse',
        **param_grid
    )

    reg.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

    # No early stopping is configured, so this is the RMSE of the final model
    # (all n_estimators rounds), not of the best boosting round.
    # Readers may want to try returning a cross validation score here.
    return my_rmse(y_valid, reg.predict(X_valid))

In [10]:
# Search budget passed to Optuna as `timeout`: 1 * 10 * 60 = 600 (ten minutes in seconds).
train_time = 1 * 10 * 60
study = optuna.create_study(
    study_name='XGBRegressor',
    direction='minimize',
    sampler=TPESampler(),
)
study.optimize(objective, timeout=train_time)

# Keep a handle on the best trial; later cells read its parameters.
trial = study.best_trial

# Summary of the search followed by the winning configuration.
print('Number of finished trials: ', len(study.trials))
print('Best trial:')
print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

[32m[I 2021-08-12 15:36:01,137][0m A new study created in memory with name: XGBRegressor[0m
[32m[I 2021-08-12 15:36:17,467][0m Trial 0 finished with value: 7.851809956108182 and parameters: {'max_depth': 7, 'n_estimators': 1200, 'eta': 0.008016892195827742, 'subsample': 0.5, 'colsample_bytree': 0.7, 'min_child_weight': 14, 'reg_lambda': 45, 'reg_alpha': 32}. Best is trial 0 with value: 7.851809956108182.[0m
[32m[I 2021-08-12 15:38:12,954][0m Trial 1 finished with value: 7.845563648754793 and parameters: {'max_depth': 9, 'n_estimators': 3600, 'eta': 0.010765132145606093, 'subsample': 0.7, 'colsample_bytree': 0.6, 'min_child_weight': 16, 'reg_lambda': 35, 'reg_alpha': 43}. Best is trial 1 with value: 7.845563648754793.[0m
[32m[I 2021-08-12 15:38:28,463][0m Trial 2 finished with value: 7.868495687741906 and parameters: {'max_depth': 10, 'n_estimators': 400, 'eta': 0.011575825122102928, 'subsample': 0.6, 'colsample_bytree': 0.6, 'min_child_weight': 18, 'reg_lambda': 49, 'reg_alp

Number of finished trials:  14
Best trial:
	Value: 7.838074537665272
	Params: 
		max_depth: 7
		n_estimators: 3200
		eta: 0.009237015452299387
		subsample: 0.6
		colsample_bytree: 0.7
		min_child_weight: 7
		reg_lambda: 16
		reg_alpha: 48


In [11]:
# Start from a *copy* of the best trial's parameters so the trial object is
# not modified, then add the fixed settings used for the KFold models.
xgb_params = dict(trial.params)
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'
xgb_params['n_jobs'] = 4
# Configured on the estimator instead of in fit(): newer XGBoost releases no
# longer accept `eval_metric` as a fit() argument.
xgb_params['eval_metric'] = 'rmse'

n_splits = 10
# Sum the per-fold test predictions in a float64 accumulator, independent of
# the dtype predict() returns, and divide by n_splits after the loop.
test_preds = np.zeros(X_test.shape[0], dtype=np.float64)
kf_rmse = []

for fold, (train_idx, valid_idx) in enumerate(KFold(n_splits=n_splits, shuffle=True).split(X, y)):
    # Fetch the train-validation rows for this fold.
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # Create and fit a new model using the best parameters.
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              verbose=False)

    # Score this fold on its held-out rows.
    valid_pred = model.predict(X_valid)
    rmse = my_rmse(y_valid, valid_pred)
    print(f'Fold {fold+1}/{n_splits} RMSE: {rmse:.4f}')
    kf_rmse.append(rmse)

    # Each fold's model contributes 1/n_splits of the final test prediction.
    test_preds += model.predict(X_test)

test_preds /= n_splits
print(f'Average KFold RMSE: {np.mean(kf_rmse):.5f}')

Fold 1/10 RMSE: 7.7576
Fold 2/10 RMSE: 7.7580
Fold 3/10 RMSE: 7.8174
Fold 4/10 RMSE: 7.8769
Fold 5/10 RMSE: 7.9492
Fold 6/10 RMSE: 7.8064
Fold 7/10 RMSE: 7.8579
Fold 8/10 RMSE: 7.8419
Fold 9/10 RMSE: 7.8488
Fold 10/10 RMSE: 7.8368
Average KFold RMSE: 7.83508


In [12]:
# Clamp the averaged predictions to the observed target range (in place),
# store them in the submission template and write it to disk.
np.clip(test_preds, y_min, y_max, out=test_preds)
sample_submission['loss'] = test_preds
sample_submission.to_csv('submission.csv', index=False)
sample_submission

Unnamed: 0,id,loss
0,250000,8.069003
1,250001,4.605507
2,250002,8.625629
3,250003,7.477079
4,250004,7.020336
...,...,...
149995,399995,7.774494
149996,399996,7.385503
149997,399997,5.950618
149998,399998,5.088622
