In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.samplers import TPESampler

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
OPTUNA = False

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-aug-2021/train.csv
/kaggle/input/tabular-playground-series-aug-2021/test.csv


In [3]:
train = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/test.csv')
train.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


In [4]:
test.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,250000,0.812665,15,-1.23912,-0.893251,295.577,15.8712,23.0436,0.942256,29.898,...,0.446389,-422.332,-1.4463,1.69075,1.0593,-3.01057,1.94664,0.52947,1.38695,8.78767
1,250001,0.190344,131,-0.501361,0.801921,64.8866,3.09703,344.805,0.807194,38.4219,...,0.377179,10352.2,21.0627,1.84351,0.251895,4.44057,1.90309,0.248534,0.863881,11.7939
2,250002,0.919671,19,-0.057382,0.901419,11961.2,16.3965,273.24,-0.0033,37.94,...,0.99014,3224.02,-2.25287,1.551,-0.559157,17.8386,1.83385,0.931796,2.33687,9.054
3,250003,0.860985,19,-0.549509,0.471799,7501.6,2.80698,71.0817,0.792136,0.395235,...,1.39688,9689.76,14.7715,1.4139,0.329272,0.802437,2.23251,0.893348,1.35947,4.84833
4,250004,0.313229,89,0.588509,0.167705,2931.26,4.34986,1.57187,1.1183,7.75463,...,0.862502,2693.35,44.1805,1.5802,-0.191021,26.253,2.68238,0.361923,1.5328,3.7066


In [5]:
train.isnull().sum()

id      0
f0      0
f1      0
f2      0
f3      0
       ..
f96     0
f97     0
f98     0
f99     0
loss    0
Length: 102, dtype: int64

In [6]:
test.isnull().sum()

id     0
f0     0
f1     0
f2     0
f3     0
      ..
f95    0
f96    0
f97    0
f98    0
f99    0
Length: 101, dtype: int64

In [7]:
X = train.drop(['id', 'loss'], axis=1)
y = train['loss']
X_test = test.drop(['id'], axis=1)

In [8]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [9]:
def objective(trial, data=X, target=y):
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=0)
    
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': 4000,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 1e-5, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 1e-5, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 1e-5, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 42,
        'use_label_encoder': False,
#         'eval_metric': 'rmse'
    }
    
    xgb_model = XGBRegressor(**params)
    
    xgb_model.fit(X_train, y_train,
                 early_stopping_rounds = 200,
                 eval_set=[(X_val, y_val)],
                 eval_metric='rmse',
                 verbose=False)
    
    preds = xgb_model.predict(X_val)
    
    return mean_squared_error(y_val, preds, squared=False)

In [10]:
if OPTUNA:
    TIME = 1 * 30 * 60 # h * m * s
    study = optuna.create_study(direction='minimize', sampler=TPESampler(), study_name='xgbregressor')
    study.optimize(objective, n_trials=30)


In [11]:
if OPTUNA:
    best_params = study.best_trial.params

`{'max_depth': 10,
 'learning_rate': 0.005,
 'min_child_weight': 300,
 'gamma': 0.0009284956972705897,
 'alpha': 0.0005237330351000966,
 'lambda': 0.00011494884725082336,
 'colsample_bytree': 0.2804762665363134,
 'subsample': 0.4058810101637117}`

In [12]:
if OPTUNA:
    optuna.visualization.plot_optimization_history(study)

In [13]:
if OPTUNA:
    optuna.visualization.plot_param_importances(study)


In [14]:
if OPTUNA:
    best_params['tree_method'] = 'gpu_hist'
    best_params['booster'] = 'gbtree'
    best_params['eval_metric'] = 'rmse'
    best_params['random_state'] = 42
    best_params['use_label_encoder'] = False
#     best_params
else:
    best_params = {'max_depth': 10,
                 'learning_rate': 0.005,
                 'min_child_weight': 300,
                 'gamma': 0.0009284956972705897,
                 'alpha': 0.0005237330351000966,
                 'lambda': 0.00011494884725082336,
                 'colsample_bytree': 0.2804762665363134,
                 'subsample': 0.4058810101637117,
                 'tree_method': 'gpu_hist',
                 'booster': 'gbtree',
                 'eval_metric': 'rmse',
                 'random_state': 42,
                 'use_label_encoder': False}

`{'max_depth': 10,
 'learning_rate': 0.005,
 'min_child_weight': 300,
 'gamma': 0.0009284956972705897,
 'alpha': 0.0005237330351000966,
 'lambda': 0.00011494884725082336,
 'colsample_bytree': 0.2804762665363134,
 'subsample': 0.4058810101637117,
 'tree_method': 'gpu_hist',
 'booster': 'gbtree',
 'eval_metric': 'rmse',
 'random_state': 42,
 'use_label_encoder': False}`

In [15]:
preds = np.zeros(test.shape[0])
rmses = []

skf = KFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    
    print("Fold {}:".format(fold))
    
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    model = XGBRegressor(**best_params)
    
    model.fit(X_train, y_train,
             early_stopping_rounds=200,
             eval_set=[(X_val, y_val)],
             eval_metric='rmse',
             verbose=False)
    rmse = mean_squared_error(y_val, model.predict(X_val), squared=False)
    print("LogLoss: {}".format(rmse))
    rmses.append(rmse)
    print(model.predict(X_test).shape)
    preds += model.predict(X_test)
    
preds /= skf.n_splits

Fold 0:
LogLoss: 8.82459373600536
(150000,)
Fold 1:
LogLoss: 8.683935114237
(150000,)
Fold 2:
LogLoss: 8.852925120447443
(150000,)
Fold 3:
LogLoss: 8.7991684351624
(150000,)
Fold 4:
LogLoss: 8.799300477349158
(150000,)
Fold 5:
LogLoss: 8.842783522942486
(150000,)
Fold 6:
LogLoss: 8.796654371780248
(150000,)
Fold 7:
LogLoss: 8.81803847854558
(150000,)
Fold 8:
LogLoss: 8.824325065638122
(150000,)
Fold 9:
LogLoss: 8.757520243665203
(150000,)


In [16]:
print("Average RMSE: {}".format(np.mean(rmses)))

Average RMSE: 8.7999244565773


In [17]:
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')
sample_submission['loss'] = preds
sample_submission.to_csv('xgb_submission.csv', index=False)
sample_submission

Unnamed: 0,id,loss
0,250000,3.104693
1,250001,2.842394
2,250002,3.034634
3,250003,3.013256
4,250004,3.085135
...,...,...
149995,399995,3.036356
149996,399996,2.986600
149997,399997,2.903387
149998,399998,2.947960
