In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

# Notebook-wide defaults: seaborn styling, large figures, show every
# DataFrame column when displaying, and silence all Python warnings.
sns.set()
rcParams.update({'figure.figsize': (20, 10)})
pd.set_option('display.max_columns', None)
warnings.simplefilter('ignore')

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold

from optuna import Trial, visualization, create_study
from optuna.integration import XGBoostPruningCallback
from optuna.samplers import TPESampler

In [None]:
# Read the preprocessed training and submission tables.
df = pd.read_csv('../Data/created/train_processed.csv')
df_submit = pd.read_csv('../Data/created/test_processed.csv')

# Split the training table into the regression target and its features.
target = 'contest_tmp2m_14d__tmp2m'
y_train = df[target]
X_train = df.drop(columns=[target])

# The submission table is used as the feature matrix without modification.
X_submit = df_submit

In [4]:
def objective(trial: Trial) -> float:
  """Optuna objective: mean 5-fold validation RMSE of an XGBoost regressor.

  Samples one set of hyperparameters from ``trial``, then fits a fresh
  ``XGBRegressor`` on each of five contiguous (unshuffled) folds of the
  module-level ``X_train`` / ``y_train`` with early stopping, scoring each
  fold by the model's ``best_score`` on the held-out part.

  Args:
    trial: Optuna trial used to sample hyperparameters and, on the first
      fold, to report per-round validation RMSE to the study's pruner.

  Returns:
    The mean of the per-fold best validation RMSE (lower is better).
  """
  # The hyperparameters are fixed for the whole trial, so they are sampled
  # once here instead of being rebuilt for every fold.
  params = {
      'objective': 'reg:squarederror',
      'eval_metric': 'rmse',
      'verbosity': 0,
      'booster': 'gbtree',
      'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
      'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
      'subsample': trial.suggest_float('subsample', 0.1, 1.0),
      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
      'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0),
      'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
      'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
      'max_depth': trial.suggest_int('max_depth', 1, 9),
      'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
      'random_state': 42
  }

  fold_scores = []
  splitter = KFold(n_splits=5, shuffle=False)
  for fold, (train_index, valid_index) in enumerate(splitter.split(X_train, y_train)):
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

    # The pruning callback reports one value per boosting round, using the
    # round number as the step. Attaching it to every fold would re-report
    # steps the first fold already used (Optuna ignores those), so the pruner
    # would see a mix of folds. Only the first fold reports.
    callbacks = [XGBoostPruningCallback(trial, 'validation_0-rmse')] if fold == 0 else None

    model = XGBRegressor(**params)
    # NOTE(review): passing early_stopping_rounds / callbacks to fit() is
    # deprecated in recent XGBoost releases in favour of the constructor;
    # kept here to match the XGBoost version this notebook was written for.
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=[(X_valid_fold, y_valid_fold)],
        early_stopping_rounds=100,
        verbose=False,
        callbacks=callbacks,
    )
    fold_scores.append(model.best_score)

  return float(np.mean(fold_scores))


In [None]:
# Seed the sampler so a fresh-kernel run proposes the same hyperparameter
# sequence; TPE is Optuna's default single-objective sampler, so only the
# seed differs from the default behaviour.
# No `storage` is configured, so the study is in-memory and there is never an
# existing study to load; `load_if_exists` is therefore not passed.
study = create_study(
  direction='minimize',
  study_name='XGBoost',
  sampler=TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
import joblib

# Serialize the study object (all trials run so far) to the path below.
study_dump_path = '/content/drive/My Drive/WiDS 2023/xgboost_study.pkl'
joblib.dump(study, study_dump_path)

In [None]:
# Restore the previously saved study and run ten further trials on it.
study_load_path = '/content/drive/My Drive/WiDS 2023/xgboost_study.pkl'
study = joblib.load(study_load_path)
study.optimize(objective, n_trials=10)