In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
pd.options.display.max_columns = None

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, PredefinedSplit
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

from category_encoders import TargetEncoder

# models
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval

In [None]:
# Execution-environment switch: True -> local checkout, False -> Google Colab.
inlocal = True

if not inlocal:
  # Colab: Google Drive is mounted first, then the data folder on it is used.
  from google.colab import drive
  drive.mount('/content/drive')
  path_to_data = '/content/drive/My Drive/WiDS 2023/data/'
else:
  # Local run: CSVs are read from a path relative to the notebook.
  path_to_data = '../Data/created/'

In [None]:
# Column names referenced throughout the notebook.
# Categorical feature column (target-encoded further down).
climateregions__climateregion = 'climateregions__climateregion'
# Label column that is split off from the features as `y_*`.
target = 'contest_tmp2m_14d__tmp2m'

In [None]:
# Read the labelled splits from `path_to_data` (same order as the file list).
train, val, test, train_sample = (
    pd.read_csv(path_to_data + csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test.csv', 'train_sample.csv')
)

# Separate features (X_*) from the label column (y_*); the source frames
# themselves are left untouched because `drop` returns a new frame.
X_train, y_train = train.drop(columns=target), train[target]
X_train_sample, y_train_sample = train_sample.drop(columns=target), train_sample[target]
X_val, y_val = val.drop(columns=target), val[target]
X_test, y_test = test.drop(columns=target), test[target]

# The submission frame is loaded as features only; no label is split off.
X_submit = pd.read_csv(path_to_data + 'submit.csv')

In [None]:
# Target-encode the climate-region column. The encoder is fitted once, on the
# training features and training target, and then reused as-is.
cat_encoder = TargetEncoder()
X_train[climateregions__climateregion] = cat_encoder.fit_transform(
    X_train[climateregions__climateregion],
    y_train,
)

# Apply the already-fitted encoder (transform only, no refit) to every other
# feature frame, overwriting the column in place.
for X_set in [X_val, X_test, X_submit, X_train_sample]:
    encoded_region = cat_encoder.transform(X_set[climateregions__climateregion])
    X_set[climateregions__climateregion] = encoded_region

## Train XGBoost

In [None]:
# Baseline: XGBoost with only the seed and thread count set, fitted on the
# full training split and scored on the validation split.
xgb_baseline = XGBRegressor(
  random_state=42,
  n_jobs=-1,
)
xgb_baseline.fit(X_train, y_train)
y_pred = xgb_baseline.predict(X_val)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# while this form works on every version.
print('baseline model RMSE:', np.sqrt(mean_squared_error(y_val, y_pred)))

In [None]:
# Score the same baseline model on the test split.
y_pred = xgb_baseline.predict(X_test)
# The label says "test" so this line can be told apart from the validation
# score; sqrt(MSE) replaces the deprecated/removed `squared=False` argument.
print('baseline model test RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# Hyperopt search space for the XGBoost regressor. Each hyperopt label is the
# same string as its dictionary key.
space = {
    # discrete candidates
    'n_estimators': hp.choice('n_estimators', [100, 200, 300, 400, 600]),
    'max_depth': hp.choice('max_depth', [1, 2, 3, 5, 7, 9, 11, 13]),
    # log-uniform between 0.01 and 0.2
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    # row / column sampling fractions: uniform on [0.5, 1.0]
    **{
        label: hp.uniform(label, 0.5, 1.0)
        for label in ('subsample', 'colsample_bytree', 'colsample_bylevel')
    },
    # regularisation and split constraints: uniform on [0.0, 1.0]
    **{
        label: hp.uniform(label, 0.0, 1.0)
        for label in ('reg_alpha', 'reg_lambda', 'gamma', 'min_child_weight')
    },
}

# define objective function
def objective(space):
    """Fit one XGBoost candidate and report its validation RMSE to hyperopt.

    Parameters
    ----------
    space : dict
        One sampled configuration from the search space. Every key is passed
        straight through as an ``XGBRegressor`` keyword argument, so the keys
        must be valid estimator parameters. Note that this parameter shadows
        the module-level ``space`` dictionary inside the function.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.

    Notes
    -----
    The model is fitted on ``X_train_sample`` / ``y_train_sample`` and early
    stopping is monitored on ``(X_val, y_val)``; the same validation split is
    then used to compute the returned loss.
    """
    model = XGBRegressor(
        objective='reg:squarederror',
        tree_method='gpu_hist',
        gpu_id=0,
        random_state=42,
        n_jobs=-1,
        eval_metric='rmse',
        early_stopping_rounds=10,
        # `verbosity` is the constructor-level switch; the previous
        # `verbose=0` is not an estimator parameter and was only forwarded
        # to the booster as an unused setting.
        verbosity=0,
        # sampled hyperparameters (n_estimators, max_depth, learning_rate, ...)
        **space,
    )

    model.fit(
      X_train_sample, y_train_sample,
      eval_set=[(X_val, y_val)],
      # silence the per-round evaluation log that eval_set otherwise prints
      verbose=False,
    )

    y_pred = model.predict(X_val)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Run the TPE search. `trials` records every evaluation; the random state is
# seeded with 42. With max_evals=1 only a single configuration is evaluated.
trials = Trials()
search_rng = np.random.RandomState(42)
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=1,
    trials=trials,
    rstate=search_rng,
)

In [None]:
# Refit on the full training split using the best configuration found.
# `space_eval` maps hyperopt's raw result back onto the search space, giving
# one value per hyperparameter key.
best_params = space_eval(space, best)

# Settings that are not part of the search.
fixed_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'random_state': 42,
    'n_jobs': -1,
}

model = XGBRegressor(**fixed_params, **best_params)
model.fit(X_train, y_train)

In [None]:
# Evaluate the tuned model on the test split. sqrt(MSE) is used because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and later removed.
y_test_pred = model.predict(X_test)
print('test RMSE:', np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [38]:
def take_submission(model, X_submit, submit_index, path_to_data, filename):
    """Predict on the submission features and write the result as a CSV file.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_submit : DataFrame
        Feature frame passed to ``model.predict`` exactly as given; the
        caller is responsible for removing any non-feature columns first.
    submit_index : sequence
        Row identifiers written to the ``'index'`` column, one per prediction.
    path_to_data : str
        Directory prefix; it is concatenated directly with ``filename``, so
        it must already end with a path separator.
    filename : str
        Output file name without the ``.csv`` extension.

    Returns
    -------
    None
        The only effect is the file ``path_to_data + filename + '.csv'``.
    """
    y_submit_pred = model.predict(X_submit)
    submit = pd.DataFrame({
      'contest-tmp2m-14d__tmp2m': y_submit_pred,
      'index': submit_index,
    })
    # index=False keeps the file to exactly the two named columns; without it
    # pandas also writes its own row index as an extra unnamed first column.
    submit.to_csv(path_to_data + filename + '.csv', index=False)

In [36]:
def get_top_trails(trials, n=10, space=None):
    """Return the ``n`` lowest-loss trials recorded in a hyperopt ``Trials``.

    Parameters
    ----------
    trials : object
        Object whose ``.trials`` attribute is a list of trial dictionaries
        with ``'result'`` and ``'misc'`` entries.
    n : int, default 10
        Maximum number of entries to return.
    space : dict, optional
        The search space the trials were sampled from. When given, each
        trial's raw values are decoded with ``space_eval`` so that
        ``hp.choice`` parameters show the chosen value instead of its
        position in the choice list. When omitted, the raw
        ``trial['misc']['vals']`` mapping is returned unchanged.

    Returns
    -------
    list of dict
        ``{'loss': ..., 'params': ...}`` entries sorted by ascending loss.
        Trials without a loss (still new/running, or failed) are left out.
    """
    results = []
    for trial in trials.trials:
        outcome = trial['result']
        # A trial that has not finished successfully carries no 'loss' key
        # (its result is just {'status': 'new'}), so it cannot be ranked.
        if outcome.get('loss') is None:
            continue

        params = trial['misc']['vals']
        if space is not None:
            # vals stores one-element lists; drop empty entries, unwrap the
            # rest, and let space_eval translate them into real values.
            sampled = {label: vals[0] for label, vals in params.items() if vals}
            params = space_eval(space, sampled)

        results.append({'loss': outcome['loss'], 'params': params})

    results.sort(key=lambda entry: entry['loss'])
    return results[:n]


# Correctly spelled alias; the original name is kept for existing callers.
get_top_trials = get_top_trails

[{'state': 1,
  'tid': 0,
  'spec': None,
  'result': {'status': 'new'},
  'misc': {'tid': 0,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'colsample_bylevel': [0],
    'colsample_bytree': [0],
    'gamma': [0],
    'learning_rate': [0],
    'max_depth': [0],
    'min_child_weight': [0],
    'n_estimators': [0],
    'reg_alpha': [0],
    'reg_lambda': [0],
    'subsample': [0]},
   'vals': {'colsample_bylevel': [0.7420924529192455],
    'colsample_bytree': [0.6619995178577169],
    'gamma': [0.7155584591130996],
    'learning_rate': [0.014726206460487596],
    'max_depth': [5],
    'min_child_weight': [0.767856397765376],
    'n_estimators': [1],
    'reg_alpha': [0.8839266254726155],
    'reg_lambda': [0.620118997676641],
    'subsample': [0.9068277038201439]}},
  'exp_key': None,
  'owner': None,
  'version': 0,
  'book_time': datetime.datetime(2023, 1, 15, 14, 3, 44, 896000),
  'refresh_time': datetime.datetime(2023, 1, 15, 14, 3, 44, 896000)}]

In [None]:
# Build the submission frame: one prediction per row of X_submit, paired with
# that row's 'index' value. The 'index' column is removed from the features
# before predicting (presumably it is a row identifier - confirm).
submit_index = X_submit['index']
submit_features = X_submit.drop(columns='index')
y_pred = model.predict(submit_features)
submit = pd.DataFrame({
  'contest-tmp2m-14d__tmp2m': y_pred,
  'index': submit_index,
})