In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import cv as xgboost_cv
from xgboost import train as xgboost_train
from sklearn.metrics import r2_score

from tqdm import tqdm
from itertools import product
import pickle


In [2]:
gens = pd.read_csv('data/gens.csv').amount.values

train_2 = pd.read_csv('data/weather_forecast.csv')
train_1 = train_2.loc[train_2['round']==1, :].copy()
train_2 = train_2.loc[train_2['round']==2, :].reset_index(drop=True)

pred_2 = pd.read_csv('data/pred.csv')
pred_1 = pred_2.loc[pred_2['round']==1, :].copy()
pred_2 = pred_2.loc[pred_2['round']==2, :].reset_index(drop=True)
pred_1_matrix = pred_1.amount.values.reshape(train_1.shape[0], 5)

In [3]:
train_1 = train_1.drop(columns=['round', 'time']).values
train_1 = np.concatenate([train_1, pred_1_matrix], axis=1)

In [4]:
for i in range(5):
    errors = np.abs(gens - pred_1_matrix[:, i]) / 99.0 * 100
    errors = errors.reshape(-1, 1)
    train_1 = np.concatenate([train_1, errors], axis=1)

In [5]:
def grid_search_xgb(params_default, dtrain):
    param_grid = {
        'max_depth': [3, 4, 5, 6],
        'colsample_bylevel': [0.9, 1.0],
        'colsample_bynode': [0.9, 1.0],
        'gamma': [0, 0.1, 0.01],
        'alpha': [0, 0.1, 0.01],
        'lambda': [0, 0.1, 0.01],
    }

    params_names = param_grid.keys()
    best_score_ = np.inf

    for params_train in tqdm(list(product(*param_grid.values()))):
        params_train = dict(zip(params_names, params_train))
        params = dict(params_default, **params_train)
        cv_log = xgboost_cv(params=params, dtrain=dtrain, nfold=5, num_boost_round=200, early_stopping_rounds=10).mean(axis=0)
        cv_score = cv_log['test-mae-mean']

        if cv_score < best_score_:
            best_score_ = cv_score
            best_params_ = params
    
    return best_params_, best_score_

In [6]:
params_default = {
    'tree_method': 'hist',
    'device': 'cuda',
    'eval_metric': 'mae',
    'objective': 'reg:absoluteerror',
    'verbosity': 0,
}

dtrain = xgb.DMatrix(train_1[:-168], gens[:-168])
dtest = xgb.DMatrix(train_1[-168:], gens[-168:])

best_params_, best_score_ = grid_search_xgb(params_default, dtrain)
print(best_params_)

100%|██████████| 432/432 [30:29<00:00,  4.24s/it]

{'tree_method': 'hist', 'device': 'cuda', 'eval_metric': 'mae', 'objective': 'reg:absoluteerror', 'verbosity': 0, 'max_depth': 6, 'colsample_bylevel': 0.9, 'colsample_bynode': 0.9, 'gamma': 0.1, 'alpha': 0.1, 'lambda': 0}





In [7]:
params = dict(params_default, **best_params_)
best_model = xgboost_train(params, dtrain, num_boost_round=200)

y_pred = best_model.predict(dtest)
r2 = r2_score(gens[-168:], y_pred)
print(f'r2 score: {r2:6.4f}')

r2 score: 0.9580


In [8]:
with open('checkpoints/weather+pred+error_prediction.xgb', 'wb') as f:
    pickle.dump(best_model, f)