In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import xgboost as xgb

In [2]:
# Set parameters for XGBoost regressor

# set parameter eta range from 0 to 1 with step 0,1
eta = [x for x in np.arange(0, 1, 0.1)]

# set parameter gamma range from 0 to 100 with step 10
gamma = [x for x in np.arange(0, 100, 10)]

# set parameter max_depth range from 1 to 15 with step 0,5
#max_depth = [x for x in np.arange(1, 15, 1)]

# set parameter min_child_weight range from 1 to 6 with step 0,5
min_child_weight = [x for x in np.arange(0, 100, 10)]

# set parameter max_delta_step range from 0 to 10 with step 1
#max_delta_step = [x for x in np.arange(0, 10, 1)]

# set parameter subsample range from 0 to 1 with step 0,1
#subsample = [x for x in np.arange(0, 1, 0.1)]

# set lambda range from 0 to 10 with step 1
lambda_ = [x for x in np.arange(0, 10, 1)]

# set alpha range from 0 to 10 with step 1
alpha = [x for x in np.arange(0, 10, 1)]


# Create the random grid
random_grid = {'eta': eta,
                'gamma': gamma,
                #'max_depth': max_depth,
                'min_child_weight': min_child_weight,
                #'max_delta_step': max_delta_step,
                #'subsample': subsample,
                'lambda': lambda_,
                'alpha': alpha}


In [3]:
#Define MASE Metric
def mean_absolute_scaled_error(y_true, y_pred):
    mase=0
    # Define numerator as the forecast error
    numerator = (np.abs(y_true - y_pred))

    # Define denominator as the mean absolute error of the in-sample one-step naive forecast
    y_true_ohne_1 = y_true[1:].reset_index(drop=True)
    y_true_ohne_ende = y_true[:-1].reset_index(drop=True)
    denominator = np.mean(np.abs(y_true_ohne_1 - y_true_ohne_ende))

    mase = np.mean(np.abs(numerator / denominator))

    return mase

scorer_mase= make_scorer(mean_absolute_scaled_error, greater_is_better=False)


In [4]:
# Load the data
data = pd.read_parquet('/Users/paddy/Documents/GitHub/Masterthesis_ML/data/03_30min_dataset.parquet')

# Convert the date column to datetime
data['date'] = pd.to_datetime(data['date']) #,format='%d/%m/%y %H:%M:%S').dt.strftime('%Y-%m-%d %H:%M:%S') 


In [5]:
#Feature engineering
# Create a new column for the time
data['time'] = [x for x in range(0, len(data))]

# Create the lagged features
lags = 48 + 1

# Create the lagged features
for i in range(1, lags):
    data['lag' + str(i)] = data['count'].shift(i)

# Drop the first three rows
data = data.dropna().reset_index(drop=True)

# Define the feature columns and the target column
feature_cols = [ f'lag{i}' for i in range(1, lags)]
target_col = 'y'

# Drop nan values
data = data.dropna()

# Rename column count to y
data = data.rename(columns={'count': 'y'})

In [6]:
# Set y to the last column
cols = list(data.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('y')) #Remove y from list
data = data[cols+['y']] #Create new dataframe with columns in the order you want


# drop the date column
train_data = np.delete(data, 0, 1) 

# Split the data into X and y
X_train, y_train = train_data[:, :-1], train_data[:, -1]


In [7]:
#vgl: https://lightrun.com/answers/scikit-learn-scikit-learn-grid_search-feeding-parameters-to-scorer-functions

# X and y to pandas dataframe
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)

#convert to float
X_train = X_train.astype(float)
y_train = y_train.astype(float)


In [8]:
# Set Cross Validation to 5 iterations
tscv = TimeSeriesSplit(n_splits=5)

In [9]:
# Instantiate model
model = xgb.XGBRegressor()

search = RandomizedSearchCV(estimator = model, 
                               param_distributions = random_grid, 
                               n_iter = 25, 
                               cv = tscv,
                               refit=True, 
                               verbose=3, 
                               random_state=42, 
                               n_jobs = -1, 
                               scoring=scorer_mase, #make_scorer(scorer_mase, greater_is_better=True), #'neg_root_mean_squared_error', #
                               error_score=np.nan)


In [10]:
# Fit the random search model
search.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[CV 1/5] END alpha=1, eta=0.5, gamma=70, lambda=9, min_child_weight=50;, score=-0.814 total time=   0.1s
[CV 3/5] END alpha=1, eta=0.5, gamma=70, lambda=9, min_child_weight=50;, score=-0.792 total time=   0.1s
[CV 2/5] END alpha=1, eta=0.5, gamma=70, lambda=9, min_child_weight=50;, score=-0.849 total time=   0.1s
[CV 4/5] END alpha=1, eta=0.5, gamma=70, lambda=9, min_child_weight=50;, score=-0.789 total time=   0.2s
[CV 5/5] END alpha=1, eta=0.5, gamma=70, lambda=9, min_child_weight=50;, score=-0.759 total time=   0.2s
[CV 1/5] END alpha=7, eta=0.6000000000000001, gamma=80, lambda=2, min_child_weight=0;, score=-0.829 total time=   0.1s
[CV 2/5] END alpha=7, eta=0.6000000000000001, gamma=80, lambda=2, min_child_weight=0;, score=-0.862 total time=   0.1s
[CV 3/5] END alpha=7, eta=0.6000000000000001, gamma=80, lambda=2, min_child_weight=0;, score=-0.803 total time=   0.1s
[CV 4/5] END alpha=7, eta=0.6000000000000001, gamma=80, lambda=2, min_child_weight=0;, score=-0.794 total time=   0.2s

In [11]:
print('Best Score: %s' % search.best_score_)
print('Best Hyperparameters: %s' % search.best_params_)
print('Best Model: %s' % search.best_estimator_)
print('Best Index: %s' % search.best_index_)
print('CV Results: %s' % search.cv_results_)
print('Refit Time: %s' % search.refit_time_)
print('Scorer: %s' % search.scorer_)

Best Score: -0.7728952737442552
Best Hyperparameters: {'min_child_weight': 20, 'lambda': 3, 'gamma': 90, 'eta': 0.1, 'alpha': 7}
Best Model: XGBRegressor(alpha=7, base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1, eval_metric=None,
             feature_types=None, gamma=90, grow_policy=None,
             importance_type=None, interaction_constraints=None, lambda=3,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=20, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None, ...)
Best Index: 24
CV Results: {'mean_fit_time': array([0.11615577, 0.41485047, 0.12273989, 0.12224979, 0.1464704 ,
       0.14722495, 0.18628955, 0.13064113, 0.