In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge

In [2]:
# Search space for the randomized hyperparameter search of the Ridge model.
# Each list holds the candidate values for one Ridge constructor argument.

# Whether to fit an intercept term
fit_intercept = [True, False]

# NOTE(review): copy_X looks like a memory-handling switch rather than a
# tuning parameter - confirm it is meant to be part of the search.
copy_X = [True, False]

# Regularization strength candidates
alpha = [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]

# Solver tolerance candidates
tol = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

# Solver candidates
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']

# Assemble the random grid (parameter name -> list of candidate values)
random_grid = dict(
    fit_intercept=fit_intercept,
    copy_X=copy_X,
    alpha=alpha,
    tol=tol,
    solver=solver,
)


In [3]:
#Define MASE Metric
def mean_absolute_scaled_error(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of a forecast.

    The absolute forecast errors ``|y_true - y_pred|`` are divided by the mean
    absolute error of the one-step naive forecast and then averaged. Note that
    the naive-forecast error is computed on the ``y_true`` passed in (i.e.
    ``mean(|y_true[t] - y_true[t-1]|)``), not on a separate training sample.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Observed values in time order. May be a list, NumPy array or pandas
        Series (any index, any numeric-convertible dtype).
    y_pred : array-like of shape (n_samples,)
        Forecast values, matched to ``y_true`` by position.

    Returns
    -------
    float
        The MASE. ``nan`` if fewer than two observations are given (the naive
        forecast is undefined); ``inf`` or ``nan`` if ``y_true`` is constant,
        because the scaling term is then zero.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    # Work on plain float arrays so the metric is independent of the input
    # container, its index labels and its dtype (e.g. object columns).
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != forecast.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{actual.size} and {forecast.size}"
        )

    # The one-step naive forecast needs at least two observations
    if actual.size < 2:
        return np.nan

    # Numerator: absolute forecast error per observation
    abs_errors = np.abs(actual - forecast)

    # Denominator: mean absolute error of the one-step naive forecast
    naive_mae = np.mean(np.abs(np.diff(actual)))

    # A constant y_true gives naive_mae == 0; let the division produce
    # inf/nan quietly instead of emitting runtime warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        mase = np.mean(abs_errors / naive_mae)

    return mase

# Wrap the MASE metric as a scorer object for the hyperparameter search.
# greater_is_better=False declares the metric as a loss (lower is better).
scorer_mase = make_scorer(
    mean_absolute_scaled_error,
    greater_is_better=False,
)


In [4]:
# Path of the parquet file that holds the dataset
# NOTE(review): absolute, machine-specific path - consider making it configurable.
DATA_PATH = '/Users/paddy/Documents/GitHub/Masterthesis_ML/data/03_15min_dataset.parquet'

# Read the dataset from disk
data = pd.read_parquet(DATA_PATH)

# Make sure the 'date' column holds datetime values
data['date'] = pd.to_datetime(data['date'])


In [5]:
# Feature engineering

# Running integer counter of the observations (0, 1, 2, ...)
data['time'] = range(len(data))

# Upper bound for range(1, lags), i.e. lag1 ... lag96 are created
lags = 96 + 1

# Names of the lagged feature columns and of the target column
feature_cols = [f'lag{i}' for i in range(1, lags)]
target_col = 'y'

# Build all lagged copies of 'count' first and attach them in one concat.
# Inserting the columns one by one in a loop fragments the DataFrame and
# triggers pandas' PerformanceWarning.
lagged = pd.concat(
    [data['count'].shift(i).rename(f'lag{i}') for i in range(1, lags)],
    axis=1,
)
data = pd.concat([data, lagged], axis=1)

# Drop every row that contains a missing value. This removes the leading rows
# for which not all lags are available yet, and any other row with a NaN.
data = data.dropna().reset_index(drop=True)

# Rename column count to y (the target column)
data = data.rename(columns={'count': 'y'})

In [6]:
# Move the target column 'y' to the last position
cols = [name for name in data.columns if name != 'y']  # all columns except y
data = data[cols + ['y']]

# Drop the date column by name and convert the rest to a NumPy array.
# Removing it by position with np.delete only works while 'date' happens to be
# the first column, and it relies on NumPy's implicit conversion of the frame.
train_data = data.drop(columns='date').to_numpy()

# Split the data into X (all columns but the last) and y (last column)
X_train, y_train = train_data[:, :-1], train_data[:, -1]


In [7]:
# cf. https://lightrun.com/answers/scikit-learn-scikit-learn-grid_search-feeding-parameters-to-scorer-functions

# Wrap the feature matrix in a DataFrame and the target vector in a Series
X_train, y_train = pd.DataFrame(X_train), pd.Series(y_train)


In [8]:
# Time-series cross-validation splitter with 5 splits
N_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

In [9]:
# Ridge regression estimator with default settings; the search overrides them
model = Ridge()

# Settings of the randomized hyperparameter search
search_settings = {
    'estimator': model,
    'param_distributions': random_grid,  # candidate values per parameter
    'n_iter': 25,                        # number of sampled parameter settings
    'cv': tscv,                          # time-series cross-validation splitter
    'refit': True,
    'verbose': 3,
    'random_state': 42,                  # fixed seed for the parameter sampling
    'n_jobs': -1,
    'scoring': scorer_mase,              # custom MASE scorer
    'error_score': np.nan,
}

search = RandomizedSearchCV(**search_settings)


In [10]:
# Run the randomized search on the training features and target
search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[CV 1/5] END alpha=10, copy_X=False, fit_intercept=True, solver=sparse_cg, tol=0.001;, score=-0.845 total time=   0.1s
[CV 2/5] END alpha=10, copy_X=False, fit_intercept=True, solver=sparse_cg, tol=0.001;, score=-0.870 total time=   0.1s
[CV 3/5] END alpha=10, copy_X=False, fit_intercept=True, solver=sparse_cg, tol=0.001;, score=-0.869 total time=   0.1s
[CV 4/5] END alpha=0.5, copy_X=True, fit_intercept=True, solver=cholesky, tol=0.01;, score=-0.853 total time=   0.2s
[CV 4/5] END alpha=10, copy_X=False, fit_intercept=True, solver=sparse_cg, tol=0.001;, score=-0.852 total time=   0.2s
[CV 5/5] END alpha=0.5, copy_X=True, fit_intercept=True, solver=cholesky, tol=0.01;, score=-0.840 total time=   0.2s
[CV 5/5] END alpha=10, copy_X=False, fit_intercept=True, solver=sparse_cg, tol=0.001;, score=-0.840 total time=   0.2s
[CV 1/5] END alpha=100, copy_X=False, fit_intercept=False, solver=svd, tol=0.1;, score=-0.849 total time=   0.1s
[CV 1/5] END alpha=0.5, copy_X=True, fit_intercept=True, s



[CV 4/5] END alpha=0.5, copy_X=True, fit_intercept=True, solver=lsqr, tol=0.1;, score=-1.449 total time=   0.1s
[CV 3/5] END alpha=100, copy_X=False, fit_intercept=True, solver=sag, tol=0.0001;, score=-0.870 total time=  18.9s
[CV 5/5] END alpha=0.5, copy_X=True, fit_intercept=True, solver=lsqr, tol=0.1;, score=-1.393 total time=   0.1s
[CV 1/5] END alpha=5, copy_X=True, fit_intercept=False, solver=sparse_cg, tol=0.0001;, score=-0.846 total time=   0.0s
[CV 2/5] END alpha=5, copy_X=True, fit_intercept=False, solver=sparse_cg, tol=0.0001;, score=-0.882 total time=   0.1s
[CV 3/5] END alpha=5, copy_X=True, fit_intercept=False, solver=sparse_cg, tol=0.0001;, score=-0.926 total time=   0.1s
[CV 4/5] END alpha=5, copy_X=True, fit_intercept=False, solver=sparse_cg, tol=0.0001;, score=-0.921 total time=   0.1s
[CV 5/5] END alpha=5, copy_X=True, fit_intercept=False, solver=sparse_cg, tol=0.0001;, score=-0.908 total time=   0.1s
[CV 1/5] END alpha=0.5, copy_X=True, fit_intercept=True, solver=sv



[CV 4/5] END alpha=100, copy_X=False, fit_intercept=True, solver=sag, tol=0.0001;, score=-0.858 total time=  24.6s
[CV 5/5] END alpha=0.5, copy_X=True, fit_intercept=True, solver=saga, tol=0.001;, score=-0.888 total time=  16.3s
[CV 5/5] END alpha=100, copy_X=False, fit_intercept=True, solver=sag, tol=0.0001;, score=-0.848 total time=  28.0s




In [11]:
# Report the outcome of the search: one line per attribute of the fitted
# search object, printed in the order listed here.
report_lines = [
    ('Best Score: %s', 'best_score_'),
    ('Best Hyperparameters: %s', 'best_params_'),
    ('Best Model: %s', 'best_estimator_'),
    ('Best Index: %s', 'best_index_'),
    ('CV Results: %s', 'cv_results_'),
    ('Refit Time: %s', 'refit_time_'),
    ('Scorer: %s', 'scorer_'),
]

for template, attribute in report_lines:
    # Look the attribute up just before printing it; the 1-tuple keeps the
    # %-formatting on a single positional value.
    print(template % (getattr(search, attribute),))

Best Score: -0.8552473939589978
Best Hyperparameters: {'tol': 0.001, 'solver': 'sparse_cg', 'fit_intercept': True, 'copy_X': False, 'alpha': 10}
Best Model: Ridge(alpha=10, copy_X=False, solver='sparse_cg', tol=0.001)
Best Index: 0
CV Results: {'mean_fit_time': array([ 0.1274982 ,  0.11855192,  0.25274525,  0.07082443,  0.04788203,
        0.04612732, 16.94358249,  0.08463025,  0.11567349,  0.24465318,
        0.05598879,  0.0763773 ,  0.06335964,  0.08118172,  0.07859235,
        9.56926365,  0.06004119,  0.27363739,  0.09614501,  0.08553233,
        0.07939601,  0.06931405,  0.06435447,  0.27654943,  0.05910764]), 'std_fit_time': array([0.06760001, 0.03135465, 0.12993445, 0.02217076, 0.0188522 ,
       0.0181713 , 9.1783788 , 0.03937382, 0.10067227, 0.11046735,
       0.02050851, 0.03612003, 0.0285476 , 0.04305501, 0.04076087,
       5.65928045, 0.02330233, 0.14979919, 0.05028505, 0.02898132,
       0.0407299 , 0.02214356, 0.02771397, 0.17715043, 0.01832859]), 'mean_score_time': arra