In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import Lasso

In [3]:
# Candidate values for each Lasso hyperparameter explored by the random search.
# max_iter, positive and random_state are not searched here.
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10]
selection = ['cyclic', 'random']
fit_intercept = [True, False]
tol = [0.0001, 0.001, 0.01, 0.1, 1, 10]
warm_start = [True, False]
copy_X = [True, False]

# Search space: maps each Lasso parameter name to its list of candidate values
random_grid = dict(
    alpha=alpha,
    selection=selection,
    fit_intercept=fit_intercept,
    tol=tol,
    warm_start=warm_start,
    copy_X=copy_X,
)

In [4]:
#Define MASE Metric
def mean_absolute_scaled_error(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of a forecast.

    The mean absolute forecast error is divided by the mean absolute error of
    the one-step naive forecast (each value predicted by its predecessor).
    Note that the naive baseline is computed on the ``y_true`` passed in, i.e.
    on the same observations that are being scored.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Observed values in chronological order. May be a pandas Series, a
        NumPy array or a plain sequence.
    y_pred : array-like of shape (n_samples,)
        Forecast values aligned position-by-position with ``y_true``.

    Returns
    -------
    float
        The MASE. ``nan`` is returned when fewer than two observations are
        given (no naive forecast exists); ``inf``/``nan`` results when
        ``y_true`` is constant, because the naive error is then zero.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    # Plain 1-D float arrays: works for Series, ndarrays and lists alike and
    # makes the arithmetic purely positional (no pandas index alignment).
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != forecast.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got %d and %d"
            % (actual.size, forecast.size)
        )

    # With fewer than two points there is no one-step naive forecast to scale by
    if actual.size < 2:
        return np.float64(np.nan)

    # Numerator: absolute forecast errors
    abs_errors = np.abs(actual - forecast)

    # Denominator: mean absolute error of the one-step naive forecast
    naive_mae = np.mean(np.abs(np.diff(actual)))

    # A constant series gives naive_mae == 0; let that propagate as inf/nan
    # without emitting floating-point warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.mean(abs_errors / naive_mae)

# Wrap MASE as a scikit-learn scorer; lower is better, so the scorer reports negated values
scorer_mase = make_scorer(
    score_func=mean_absolute_scaled_error,
    greater_is_better=False,
)


In [5]:
# Read the 15-minute dataset from its parquet file
parquet_path = '/Users/paddy/Documents/GitHub/Masterthesis_ML/data/03_15min_dataset.parquet'
data = pd.read_parquet(parquet_path)

# Parse the 'date' column into pandas datetime values
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates


In [6]:
# Feature engineering: derive trend and calendar features from the 'date' column.

# Running row counter (0, 1, 2, ...) used as a time-trend feature
data['time'] = np.arange(len(data), dtype='int64')

# Calendar components taken from the timestamp
data['hour_of_day'] = data['date'].dt.hour
data['day_of_week'] = data['date'].dt.dayofweek  # 0 = Monday ... 6 = Sunday
data['day_of_month'] = data['date'].dt.day
data['month_of_year'] = data['date'].dt.month
data['year'] = data['date'].dt.year

# Weekend flag: 1 on Saturday (5) and Sunday (6), otherwise 0
data['weekend'] = data['day_of_week'].isin([5, 6]).astype('int64')

# One 0/1 indicator column per weekday; list position equals the day_of_week code
weekday_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
for weekday_number, weekday_name in enumerate(weekday_names):
    data[weekday_name] = (data['day_of_week'] == weekday_number).astype('int64')

# Remove every row that contains a missing value and renumber the index
data = data.dropna().reset_index(drop=True)

# Rename column count to y
data = data.rename(columns={'count': 'y'})

# Define the feature columns and the target column
feature_cols = ['time', 'hour_of_day', 'day_of_week', 'day_of_month',
                'month_of_year', 'year', 'weekend'] + weekday_names
target_col = 'y'

print(data.head())

                       date  y  time  hour_of_day  day_of_week  day_of_month  \
0 2022-04-01 00:00:00+00:00  8     0            0            4             1   
1 2022-04-01 00:15:00+00:00  1     1            0            4             1   
2 2022-04-01 00:30:00+00:00  9     2            0            4             1   
3 2022-04-01 00:45:00+00:00  8     3            0            4             1   
4 2022-04-01 01:00:00+00:00  4     4            1            4             1   

   month_of_year  year  weekend  monday  tuesday  wednesday  thursday  friday  \
0              4  2022        0       0        0          0         0       1   
1              4  2022        0       0        0          0         0       1   
2              4  2022        0       0        0          0         0       1   
3              4  2022        0       0        0          0         0       1   
4              4  2022        0       0        0          0         0       1   

   saturday  sunday  
0         

In [14]:
# Move the target column 'y' to the end so that all features come first
cols = [col for col in data.columns if col != 'y']
data = data[cols + ['y']]

# Drop the timestamp column by name and convert the rest to a NumPy array.
# (np.delete on the DataFrame removed it by position and converted the whole
# frame, date column included, into a single mixed-type array.)
train_data = data.drop(columns=['date']).to_numpy()

# Split into feature matrix (all but the last column) and target (last column)
X_train, y_train = train_data[:, :-1], train_data[:, -1]


[[0 0 4 ... 1 0 0]
 [1 0 4 ... 1 0 0]
 [2 0 4 ... 1 0 0]
 ...
 [35037 23 4 ... 1 0 0]
 [35038 23 4 ... 1 0 0]
 [35039 23 4 ... 1 0 0]]


In [8]:
# See: https://lightrun.com/answers/scikit-learn-scikit-learn-grid_search-feeding-parameters-to-scorer-functions

# Wrap the feature matrix and the target in pandas containers
X_train, y_train = pd.DataFrame(data=X_train), pd.Series(data=y_train)


In [9]:
# Time-series cross-validation splitter configured for 5 splits
n_cv_splits = 5
tscv = TimeSeriesSplit(n_splits=n_cv_splits)

In [10]:
# Base estimator with default settings; the search supplies the sampled parameters
model = Lasso()

# Randomised search: 25 sampled parameter settings, evaluated on the
# time-series folds and scored with the MASE scorer; the best setting is refit.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=25,
    scoring=scorer_mase,
    cv=tscv,
    refit=True,
    n_jobs=-1,
    verbose=3,
    random_state=42,
    error_score=np.nan,
)


In [11]:
# Run the randomised hyperparameter search on the training data
search.fit(X_train, y=y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[CV 2/5] END alpha=0.001, copy_X=True, fit_intercept=True, selection=random, tol=10, warm_start=True;, score=-1.250 total time=   0.0s
[CV 1/5] END alpha=0.001, copy_X=True, fit_intercept=True, selection=random, tol=10, warm_start=True;, score=-1.390 total time=   0.0s
[CV 1/5] END alpha=0.01, copy_X=True, fit_intercept=False, selection=random, tol=0.1, warm_start=True;, score=-1.175 total time=   0.1s
[CV 1/5] END alpha=0.1, copy_X=False, fit_intercept=True, selection=cyclic, tol=10, warm_start=True;, score=-1.208 total time=   0.0s
[CV 3/5] END alpha=0.001, copy_X=True, fit_intercept=True, selection=random, tol=10, warm_start=True;, score=-1.445 total time=   0.0s[CV 5/5] END alpha=0.001, copy_X=True, fit_intercept=True, selection=random, tol=10, warm_start=True;, score=-1.386 total time=   0.0s

[CV 4/5] END alpha=0.001, copy_X=True, fit_intercept=True, selection=random, tol=10, warm_start=True;, score=-1.191 total time=   0.0s
[CV 2/5] END alpha=0.01, copy_X=True, fit_intercept=Fal

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END alpha=0.1, copy_X=False, fit_intercept=False, selection=random, tol=1, warm_start=True;, score=-2.154 total time=   0.0s
[CV 3/5] END alpha=0.1, copy_X=False, fit_intercept=False, selection=random, tol=1, warm_start=True;, score=-1.330 total time=   0.0s
[CV 4/5] END alpha=0.1, copy_X=False, fit_intercept=False, selection=random, tol=1, warm_start=True;, score=-1.178 total time=   0.0s
[CV 1/5] END alpha=1, copy_X=False, fit_intercept=False, selection=random, tol=0.001, warm_start=True;, score=-1.214 total time=   0.0s
[CV 2/5] END alpha=0.0001, copy_X=True, fit_intercept=False, selection=cyclic, tol=0.01, warm_start=False;, score=-1.258 total time=   0.3s
[CV 5/5] END alpha=0.1, copy_X=False, fit_intercept=False, selection=random, tol=1, warm_start=True;, score=-1.187 total time=   0.0s
[CV 2/5] END alpha=1, copy_X=False, fit_intercept=False, selection=random, tol=0.001, warm_start=True;, score=-1.276 total time=   0.0s
[CV 3/5] END alpha=1, copy_X=False, fit_intercept=Fa

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 3/5] END alpha=0.1, copy_X=False, fit_intercept=True, selection=random, tol=1, warm_start=True;, score=-1.211 total time=   0.0s
[CV 4/5] END alpha=0.1, copy_X=False, fit_intercept=True, selection=random, tol=1, warm_start=True;, score=-1.185 total time=   0.0s
[CV 5/5] END alpha=0.01, copy_X=False, fit_intercept=False, selection=random, tol=0.01, warm_start=False;, score=-1.128 total time=   0.1s
[CV 4/5] END alpha=0.0001, copy_X=True, fit_intercept=False, selection=cyclic, tol=0.01, warm_start=False;, score=-1.335 total time=   0.6s
[CV 4/5] END alpha=0.01, copy_X=False, fit_intercept=False, selection=random, tol=0.01, warm_start=False;, score=-1.300 total time=   0.2s
[CV 5/5] END alpha=0.1, copy_X=False, fit_intercept=True, selection=random, tol=1, warm_start=True;, score=-1.139 total time=   0.0s
[CV 1/5] END alpha=10, copy_X=False, fit_intercept=True, selection=cyclic, tol=0.1, warm_start=True;, score=-1.361 total time=   0.0s
[CV 3/5] END alpha=0.01, copy_X=False, fit_interc

  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END alpha=0.0001, copy_X=True, fit_intercept=False, selection=cyclic, tol=0.01, warm_start=False;, score=-1.127 total time=   0.8s
[CV 1/5] END alpha=0.01, copy_X=False, fit_intercept=False, selection=cyclic, tol=0.0001, warm_start=False;, score=-1.175 total time=   0.0s
[CV 2/5] END alpha=0.01, copy_X=False, fit_intercept=False, selection=cyclic, tol=0.0001, warm_start=False;, score=-1.257 total time=   0.1s
[CV 3/5] END alpha=0.01, copy_X=False, fit_intercept=False, selection=cyclic, tol=0.0001, warm_start=False;, score=-1.208 total time=   0.1s
[CV 5/5] END alpha=0.01, copy_X=False, fit_intercept=False, selection=cyclic, tol=0.0001, warm_start=False;, score=-1.128 total time=   0.1s
[CV 1/5] END alpha=0.0001, copy_X=False, fit_intercept=True, selection=cyclic, tol=0.1, warm_start=False;, score=-1.175 total time=   0.1s
[CV 1/5] END alpha=10, copy_X=True, fit_intercept=False, selection=random, tol=1, warm_start=True;, score=-1.245 total time=   0.0s
[CV 2/5] END alpha=10, co

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END alpha=10, copy_X=False, fit_intercept=False, selection=random, tol=0.0001, warm_start=False;, score=-1.361 total time=   0.0s
[CV 5/5] END alpha=10, copy_X=False, fit_intercept=False, selection=random, tol=0.0001, warm_start=False;, score=-1.306 total time=   0.0s
[CV 4/5] END alpha=0.0001, copy_X=False, fit_intercept=True, selection=cyclic, tol=0.1, warm_start=False;, score=-1.174 total time=   0.4s
[CV 3/5] END alpha=0.0001, copy_X=False, fit_intercept=True, selection=cyclic, tol=0.1, warm_start=False;, score=-1.208 total time=   0.4s
[CV 1/5] END alpha=1, copy_X=True, fit_intercept=True, selection=random, tol=1, warm_start=False;, score=-1.211 total time=   0.0s
[CV 1/5] END alpha=0.0001, copy_X=False, fit_intercept=False, selection=random, tol=0.1, warm_start=True;, score=-1.175 total time=   0.1s
[CV 2/5] END alpha=1, copy_X=True, fit_intercept=True, selection=random, tol=1, warm_start=False;, score=-1.276 total time=   0.0s
[CV 3/5] END alpha=1, copy_X=True, fit_inte

  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END alpha=1, copy_X=True, fit_intercept=True, selection=random, tol=1, warm_start=False;, score=-1.156 total time=   0.0s
[CV 5/5] END alpha=0.0001, copy_X=False, fit_intercept=True, selection=cyclic, tol=0.1, warm_start=False;, score=-1.127 total time=   0.6s
[CV 1/5] END alpha=0.01, copy_X=True, fit_intercept=False, selection=random, tol=0.0001, warm_start=True;, score=-1.175 total time=   0.1s
[CV 2/5] END alpha=0.0001, copy_X=False, fit_intercept=False, selection=random, tol=0.1, warm_start=True;, score=-1.257 total time=   0.3s
[CV 1/5] END alpha=10, copy_X=True, fit_intercept=False, selection=cyclic, tol=10, warm_start=True;, score=-2.804 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END alpha=0.01, copy_X=True, fit_intercept=False, selection=random, tol=0.0001, warm_start=True;, score=-1.257 total time=   0.2s
[CV 2/5] END alpha=10, copy_X=True, fit_intercept=False, selection=cyclic, tol=10, warm_start=True;, score=-2.506 total time=   0.0s
[CV 3/5] END alpha=10, copy_X=True, fit_intercept=False, selection=cyclic, tol=10, warm_start=True;, score=-2.249 total time=   0.0s
[CV 4/5] END alpha=10, copy_X=True, fit_intercept=False, selection=cyclic, tol=10, warm_start=True;, score=-2.043 total time=   0.0s
[CV 3/5] END alpha=0.01, copy_X=True, fit_intercept=False, selection=random, tol=0.0001, warm_start=True;, score=-1.208 total time=   0.3s
[CV 5/5] END alpha=10, copy_X=True, fit_intercept=False, selection=cyclic, tol=10, warm_start=True;, score=-1.826 total time=   0.0s
[CV 3/5] END alpha=0.0001, copy_X=False, fit_intercept=False, selection=random, tol=0.1, warm_start=True;, score=-1.208 total time=   0.5s
[CV 5/5] END alpha=0.01, copy_X=True, fit_intercept

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END alpha=0.0001, copy_X=False, fit_intercept=False, selection=random, tol=0.1, warm_start=True;, score=-1.415 total time=   0.6s
[CV 5/5] END alpha=0.0001, copy_X=False, fit_intercept=False, selection=random, tol=0.1, warm_start=True;, score=-1.127 total time=   0.7s


In [12]:
# Report the outcome of the search: one line per fitted attribute, in fixed order
report_fields = [
    ('Best Score: %s', 'best_score_'),
    ('Best Hyperparameters: %s', 'best_params_'),
    ('Best Model: %s', 'best_estimator_'),
    ('Best Index: %s', 'best_index_'),
    ('CV Results: %s', 'cv_results_'),
    ('Refit Time: %s', 'refit_time_'),
    ('Scorer: %s', 'scorer_'),
]
for template, attribute in report_fields:
    # The one-element tuple keeps the value a single %s argument whatever its type
    print(template % (getattr(search, attribute),))

Best Score: -1.1884723020480603
Best Hyperparameters: {'warm_start': True, 'tol': 0.1, 'selection': 'random', 'fit_intercept': False, 'copy_X': True, 'alpha': 0.01}
Best Model: Lasso(alpha=0.01, fit_intercept=False, selection='random', tol=0.1,
      warm_start=True)
Best Index: 0
CV Results: {'mean_fit_time': array([0.20230336, 0.01973467, 0.01286464, 0.01205797, 0.0120935 ,
       0.01207685, 0.01598716, 0.44771676, 0.01251364, 0.01823215,
       0.02822599, 0.01173768, 0.16104641, 0.01366487, 0.01165557,
       0.01267858, 0.11152334, 0.34896798, 0.01869698, 0.01640182,
       0.02750273, 0.45132518, 0.01600237, 0.2139596 , 0.01280265]), 'std_fit_time': array([0.03853148, 0.01297029, 0.00350412, 0.00468362, 0.00490226,
       0.00509114, 0.00549363, 0.23849257, 0.00572655, 0.00998335,
       0.0126503 , 0.00476974, 0.06230539, 0.00517585, 0.00462655,
       0.00393931, 0.08431484, 0.17936416, 0.0085706 , 0.00406974,
       0.0133887 , 0.20019422, 0.00761008, 0.06214634, 0.00434028])