In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ParameterGrid

In [2]:
# Set parameters for linear regression

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]
# Number of features to consider at every split
max_features = [1.0, 'sqrt', 'log2']
# Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
#max_depth.append(None)
# Minimum number of samples required to split a node
#min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

warm_start = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               #'max_depth': max_depth,
               #'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'warm_start': warm_start}

grid = ParameterGrid(random_grid)
cnt = 0
for p in grid:
    cnt = cnt+1

print('Total Possible Models',cnt)


Total Possible Models 360


In [3]:
#Define MASE Metric
def mean_absolute_scaled_error(y_true, y_pred):
    mase=0
    # Define numerator as the forecast error
    numerator = (np.abs(y_true - y_pred))

    # Define denominator as the mean absolute error of the in-sample one-step naive forecast
    y_true_ohne_1 = y_true[1:].reset_index(drop=True)
    y_true_ohne_ende = y_true[:-1].reset_index(drop=True)
    denominator = np.mean(np.abs(y_true_ohne_1 - y_true_ohne_ende))

    mase = np.mean(np.abs(numerator / denominator))

    return mase

scorer_mase= make_scorer(mean_absolute_scaled_error, greater_is_better=False)


In [4]:
# Load the data
data = pd.read_parquet('/Users/paddy/Documents/GitHub/Masterthesis_ML/data/03_15min_dataset.parquet')

# Convert the date column to datetime
data['date'] = pd.to_datetime(data['date']) #,format='%d/%m/%y %H:%M:%S').dt.strftime('%Y-%m-%d %H:%M:%S') 


In [5]:
#Feature engineering
# Create a new column for the time
data['time'] = [x for x in range(0, len(data))]

# Drop nan values
data = data.dropna()

# Rename column count to y
data = data.rename(columns={'count': 'y'})

In [6]:
# Set y to the last column
cols = list(data.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('y')) #Remove y from list
data = data[cols+['y']] #Create new dataframe with columns in the order you want

# Define the feature columns and the target column
feature_cols = ['time']
target_col = 'y'

# drop the date column
train_data = np.delete(data, 0, 1) 

# Split the data into X and y
X_train, y_train = train_data[:, :-1], train_data[:, -1]


In [7]:
#vgl: https://lightrun.com/answers/scikit-learn-scikit-learn-grid_search-feeding-parameters-to-scorer-functions

# X and y to pandas dataframe
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)


In [8]:
# Set Cross Validation to 5 iterations
tscv = TimeSeriesSplit(n_splits=5)

In [9]:
# Instantiate model
model = RandomForestRegressor()

search = RandomizedSearchCV(estimator = model, 
                            param_distributions = random_grid, 
                            n_iter = 25, 
                            cv = tscv,
                            refit=True, 
                            verbose=3, 
                            random_state=42, 
                            n_jobs = -1, 
                            scoring=scorer_mase, #make_scorer(scorer_mase, greater_is_better=True), #'neg_root_mean_squared_error', #
                            error_score=np.nan)


In [10]:
# Fit the random search model
search.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[CV 1/5] END bootstrap=True, max_features=1.0, min_samples_leaf=4, n_estimators=64, warm_start=True;, score=-1.475 total time=   1.0s
[CV 2/5] END bootstrap=True, max_features=1.0, min_samples_leaf=4, n_estimators=64, warm_start=True;, score=-1.523 total time=   1.8s
[CV 1/5] END bootstrap=False, max_features=1.0, min_samples_leaf=4, n_estimators=118, warm_start=True;, score=-1.538 total time=   3.2s
[CV 3/5] END bootstrap=True, max_features=1.0, min_samples_leaf=4, n_estimators=64, warm_start=True;, score=-2.543 total time=   3.6s
[CV 2/5] END bootstrap=False, max_features=1.0, min_samples_leaf=4, n_estimators=118, warm_start=True;, score=-1.623 total time=   5.4s
[CV 4/5] END bootstrap=True, max_features=1.0, min_samples_leaf=4, n_estimators=64, warm_start=True;, score=-1.533 total time=   4.3s
[CV 1/5] END bootstrap=False, max_features=sqrt, min_samples_leaf=4, n_estimators=118, warm_start=False;, score=-1.538 total time=   2.8s
[CV 5/5] END bootstrap=True, max_features=1.0, min_sam



[CV 3/5] END bootstrap=True, max_features=1.0, min_samples_leaf=4, n_estimators=445, warm_start=False;, score=-2.451 total time=  27.4s
[CV 2/5] END bootstrap=True, max_features=log2, min_samples_leaf=1, n_estimators=118, warm_start=True;, score=-1.475 total time=   3.4s
[CV 3/5] END bootstrap=True, max_features=log2, min_samples_leaf=1, n_estimators=118, warm_start=True;, score=-2.474 total time=   7.9s
[CV 1/5] END bootstrap=True, max_features=sqrt, min_samples_leaf=2, n_estimators=64, warm_start=True;, score=-1.427 total time=   0.8s
[CV 4/5] END bootstrap=True, max_features=log2, min_samples_leaf=1, n_estimators=118, warm_start=True;, score=-1.522 total time=   8.8s
[CV 2/5] END bootstrap=True, max_features=sqrt, min_samples_leaf=2, n_estimators=64, warm_start=True;, score=-1.596 total time=   1.9s
[CV 4/5] END bootstrap=True, max_features=1.0, min_samples_leaf=4, n_estimators=445, warm_start=False;, score=-1.520 total time=  33.2s
[CV 5/5] END bootstrap=True, max_features=log2, mi

In [11]:
print('Best Score: %s' % search.best_score_)
print('Best Hyperparameters: %s' % search.best_params_)
print('Best Model: %s' % search.best_estimator_)
print('Best Index: %s' % search.best_index_)
print('CV Results: %s' % search.cv_results_)
print('Refit Time: %s' % search.refit_time_)
print('Scorer: %s' % search.scorer_)

Best Score: -1.6430384344365294
Best Hyperparameters: {'warm_start': True, 'n_estimators': 64, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}
Best Model: RandomForestRegressor(bootstrap=False, max_features='log2', n_estimators=64,
                      warm_start=True)
Best Index: 3
CV Results: {'mean_fit_time': array([10.81496973,  3.07162871, 10.71447973,  6.24638677, 21.47576098,
       30.9874146 , 21.20010138, 23.2517818 , 22.49274583,  6.37875676,
        3.32053976, 37.51350036, 28.05585389, 51.26398525, 27.21929245,
       10.28269086,  6.64564323,  1.07454014, 29.70121408, 44.05998158,
        0.56690474,  6.4626389 , 12.07586379, 18.33156376, 33.17904754]), 'std_fit_time': array([ 5.46594498,  1.45229919,  5.68895241,  3.12242652, 10.92367747,
       16.23160549, 11.49214335, 12.11856602, 11.12531345,  3.26979772,
        1.71068267, 19.31920593, 14.67116302, 26.5328159 , 14.16510546,
        5.30897054,  3.37178605,  0.55691507, 14.99511204, 22.01454712,
