In [55]:
# This notebook demonstrates how the experiment class works by calling each
# pipeline step by hand and showing what it does to the data.
import sys
# Add the parent directory to the import path so the `src` package imported
# below can be resolved (assumes the notebook is run from a direct
# subdirectory of the project root -- TODO confirm).
sys.path.append('..')

import pandas as pd
import numpy as np

from src.util import transformations
from src.util.ml_experiment import MLForecastingExperiment

from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor

# Route DataFrame.plot() calls through the plotly plotting backend.
pd.options.plotting.backend = "plotly"

In [56]:
# override base experiment so it can load in data correctly
class TempExperiment(MLForecastingExperiment):
    """Experiment variant that reads its input CSV from the local ``../data`` folder."""

    def _load_data(self):
        """Read ``self.data_file`` into ``self.data``, validate it, then index it.

        The CSV is read with ``date`` parsed as datetimes, ``self._validate_data()``
        is called while ``series`` and ``date`` are still ordinary columns, and
        the frame is finally re-indexed in place by ``(series, date)``.
        """
        csv_path = f'../data/{self.data_file}'
        self.data = pd.read_csv(csv_path, parse_dates=['date'])

        # validation runs before `series` / `date` are moved into the index
        self._validate_data()

        self.data.set_index(keys=['series', 'date'], inplace=True)

In [57]:
# Candidate regressors handed to the experiment.
# The tree is given a fixed seed: scikit-learn permutes the features at every
# split, so tied splits can otherwise be resolved differently between runs and
# the notebook would not reproduce its own outputs on Restart & Run All.
model_list = [
        Ridge(alpha = 1.0),
        Lasso(alpha = 0.1),
        DecisionTreeRegressor(min_samples_leaf = 5, random_state = 0),
]

In [78]:
# initialize the experiment for the m1_monthly.csv dataset
m1 = TempExperiment(
    exp_name='m1',
    data_file='m1_monthly.csv',
    models=model_list,
    lags=list(range(1, 13)),  # lags 1 through 12
    calibration_windows=[6, 12, 30],
    date_parts_to_encode=['month', 'quarter'],
    target_transform='log_diff_1_1',
    encode_entity=True,
    train_size=500,
    training_step_size=3,
)

In [79]:
# run the loading step by hand (this uses the TempExperiment._load_data override)
m1._load_data()

# display the loaded frame, which is indexed by (series, date)
m1.data

Unnamed: 0_level_0,Unnamed: 1_level_0,value
series,date,Unnamed: 2_level_1
T1,1975-12-01,697458.0
T1,1976-01-01,1187650.0
T1,1976-02-01,1069690.0
T1,1976-03-01,1078430.0
T1,1976-04-01,1059910.0
...,...,...
T617,1972-07-01,58.0
T617,1972-08-01,35.0
T617,1972-09-01,38.0
T617,1972-10-01,28.0


In [80]:
# create the results directory -- not important for this walkthrough
m1._create_results_directory()

In [81]:
# calculate the naive forecast; in the output below it shows up as a new
# `naive_forecast` column holding the previous row's `value`
m1._calc_naive_forecast()

# display the data with the new column
m1.data

Unnamed: 0_level_0,Unnamed: 1_level_0,value,naive_forecast
series,date,Unnamed: 2_level_1,Unnamed: 3_level_1
T1,1975-12-01,697458.0,
T1,1976-01-01,1187650.0,697458.0
T1,1976-02-01,1069690.0,1187650.0
T1,1976-03-01,1078430.0,1069690.0
T1,1976-04-01,1059910.0,1078430.0
...,...,...,...
T617,1972-07-01,58.0,43.0
T617,1972-08-01,35.0,58.0
T617,1972-09-01,38.0,35.0
T617,1972-10-01,28.0,38.0


In [82]:
# transform the target (the experiment was configured with 'log_diff_1_1');
# the output below gains a `target` column whose first two T1 rows are NaN
m1._transform_target()

# display the data with the new `target` column
m1.data

Unnamed: 0_level_0,Unnamed: 1_level_0,value,naive_forecast,target
series,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T1,1975-12-01,697458.0,,
T1,1976-01-01,1187650.0,697458.0,
T1,1976-02-01,1069690.0,1187650.0,-0.636897
T1,1976-03-01,1078430.0,1069690.0,0.112745
T1,1976-04-01,1059910.0,1078430.0,-0.025460
...,...,...,...,...
T617,1972-07-01,58.0,43.0,0.120076
T617,1972-08-01,35.0,58.0,-0.787366
T617,1972-09-01,38.0,35.0,0.574061
T617,1972-10-01,28.0,38.0,-0.376309


In [83]:
# build the modelling data set: the output below gains lag_1..lag_12, `entity`,
# `month` and `quarter` columns, and T1 now starts at 1977-02-01
# (presumably rows without a complete set of lags are dropped -- TODO confirm)
m1._create_X_y()

# display the widened frame
m1.data

Unnamed: 0_level_0,Unnamed: 1_level_0,value,naive_forecast,target,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12,entity,month,quarter
series,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
T1,1977-02-01,1513480.0,937352.0,0.116663,0.632786,-1.004426,1.143253,-0.026688,-0.405524,-0.522686,1.270137,-0.722623,0.015538,-0.025460,0.112745,-0.636897,T1,2,1
T1,1977-03-01,1138080.0,1513480.0,-0.764176,0.116663,0.632786,-1.004426,1.143253,-0.026688,-0.405524,-0.522686,1.270137,-0.722623,0.015538,-0.025460,0.112745,T1,3,1
T1,1977-04-01,687150.0,1138080.0,-0.219476,-0.764176,0.116663,0.632786,-1.004426,1.143253,-0.026688,-0.405524,-0.522686,1.270137,-0.722623,0.015538,-0.025460,T1,4,2
T1,1977-05-01,1366120.0,687150.0,1.191721,-0.219476,-0.764176,0.116663,0.632786,-1.004426,1.143253,-0.026688,-0.405524,-0.522686,1.270137,-0.722623,0.015538,T1,5,2
T1,1977-06-01,1083180.0,1366120.0,-0.919250,1.191721,-0.219476,-0.764176,0.116663,0.632786,-1.004426,1.143253,-0.026688,-0.405524,-0.522686,1.270137,-0.722623,T1,6,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T617,1972-07-01,58.0,43.0,0.120076,-0.346604,1.066419,-0.782932,-0.220370,1.249989,-0.618877,-1.090644,1.321756,-0.022473,-0.405982,0.045979,-0.223660,T617,7,3
T617,1972-08-01,35.0,58.0,-0.787366,0.120076,-0.346604,1.066419,-0.782932,-0.220370,1.249989,-0.618877,-1.090644,1.321756,-0.022473,-0.405982,0.045979,T617,8,3
T617,1972-09-01,38.0,35.0,0.574061,-0.787366,0.120076,-0.346604,1.066419,-0.782932,-0.220370,1.249989,-0.618877,-1.090644,1.321756,-0.022473,-0.405982,T617,9,3
T617,1972-10-01,28.0,38.0,-0.376309,0.574061,-0.787366,0.120076,-0.346604,1.066419,-0.782932,-0.220370,1.249989,-0.618877,-1.090644,1.321756,-0.022473,T617,10,4


In [84]:
# build the encoding steps
m1._build_data_encoder()

# inspect the `data_encoder` attribute (presumably populated by the call above)
m1.data_encoder

In [85]:
# fit a single model -- model_list[2], the DecisionTreeRegressor -- and keep the
# returned frame; the log printed below reports one fit per window size (6, 12, 30)
results = m1._fit_data(model_list[2])

Fitting model w/ a window size of: 6
Fitting model w/ a window size of: 12
Fitting model w/ a window size of: 30


In [86]:
# show only the rows without missing values (the result is not assigned, so
# `results` itself is left unchanged)
results.dropna()

Unnamed: 0,series,date,y_pred_6,y_pred_12,y_pred_30,y_true
8,T1,1979-08-01,-0.610108,-1.445307,-1.315335,-1.621743
9,T1,1979-11-01,-0.493887,-0.534323,-0.484465,-1.527742
18,T10,1979-01-01,-0.227863,-0.431246,0.180444,-0.037932
19,T10,1979-04-01,-0.049783,-0.046287,-0.365018,-0.493144
28,T100,1981-08-01,-0.059284,0.064093,0.208484,0.212889
...,...,...,...,...,...,...
6149,T97,1981-11-01,0.977138,0.858155,0.455156,1.266110
6158,T98,1981-08-01,0.009007,0.026919,0.118384,-0.021722
6159,T98,1981-11-01,0.024040,-0.036493,0.023988,0.036360
6168,T99,1981-08-01,0.009007,0.026919,0.118384,-0.021722


In [87]:
# convert the forecasts back to the original scale; in the `res` output the
# y_pred_* columns are in level units and a `value` column is present, while
# y_true is still in transformed units
res = m1._inverse_transform_target(results)

In [92]:
# (stray `?pd.DataFrame` help lookup removed -- it was interactive debugging
# residue, is IPython-only syntax, and plays no part in the walkthrough)

[0;31mInit signature:[0m
[0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0;34m'Axes | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0;34m'Axes | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0;34m'Dtype | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m:[0m [0;34m'bool | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'None'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Two-dimensional, size-mutable, potentially heterogeneous tabular data.

Data structure also contains labeled axes (rows and columns).
Arithmetic operations align on both row and column labels. Can be
thought of as a dict-like container for Series ob

In [88]:
# inspect the back-transformed forecasts
res

Unnamed: 0,series,date,y_pred_6,y_pred_12,y_pred_30,y_true,value
8,T1,1979-08-01,333620.735202,144720.216213,164806.868785,-1.621743,630177.0
9,T1,1979-11-01,109186.866542,19730.896456,26896.578672,-1.527742,622073.0
18,T10,1979-01-01,3481.687912,2840.753248,5237.898844,-0.037932,5214.0
19,T10,1979-04-01,2415.262947,1613.382418,3988.242804,-0.493144,4775.0
28,T100,1981-08-01,440.418236,498.381337,575.953397,0.212889,403.0
...,...,...,...,...,...,...,...
6149,T97,1981-11-01,931.245584,956.407160,909.924727,1.266110,156.0
6158,T98,1981-08-01,398.253761,405.469645,444.400417,-0.021722,428.0
6159,T98,1981-11-01,395.315544,385.643155,492.198496,0.036360,411.0
6168,T99,1981-08-01,398.253761,405.469645,444.400417,-0.021722,428.0


In [54]:
# look at the back-transformed forecasts for a single series
res[res['series'] == 'T2']

Unnamed: 0,series,date,y_pred_6,y_pred_12,y_pred_48,y_pred_72,y_true,value
5128,T2,2013-08-18,22211.0,22211.0,22211.0,22211.0,-0.028712,22211.0
5129,T2,2013-09-08,20957.671011,22532.272914,21321.431796,21148.093516,0.002925,22253.0
5130,T2,2013-09-29,21120.095236,24508.213592,21731.542303,21736.971763,-0.035494,19511.0
5131,T2,2013-10-20,20729.179919,25587.76929,20722.382783,22399.083422,-0.037233,19058.0
5132,T2,2013-11-10,21001.33399,25615.050949,20355.134156,22116.147065,0.007491,19027.0
5133,T2,2013-12-01,20690.790775,25259.973868,20928.487889,22203.32937,0.013539,19408.0
5134,T2,2013-12-22,21037.254468,25821.761304,21598.143446,22993.197899,-0.004613,18598.0
5135,T2,2014-01-12,20887.049553,26056.728551,21195.544584,23505.930737,0.004373,19481.0
5136,T2,2014-02-02,21579.7464,26891.002383,21751.759197,23917.050187,-0.113234,17648.0
5137,T2,2014-02-23,21303.394198,28004.585106,20475.541809,24150.484265,-0.065304,15751.0


In [77]:
# Make sure the actual `value` for every (series, date) sits next to the forecasts.
# `res` as returned by `_inverse_transform_target` already carries a `value`
# column; merging it in a second time would leave `value_x` / `value_y` and no
# plain `value`, which breaks the later `value_col = 'value'` lookups. The merge
# is therefore only performed when the column is actually missing.
if 'value' in res.columns:
    # copy so that later column assignments on `results` do not touch `res`
    results = res.copy()
else:
    results = res.merge(m1.data[['value']],
                        left_on = ['series', 'date'],
                        right_index = True, how = 'left')

# keep only rows where every column is populated
results = results.dropna()

In [None]:
# NOTE(review): the return value of this first call (for `y_true`) is neither
# assigned nor displayed -- unless the helper mutates `results` in place it has
# no effect; confirm whether `results['y_true'] = ...` was intended.
transformations.transform_target_prediction(results, 
                                      entity_col = 'series',
                                      transform_function = transformations.transform_log_difference,
                                      target_col = 'y_true',
                                      value_col = 'value')

# every column whose name contains 'pred' (the y_pred_<window> forecasts)
pred_cols = [col for col in results if 'pred' in col]

# overwrite each forecast column with the helper's output for that column
# NOTE(review): `results` is derived from `res`, whose y_pred_* columns already
# appear to be back-transformed to level units -- check that the conversion is
# not being applied a second time here.
for col in pred_cols:
    results[col] = transformations.transform_target_prediction(results,
                                                               entity_col = 'series',
                                                               transform_function = transformations.transform_log_difference,
                                                               target_col = col,
                                                               value_col = 'value')

In [None]:
# plot the shortest- and longest-window forecasts against the actual values for
# one series; the column names must match calibration_windows = [6, 12, 30]
# (the 'y_pred_3' / 'y_pred_72' columns used before are not produced by this
# configuration and raised a KeyError)
results.loc[results.series == 'T45'].plot(x = 'date', y = ['y_pred_6', 'y_pred_30', 'value'])

In [None]:
# simple averaging ensembles over the per-window forecasts; the column names must
# match calibration_windows = [6, 12, 30] (the 'y_pred_3' / 'y_pred_72' columns
# used before do not exist for this configuration and raised a KeyError)
results['ensemble1'] = results[['y_pred_6', 'y_pred_30']].mean(axis = 1)   # shortest + longest window
results['ensemble2'] = results[['y_pred_12', 'y_pred_30']].mean(axis = 1)  # middle + longest window
results['ensemble3'] = results[pred_cols].mean(axis = 1)                   # all windows

In [89]:
# score the forecasts in `res`; the resulting table has one row per
# y_pred_<window> column plus one for the naive forecast
metrics = m1._calculate_error_metrics(res)

In [91]:
# display the back-transformed forecasts that were just scored
res

Unnamed: 0,series,date,y_pred_6,y_pred_12,y_pred_30,y_true,value
8,T1,1979-08-01,333620.735202,144720.216213,164806.868785,-1.621743,630177.0
9,T1,1979-11-01,109186.866542,19730.896456,26896.578672,-1.527742,622073.0
18,T10,1979-01-01,3481.687912,2840.753248,5237.898844,-0.037932,5214.0
19,T10,1979-04-01,2415.262947,1613.382418,3988.242804,-0.493144,4775.0
28,T100,1981-08-01,440.418236,498.381337,575.953397,0.212889,403.0
...,...,...,...,...,...,...,...
6149,T97,1981-11-01,931.245584,956.407160,909.924727,1.266110,156.0
6158,T98,1981-08-01,398.253761,405.469645,444.400417,-0.021722,428.0
6159,T98,1981-11-01,395.315544,385.643155,492.198496,0.036360,411.0
6168,T99,1981-08-01,398.253761,405.469645,444.400417,-0.021722,428.0


In [90]:
# error metrics per forecast column
# NOTE(review): in the output below every fitted window is far worse than the
# naive forecast (MAE ~52k-84k vs ~1.5k, strongly negative r2) -- worth checking
# the back-transformation of the predictions before judging the models.
metrics

Unnamed: 0,model,mae,rmse,mape,r2
0,y_pred_6,52561.573627,548705.467561,3.561959,-219.816219
1,y_pred_12,63094.963975,783457.107158,7.492122,-449.176524
2,y_pred_30,83644.733052,963718.002961,11.521603,-680.165022
3,naive_forecast,1468.667123,24237.663965,0.156371,0.754921


In [None]:
# list the attributes and methods available on the experiment object
# (`electricity` is not defined anywhere in this notebook; the experiment here is `m1`)
dir(m1)

In [None]:
# error metrics table (model, mae, rmse, mape, r2)
metrics

In [None]:
# display the current `results` frame
results

In [None]:
# TODO: transform every prediction column back to the original scale
# step 1:  merge the true `value` column back into the results
# step 2:  run the conversion function and inspect the resulting values