In [10]:
import sys
sys.path.append("/home/hugo/projetos-doutorado/Appliance-Energy-Prediction")
import pandas as pd
from pca_fts.PcaSarimax import PcaSarimax
from pyFTS.benchmarks import Measures
import matplotlib.pyplot as plt
from pyFTS.common import Util
import datetime
import statistics
import math

## Aux functions

In [11]:
def sample_first_prows(data, perc=0.75):
    return data.head(int(len(data)*(perc)))

In [12]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	"""
	Frame a time series as a supervised learning dataset.
	Arguments:
		data: Sequence of observations as a list or NumPy array.
		n_in: Number of lag observations as input (X).
		n_out: Number of observations as output (y).
		dropnan: Boolean whether or not to drop rows with NaN values.
	Returns:
		Pandas DataFrame of series framed for supervised learning.
	"""
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

## Dataset Energy Appliances

In [13]:
filename = '/home/hugo/projetos-doutorado/Appliance-Energy-Prediction/data/energydata_complete.csv'
data = pd.read_csv(filename)
data.pop('date')
data.pop('rv1')
data.pop('rv2')
data.head()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint
0,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3
1,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2
2,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1
3,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0
4,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9


In [14]:
values = data.values
values

array([[ 60.        ,  30.        ,  19.89      , ...,   7.        ,
         63.        ,   5.3       ],
       [ 60.        ,  30.        ,  19.89      , ...,   6.66666667,
         59.16666667,   5.2       ],
       [ 50.        ,  30.        ,  19.89      , ...,   6.33333333,
         55.33333333,   5.1       ],
       ...,
       [270.        ,  10.        ,  25.5       , ...,   3.66666667,
         25.33333333,  13.26666667],
       [420.        ,  10.        ,  25.5       , ...,   3.83333333,
         26.16666667,  13.23333333],
       [430.        ,  10.        ,  25.5       , ...,   4.        ,
         27.        ,  13.2       ]])

### Dataset Energy Appliances: columns

In [15]:
cols = data.columns
list_cols = list(cols)
list_cols.append('Appliances_t+1')
print(list_cols)

['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'Appliances_t+1']


## Dataset Energy Appliance: Preprocessing, Lag = 1, t+1

In [16]:
# frame as supervised learning
reframed = series_to_supervised(values, 1, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51]], axis=1, inplace=True)
#reframed.drop(reframed.columns[[0,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51]], axis=1, inplace=True)
reframed.head()

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var9(t-1),var10(t-1),...,var18(t-1),var19(t-1),var20(t-1),var21(t-1),var22(t-1),var23(t-1),var24(t-1),var25(t-1),var26(t-1),var1(t)
1,60.0,30.0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,60.0
2,60.0,30.0,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,50.0
3,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,50.0
4,50.0,40.0,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,60.0
5,60.0,40.0,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,50.0


In [17]:
data = pd.DataFrame(reframed.values,columns=list_cols)
data.head()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,Appliances_t+1
0,60.0,30.0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,60.0
1,60.0,30.0,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,50.0
2,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,50.0
3,50.0,40.0,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,60.0
4,60.0,40.0,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,50.0


## Model: PcaSarimax - PCA-SARIMAX

### Define SARIMAX parameters 

In [18]:
pca_sarimax = PcaSarimax(n_components = 2,
                       endogen_variable = 'Appliances_t+1',
                       order = [3, 1, 2],
                       seasonal_order = [0,0,0,0])

In [19]:
train = sample_first_prows(data, perc=0.30)

In [20]:
reduced = pca_sarimax.apply_pca(train)
reduced.head(2)

Unnamed: 0,x,y,Appliances_t+1
0,-0.381911,0.313501,60.0
1,-0.37434,0.307751,50.0


In [21]:
exog = reduced.drop(labels=['Appliances_t+1'], axis=1)
exog.head(2)

Unnamed: 0,x,y
0,-0.381911,0.313501
1,-0.37434,0.307751


In [22]:
endog = reduced['Appliances_t+1']
endog.head(3)

0    60.0
1    50.0
2    50.0
Name: Appliances_t+1, dtype: float64

In [None]:
# Define arimax paraters using the embedded data 
from pmdarima.arima import auto_arima
# sarimax_model = auto_arima(endog,
#                        exogenous= exog,
#                        start_p=1,
#                        start_q=1,
#                        max_p=3,
#                        max_q=3,
#                        m=12,
#                        seasonal=True,
#                        d=None,
#                        D=1,
#                        trace=True,
#                        error_action='ignore',
#                        suppress_warnings=True,
#                        stepwise=True)

Performing stepwise search to minimize aic
 ARIMA(1,0,1)(1,1,1)[12] intercept   : AIC=inf, Time=96.68 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=123755.582, Time=1.28 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=115545.609, Time=30.81 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=79.30 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=123753.619, Time=8.18 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=118081.613, Time=2.47 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=114529.897, Time=65.44 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=142.28 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=79.64 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=121775.581, Time=104.61 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=114443.738, Time=78.58 sec
 ARIMA(2,0,0)(1,1,0)[12] intercept   : AIC=115446.443, Time=34.68 sec


In [35]:
# Best model:  ARIMA(3,1,1)(0,0,0)[0]          
# Total fit time: 327.538 seconds
sarimax_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,19734.0
Model:,"SARIMAX(3, 1, 1)",Log Likelihood,-110925.728
Date:,"Sat, 19 Jun 2021",AIC,221865.455
Time:,21:57:32,BIC,221920.686
Sample:,0,HQIC,221883.537
,- 19734,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x,-275.2184,18.121,-15.188,0.000,-310.734,-239.703
y,-461.4016,19.961,-23.115,0.000,-500.524,-422.279
ar.L1,0.7001,0.007,104.994,0.000,0.687,0.713
ar.L2,-0.2046,0.004,-46.553,0.000,-0.213,-0.196
ar.L3,0.0685,0.005,12.844,0.000,0.058,0.079
ma.L1,-0.8748,0.006,-148.814,0.000,-0.886,-0.863
sigma2,4467.9016,14.190,314.874,0.000,4440.091,4495.713

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,369259.11
Prob(Q):,0.97,Prob(JB):,0.0
Heteroskedasticity (H):,0.8,Skew:,2.77
Prob(H) (two-sided):,0.0,Kurtosis:,23.46


### Test the model with the best parameters 

In [23]:
pca_arimax = PcaSarimax(n_components = 2,
                       endogen_variable = 'Appliances_t+1',
                       order=[0,0,1],
                       seasonal_order=[0,1,2,12])

In [24]:
train = sample_first_prows(data,0.75)
test = data.iloc[max(train.index):]
y_test = data.iloc[max(train.index):]['Appliances_t+1'].values

In [25]:
model, sarimax, pca_reduced_train = pca_arimax.run_train_model(train)



In [28]:
sarimax.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,14800.0
Model:,"SARIMAX(0, 0, 1)x(0, 1, [1, 2], 12)",Log Likelihood,-83760.205
Date:,"Sun, 20 Jun 2021",AIC,167580.409
Time:,01:11:36,BIC,167808.404
Sample:,0,HQIC,167656.114
,- 14800,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Appliances,0.6200,0.006,112.141,0.000,0.609,0.631
lights,0.5320,0.073,7.304,0.000,0.389,0.675
T1,-18.2417,2.044,-8.924,0.000,-22.248,-14.235
RH_1,2.5814,0.498,5.185,0.000,1.606,3.557
T2,9.0255,1.924,4.692,0.000,5.255,12.796
RH_2,-0.2553,0.742,-0.344,0.731,-1.710,1.200
T3,-9.3194,0.953,-9.776,0.000,-11.188,-7.451
RH_3,4.6974,0.765,6.138,0.000,3.197,6.198
T4,-5.6839,1.052,-5.405,0.000,-7.745,-3.623

0,1,2,3
Ljung-Box (L1) (Q):,21.77,Jarque-Bera (JB):,265181.79
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,0.88,Skew:,2.95
Prob(H) (two-sided):,0.0,Kurtosis:,22.91


In [29]:
pca_reduced_train.head(2)

Unnamed: 0,x,y,Appliances_t+1
0,-0.795656,0.394111,60.0
1,-0.793946,0.386976,50.0


In [30]:
start = len(train)
end = len(train) + len(test) -1
forecast, pca_reduced_test = pca_arimax.run_test_model(test, sarimax,start,end)

In [31]:
list(forecast)

[269.0879371047459,
 200.04214856940538,
 215.64515946346086,
 247.9554519365009,
 272.3808477628242,
 178.08887716946464,
 108.74521211848526,
 128.68807830985367,
 114.05414289350085,
 110.80572501011616,
 109.24924422474464,
 121.24813502753051,
 130.1465027544996,
 104.81635525438668,
 192.29775846846854,
 111.27229797016798,
 108.51319516569035,
 112.95922113753929,
 113.95404545338013,
 116.88535881685462,
 112.47850254644356,
 120.18346827936702,
 107.7431783461347,
 104.8853958164714,
 115.98694519974708,
 70.67340463907925,
 81.50601546550675,
 92.90229279434271,
 89.83779541555376,
 94.19201401036221,
 88.60369498229889,
 80.64481031992563,
 81.35638521157134,
 87.68024208373535,
 89.87115519093894,
 107.72011962994765,
 119.22691170524106,
 68.98389072071615,
 82.86870393023679,
 92.39843541917125,
 158.40123048171054,
 404.54168251577767,
 345.39338512149106,
 181.77239963940153,
 130.684994163816,
 133.18296599091988,
 136.88500394276235,
 130.2897836337277,
 135.499745030

In [32]:
pca_reduced_test['Appliances_t+1'].head(2)

0    280.0
1    280.0
Name: Appliances_t+1, dtype: float64

In [33]:
print(Measures.rmse(pca_reduced_test['Appliances_t+1'],forecast.values))
print(Measures.mape(pca_reduced_test['Appliances_t+1'],forecast.values))

65.51587735810429
50.569772285131684


## Model: PcaSarimax - PCA-SARIMAX - Final model 

In [35]:
pca_sarimax = PcaSarimax(n_components = 2,
                       endogen_variable = 'Appliances',
                       order=[0,0,1],
                       seasonal_order=[0,1,2,12])

## Sliding Window PyFTS: PCA-SARIMAX

In [36]:
result = {
     "window": [],
     "rmse": [],
     "mape": [],
     "smape": []
}

tam = len(data)
n_windows = 30
windows_length = math.floor(tam / n_windows)
for ct, ttrain, ttest in Util.sliding_window(data, windows_length, 0.75, inc=1):
    if len(ttest) > 0:
        
        start = len(ttrain)
        end = len(ttrain)+ len(ttest) -1
        appliance = ttest['Appliances_t+1'].reset_index()
        
        print('-' * 20)
        print(f'training window {(ct)}')
        model, sarimax, pca_reduced_train = pca_sarimax.run_train_model(ttrain)
        forecast, pca_reduced_test = pca_sarimax.run_test_model(ttest,sarimax,start,end)
        
        print("[{0: %H:%M:%S}]".format(datetime.datetime.now()) + f" getting statistics")
        rmse = Measures.rmse(list(appliance.iloc[:,1]),list(forecast))
        mape = Measures.mape(list(appliance.iloc[:,1]),list(forecast))
        smape = Measures.smape(list(appliance.iloc[:,1]),list(forecast))
        
        result["rmse"].append(rmse)
        result["mape"].append(mape)
        result["smape"].append(smape)
        result["window"].append(ct)
        
measures = pd.DataFrame(result)

--------------------
training window 0




[ 01:13:23] getting statistics
--------------------
training window 657




[ 01:14:06] getting statistics
--------------------
training window 1314




[ 01:14:47] getting statistics
--------------------
training window 1971




[ 01:15:25] getting statistics
--------------------
training window 2628




[ 01:16:02] getting statistics
--------------------
training window 3285




[ 01:16:41] getting statistics
--------------------
training window 3942




[ 01:17:19] getting statistics
--------------------
training window 4599




[ 01:18:00] getting statistics
--------------------
training window 5256




[ 01:18:36] getting statistics
--------------------
training window 5913




[ 01:19:14] getting statistics
--------------------
training window 6570




[ 01:19:53] getting statistics
--------------------
training window 7227




[ 01:20:30] getting statistics
--------------------
training window 7884




[ 01:21:04] getting statistics
--------------------
training window 8541




[ 01:21:41] getting statistics
--------------------
training window 9198




[ 01:22:21] getting statistics
--------------------
training window 9855




[ 01:22:59] getting statistics
--------------------
training window 10512




[ 01:23:35] getting statistics
--------------------
training window 11169




[ 01:24:10] getting statistics
--------------------
training window 11826




[ 01:24:44] getting statistics
--------------------
training window 12483




[ 01:25:21] getting statistics
--------------------
training window 13140




[ 01:25:57] getting statistics
--------------------
training window 13797




[ 01:26:32] getting statistics
--------------------
training window 14454




[ 01:27:08] getting statistics
--------------------
training window 15111




[ 01:27:44] getting statistics
--------------------
training window 15768




[ 01:28:21] getting statistics
--------------------
training window 16425




[ 01:28:57] getting statistics
--------------------
training window 17082




[ 01:29:32] getting statistics
--------------------
training window 17739




[ 01:30:08] getting statistics
--------------------
training window 18396




[ 01:30:43] getting statistics
--------------------
training window 19053




[ 01:31:21] getting statistics


In [37]:
measures

Unnamed: 0,window,rmse,mape,smape
0,0,110.407424,171.538154,40.002577
1,657,76.728466,52.091863,19.970043
2,1314,144.760349,104.349065,36.27479
3,1971,78.921956,76.37616,37.351129
4,2628,68.094366,58.125543,26.973079
5,3285,85.784635,139.090408,35.551728
6,3942,160.565957,141.17158,64.481796
7,4599,71.504127,82.806007,42.377377
8,5256,256.511407,378.484733,58.764752
9,5913,59.726929,41.385018,27.596892


### Mean Statistics PCA-ARIMAX

In [38]:
final_result = {
     "rmse": [],
     "mape": [],
     "smape": []
}

final_result["rmse"].append(statistics.mean(measures['rmse']))
final_result["mape"].append(statistics.mean(measures['mape']))
final_result["smape"].append(statistics.mean(measures['smape']))
        
final_measures_pca = pd.DataFrame(final_result)

print("Mean Statistics PCA-ARIMAX (test): ")
final_measures_pca

Mean Statistics PCA-ARIMAX (test): 


Unnamed: 0,rmse,mape,smape
0,101.434976,110.057085,40.627907
