In [6]:
import sys
sys.path.append("/home/hugo/projetos-doutorado/Appliance-Energy-Prediction")
import pandas as pd
from pca_fts.PcaArimax import PcaArimax
from pyFTS.benchmarks import Measures
import matplotlib.pyplot as plt
from pyFTS.common import Util
import datetime
import statistics
import math

## Aux functions

In [2]:
def sample_first_prows(data, perc=0.75):
    return data.head(int(len(data)*(perc)))

In [7]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	"""
	Frame a time series as a supervised learning dataset.
	Arguments:
		data: Sequence of observations as a list or NumPy array.
		n_in: Number of lag observations as input (X).
		n_out: Number of observations as output (y).
		dropnan: Boolean whether or not to drop rows with NaN values.
	Returns:
		Pandas DataFrame of series framed for supervised learning.
	"""
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

## Dataset Energy Appliances

In [8]:
filename = '/home/hugo/projetos-doutorado/Appliance-Energy-Prediction/data/energydata_complete.csv'
data = pd.read_csv(filename)
data.pop('date')
data.pop('rv1')
data.pop('rv2')
data.head()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint
0,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3
1,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2
2,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1
3,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0
4,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9


In [9]:
values = data.values
values

array([[ 60.        ,  30.        ,  19.89      , ...,   7.        ,
         63.        ,   5.3       ],
       [ 60.        ,  30.        ,  19.89      , ...,   6.66666667,
         59.16666667,   5.2       ],
       [ 50.        ,  30.        ,  19.89      , ...,   6.33333333,
         55.33333333,   5.1       ],
       ...,
       [270.        ,  10.        ,  25.5       , ...,   3.66666667,
         25.33333333,  13.26666667],
       [420.        ,  10.        ,  25.5       , ...,   3.83333333,
         26.16666667,  13.23333333],
       [430.        ,  10.        ,  25.5       , ...,   4.        ,
         27.        ,  13.2       ]])

### Dataset Energy Appliances: columns

In [10]:
cols = data.columns
list_cols = list(cols)
list_cols.append('Appliances_t+1')
print(list_cols)

['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'Appliances_t+1']


## Dataset Energy Appliance: Preprocessing, Lag = 1, t+1

In [11]:
# frame as supervised learning
reframed = series_to_supervised(values, 1, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51]], axis=1, inplace=True)
#reframed.drop(reframed.columns[[0,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51]], axis=1, inplace=True)
reframed.head()

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var9(t-1),var10(t-1),...,var18(t-1),var19(t-1),var20(t-1),var21(t-1),var22(t-1),var23(t-1),var24(t-1),var25(t-1),var26(t-1),var1(t)
1,60.0,30.0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,60.0
2,60.0,30.0,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,50.0
3,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,50.0
4,50.0,40.0,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,60.0
5,60.0,40.0,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,50.0


In [12]:
data = pd.DataFrame(reframed.values,columns=list_cols)
data.head()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,Appliances_t+1
0,60.0,30.0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,60.0
1,60.0,30.0,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,50.0
2,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,50.0
3,50.0,40.0,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,60.0
4,60.0,40.0,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,50.0


## Model: PcaArimax - PCA-ARIMAX

### Define ARIMAX parameters 

In [30]:
pca_sarimax = PcaArimax(n_components = 2,
                       endogen_variable = 'Appliances_t+1',
                       order = [3, 1, 2])

In [31]:
reduced = pca_sarimax.apply_pca(data)
reduced.head(2)

Unnamed: 0,x,y,Appliances_t+1
0,-0.728215,0.596265,60.0
1,-0.727195,0.592026,50.0


In [32]:
exog = reduced.drop(labels=['Appliances_t+1'], axis=1)
exog.head(2)

Unnamed: 0,x,y
0,-0.728215,0.596265
1,-0.727195,0.592026


In [33]:
endog = reduced['Appliances_t+1']
endog.head(3)

0    60.0
1    50.0
2    50.0
Name: Appliances_t+1, dtype: float64

In [34]:
# Define arimax paraters using the embedded data 
# from pmdarima.arima import auto_arima
# step_wise=auto_arima(endog, 
#  exogenous= exog,
#  start_p=1, start_q=1, 
#  max_p=3, max_q=3, 
#  d=1, max_d=3,
#  trace=True, 
#  error_action='ignore', 
#  suppress_warnings=True, 
#  stepwise=True)

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=222093.125, Time=37.60 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=224694.705, Time=1.02 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=224620.388, Time=2.12 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=224531.032, Time=3.24 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=224692.715, Time=8.26 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=221930.549, Time=24.39 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=223273.092, Time=3.92 sec
 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=221867.373, Time=35.46 sec
 ARIMA(3,1,0)(0,0,0)[0] intercept   : AIC=222897.900, Time=3.95 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=221868.938, Time=49.25 sec
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=221872.197, Time=34.82 sec
 ARIMA(3,1,1)(0,0,0)[0]             : AIC=221865.455, Time=30.54 sec
 ARIMA(2,1,1)(0,0,0)[0]             : AIC=221928.622, Time=19.03 sec
 ARIMA(3,1,0)(0,0,0)[0]             : AIC=222895.922, Time=3.53 se

In [35]:
# Best model:  ARIMA(3,1,1)(0,0,0)[0]          
# Total fit time: 327.538 seconds
step_wise.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,19734.0
Model:,"SARIMAX(3, 1, 1)",Log Likelihood,-110925.728
Date:,"Sat, 19 Jun 2021",AIC,221865.455
Time:,21:57:32,BIC,221920.686
Sample:,0,HQIC,221883.537
,- 19734,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x,-275.2184,18.121,-15.188,0.000,-310.734,-239.703
y,-461.4016,19.961,-23.115,0.000,-500.524,-422.279
ar.L1,0.7001,0.007,104.994,0.000,0.687,0.713
ar.L2,-0.2046,0.004,-46.553,0.000,-0.213,-0.196
ar.L3,0.0685,0.005,12.844,0.000,0.058,0.079
ma.L1,-0.8748,0.006,-148.814,0.000,-0.886,-0.863
sigma2,4467.9016,14.190,314.874,0.000,4440.091,4495.713

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,369259.11
Prob(Q):,0.97,Prob(JB):,0.0
Heteroskedasticity (H):,0.8,Skew:,2.77
Prob(H) (two-sided):,0.0,Kurtosis:,23.46


### Test the model with the best parameters 

In [36]:
pca_arimax = PcaArimax(n_components = 2,
                       endogen_variable = 'Appliances_t+1',
                       order = [3,1,1])

In [37]:
train = sample_first_prows(data,0.75)
test = data.iloc[max(train.index):]
y_test = data.iloc[max(train.index):]['Appliances_t+1'].values

In [38]:
model, sarimax, pca_reduced_train = pca_arimax.run_train_model(train)



In [39]:
sarimax.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,14800.0
Model:,"SARIMAX(3, 1, 1)",Log Likelihood,-83572.613
Date:,"Sat, 19 Jun 2021",AIC,167207.226
Time:,22:01:47,BIC,167442.891
Sample:,0,HQIC,167285.469
,- 14800,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Appliances,-0.2132,0.022,-9.915,0.000,-0.255,-0.171
lights,0.3067,0.069,4.450,0.000,0.172,0.442
T1,-7.4136,8.490,-0.873,0.383,-24.053,9.226
RH_1,-15.6893,0.811,-19.350,0.000,-17.278,-14.100
T2,20.1966,5.236,3.857,0.000,9.935,30.459
RH_2,8.8978,1.553,5.729,0.000,5.854,11.942
T3,-77.1862,6.230,-12.390,0.000,-89.396,-64.976
RH_3,-9.2423,2.177,-4.246,0.000,-13.509,-4.976
T4,-44.3807,4.431,-10.016,0.000,-53.066,-35.696

0,1,2,3
Ljung-Box (L1) (Q):,0.09,Jarque-Bera (JB):,272325.9
Prob(Q):,0.76,Prob(JB):,0.0
Heteroskedasticity (H):,0.85,Skew:,2.82
Prob(H) (two-sided):,0.0,Kurtosis:,23.24


In [40]:
pca_reduced_train.head(2)

Unnamed: 0,x,y,Appliances_t+1
0,-0.795656,0.394111,60.0
1,-0.793946,0.386976,50.0


In [41]:
start = len(train)
end = len(train) + len(test) -1
forecast, pca_reduced_test = pca_arimax.run_test_model(test, sarimax,start,end)

In [50]:
list(forecast)

[347.14880966599685,
 432.5245459851285,
 493.3749876625034,
 540.088801361544,
 554.1734454628149,
 418.89578911880017,
 436.53287165790334,
 454.293066926376,
 384.7606944921481,
 370.90313416995104,
 369.27607204315154,
 376.7610815195303,
 432.7752260691195,
 460.2624265533059,
 485.4954812593096,
 494.7889508344024,
 541.3038392260896,
 575.589735864919,
 595.0641483258487,
 606.9450414437015,
 607.6726880397637,
 633.5626167245791,
 639.7346253494279,
 638.4473534497729,
 637.1115090028061,
 633.3406575755171,
 637.4068213009032,
 648.6159123162734,
 649.6630997180582,
 660.5303686773441,
 662.8449987669637,
 670.4215698560029,
 627.1252618218177,
 617.8016403764277,
 534.6230248880665,
 489.59088592692933,
 574.643248661605,
 569.5550376989586,
 476.83935277367436,
 484.15953609209464,
 501.18299335386837,
 569.6282556617552,
 613.4074507704217,
 619.8832936135286,
 658.4681024704132,
 677.8414786523754,
 659.0678427824741,
 661.950463098884,
 666.0099339476883,
 622.83641505594

In [44]:
pca_reduced_test['Appliances_t+1'].head(2)

0    280.0
1    280.0
Name: Appliances_t+1, dtype: float64

In [45]:
print(Measures.rmse(pca_reduced_test['Appliances_t+1'],forecast.values))
print(Measures.mape(pca_reduced_test['Appliances_t+1'],forecast.values))

272.77573136060977
322.62767298873143


## Model: PcaArimax - PCA-ARIMAX - Final model 

In [46]:
pca_sarimax = PcaArimax(n_components = 2,
                       endogen_variable = 'Appliances',
                       order=[3,1,1])

## Sliding Window PyFTS: PCA-ARIMAX

In [47]:
result = {
     "window": [],
     "rmse": [],
     "mape": [],
     "smape": []
}

tam = len(data)
n_windows = 30
windows_length = math.floor(tam / n_windows)
for ct, ttrain, ttest in Util.sliding_window(data, windows_length, 0.75, inc=1):
    if len(ttest) > 0:
        
        start = len(ttrain)
        end = len(ttrain)+ len(ttest) -1
        appliance = ttest['Appliances_t+1'].reset_index()
        
        print('-' * 20)
        print(f'training window {(ct)}')
        model, sarimax, pca_reduced_train = pca_sarimax.run_train_model(ttrain)
        forecast, pca_reduced_test = pca_sarimax.run_test_model(ttest,sarimax,start,end)
        
        print("[{0: %H:%M:%S}]".format(datetime.datetime.now()) + f" getting statistics")
        rmse = Measures.rmse(list(appliance.iloc[:,1]),list(forecast))
        mape = Measures.mape(list(appliance.iloc[:,1]),list(forecast))
        smape = Measures.smape(list(appliance.iloc[:,1]),list(forecast))
        
        result["rmse"].append(rmse)
        result["mape"].append(mape)
        result["smape"].append(smape)
        result["window"].append(ct)
        
measures = pd.DataFrame(result)

--------------------
training window 0




[ 22:02:53] getting statistics
--------------------
training window 657




[ 22:02:59] getting statistics
--------------------
training window 1314




[ 22:03:06] getting statistics
--------------------
training window 1971




[ 22:03:13] getting statistics
--------------------
training window 2628




[ 22:03:19] getting statistics
--------------------
training window 3285




[ 22:03:26] getting statistics
--------------------
training window 3942




[ 22:03:32] getting statistics
--------------------
training window 4599




[ 22:03:39] getting statistics
--------------------
training window 5256




[ 22:03:45] getting statistics
--------------------
training window 5913




[ 22:03:53] getting statistics
--------------------
training window 6570




[ 22:04:00] getting statistics
--------------------
training window 7227




[ 22:04:05] getting statistics
--------------------
training window 7884




[ 22:04:12] getting statistics
--------------------
training window 8541




[ 22:04:19] getting statistics
--------------------
training window 9198




[ 22:04:26] getting statistics
--------------------
training window 9855




[ 22:04:33] getting statistics
--------------------
training window 10512




[ 22:04:39] getting statistics
--------------------
training window 11169




[ 22:04:46] getting statistics
--------------------
training window 11826




[ 22:04:53] getting statistics
--------------------
training window 12483




[ 22:04:59] getting statistics
--------------------
training window 13140




[ 22:05:05] getting statistics
--------------------
training window 13797




[ 22:05:12] getting statistics
--------------------
training window 14454




[ 22:05:18] getting statistics
--------------------
training window 15111




[ 22:05:24] getting statistics
--------------------
training window 15768




[ 22:05:31] getting statistics
--------------------
training window 16425




[ 22:05:37] getting statistics
--------------------
training window 17082




[ 22:05:46] getting statistics
--------------------
training window 17739




[ 22:05:54] getting statistics
--------------------
training window 18396




[ 22:06:02] getting statistics
--------------------
training window 19053




[ 22:06:10] getting statistics


In [48]:
measures

Unnamed: 0,window,rmse,mape,smape
0,0,425.641545,453.795045,85.02587
1,657,877.006386,1165.350289,72.275187
2,1314,766.906447,994.003002,95.569425
3,1971,246.428125,648.741824,76.866543
4,2628,139.31538,134.078713,42.662548
5,3285,175.42357,242.698241,57.166792
6,3942,220.925024,253.061682,50.848908
7,4599,130.468043,144.876919,42.082949
8,5256,208.28201,313.907711,63.585635
9,5913,217.284225,107.330246,69.553473


### Mean Statistics PCA-ARIMAX

In [49]:
final_result = {
     "rmse": [],
     "mape": [],
     "smape": []
}

final_result["rmse"].append(statistics.mean(measures['rmse']))
final_result["mape"].append(statistics.mean(measures['mape']))
final_result["smape"].append(statistics.mean(measures['smape']))
        
final_measures_pca = pd.DataFrame(final_result)

print("Mean Statistics PCA-ARIMAX (test): ")
final_measures_pca

Mean Statistics PCA-ARIMAX (test): 


Unnamed: 0,rmse,mape,smape
0,349.911223,448.5939,69.179843
