In [1]:
import sys
sys.path.append("/home/hugo/projetos-doutorado/Appliance-Energy-Prediction")
import pandas as pd
from pca_fts.PcaSarimax import PcaSarimax
from pyFTS.benchmarks import Measures
import matplotlib.pyplot as plt
from pyFTS.common import Util
import datetime
import statistics
import math
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## Aux functions

In [2]:
def sample_first_prows(data, perc=0.75):
    return data.head(int(len(data)*(perc)))

In [3]:
# convert series to supervised learning
def series_to_supervised_miso(data, n_in, n_out, endog_var, dropnan=True):
	"""
	Frame a time series as a supervised learning dataset.
	Arguments:
		data: Sequence of observations as a list or NumPy array.
		n_in: Number of lag observations as input (X).
		n_out: Number of observations as output (y).
		dropnan: Boolean whether or not to drop rows with NaN values.
	Returns:
		Pandas DataFrame of series framed for supervised learning.
	"""
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [(df.columns[j]+'(t-%d)' % (i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df[endog_var].shift(-i))
		if i == 0:
			names += [(endog_var+'(t)')]
		else:
			names += [(endog_var+'(t+%d)' % (i))]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

In [4]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame) 
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [5]:
def cal_nrmse(rmse, y):
    x = max(y)-min(y)
    return (rmse/x)

## Dataset Household Power Consumption

In [16]:
filepath = '/home/hugo/projetos-doutorado/Appliance-Energy-Prediction/data/household_power_consumption.csv'
data = pd.read_csv(filepath, sep = ";")
data = data.drop(labels=['Time','Date'], axis=1)
#data = data.loc[0: : 30] # 30 minutes
data.dropna(inplace = True)
data = clean_dataset(data)
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0


## Dataset Household Power Consumption, Lag = 1, t+1

In [17]:
data = series_to_supervised_miso(data, 1, 1,endog_var='Global_active_power')
data.head()

Unnamed: 0,Global_active_power(t-1),Global_reactive_power(t-1),Voltage(t-1),Global_intensity(t-1),Sub_metering_1(t-1),Sub_metering_2(t-1),Sub_metering_3(t-1),Global_active_power(t)
1,4.216,0.418,234.84,18.4,0.0,1.0,17.0,5.36
2,5.36,0.436,233.63,23.0,0.0,1.0,16.0,5.374
3,5.374,0.498,233.29,23.0,0.0,2.0,17.0,5.388
4,5.388,0.502,233.74,23.0,0.0,1.0,17.0,3.666
5,3.666,0.528,235.68,15.8,0.0,1.0,17.0,3.52


## Model: PcaSarimax - PCA-SARIMAX

### Define SARIMAX parameters 

In [8]:
pca_sarimax = PcaSarimax(n_components = 2,
                       endogen_variable = 'Global_active_power(t-1)',
                       order = [0, 0, 0],
                       seasonal_order = [0,0,0,0])

In [9]:
df_train = data.loc[:,'Global_active_power(t-1)':'Sub_metering_3(t-1)']
train = sample_first_prows(data, perc=0.30)

In [10]:
reduced = pca_sarimax.apply_pca(train)
reduced.head(2)

Unnamed: 0,C0,C1,Global_active_power(t-1)
0,0.651087,0.191734,4.216
1,0.633207,0.144267,2.72


In [11]:
exog = reduced.drop(labels=['Global_active_power(t-1)'], axis=1)
exog.head(2)

Unnamed: 0,C0,C1
0,0.651087,0.191734
1,0.633207,0.144267


In [12]:
endog = reduced['Global_active_power(t-1)']
endog.head(3)

0    4.216
1    2.720
2    3.452
Name: Global_active_power(t-1), dtype: float64

In [13]:
train_y = endog
train_X = exog

In [14]:
# Define arimax paraters using the embedded data 
# from pmdarima.arima import auto_arima
# sarimax_model = auto_arima(train_y,
#                        exogenous= train_X,
#                        test='adf',
#                        start_p=0,
#                        start_q=0,
#                        max_p=5,
#                        max_q=5,
#                        start_Q=0,
#                        start_P=0,
#                        max_P=5,
#                        max_Q=5,
#                        m=7,
#                        seasonal=True,
#                        d=None,
#                        D=1,
#                        max_D=5,
#                        trace=True,
#                        error_action='ignore',
#                        suppress_warnings=True,
#                        stepwise=True)
# #                        n_fits = 50)

In [15]:
#  ARIMA(0,0,0)(0,1,0)[7] intercept   : AIC=40774.955, Time=6.39 sec
#  ARIMA(1,0,0)(1,1,0)[7] intercept   : AIC=33919.774, Time=26.49 sec
#  ARIMA(0,0,1)(0,1,1)[7] intercept   : AIC=inf, Time=57.71 sec
#  ARIMA(0,0,0)(0,1,0)[7]             : AIC=40772.955, Time=11.11 sec
#  ARIMA(1,0,0)(0,1,0)[7] intercept   : AIC=39425.191, Time=7.92 sec
#  ARIMA(1,0,0)(2,1,0)[7] intercept   : AIC=32398.909, Time=46.83 sec
#  ARIMA(1,0,0)(3,1,0)[7] intercept   : AIC=31218.484, Time=64.64 sec
#  ARIMA(1,0,0)(4,1,0)[7] intercept   : AIC=30213.416, Time=96.16 sec
#  ARIMA(1,0,0)(5,1,0)[7] intercept   : AIC=29495.805, Time=161.14 sec

### Test the model with the best parameters 

In [16]:
# pca_arimax = PcaSarimax(n_components = 2,
#                        endogen_variable = 'Global_active_power(t-1)',
#                        order=[4,0,0],
#                        seasonal_order=[5,1,0,7])

In [26]:
# df_train = data.loc[:,'Global_active_power(t-1)':'Sub_metering_3(t-1)']
# df_test = data.loc[:,'Global_reactive_power(t-1)':'Global_active_power(t)']
# train = sample_first_prows(df_train,0.75)
# test = df_test.iloc[max(train.index):]

In [9]:
# model, sarimax, pca_reduced_train = pca_arimax.run_train_model(train)

In [10]:
# sarimax.summary()

In [11]:
# pca_reduced_train.head(2)

In [12]:
#ALtera o nome da columa porque o modelo utiliza esse label como variável endogena 
# test.rename(columns = {'use [kW](t)': 'use [kW](t-1)'}, inplace = True)
# start = len(train)
# end = len(train) + len(test) -1
# forecast, pca_reduced_test = pca_arimax.run_test_model(test, sarimax,start,end)

In [13]:
# forecast

In [14]:
#pca_reduced_test['Appliances(t-1)'].head(2)
# pca_reduced_test.head(5)

In [15]:
# print(Measures.rmse(pca_reduced_test['use [kW](t-1)'],forecast))
# print(Measures.mape(pca_reduced_test['use [kW](t-1)'],forecast))

## Model: PcaSarimax - PCA-SARIMAX - Final model 

In [41]:
# ARIMA(1,0,0)(5,1,0)[7] ==> 0.901978	0.519373	65.378872	23.312508 
# ARIMA(1,0,0)(1,1,0)[7] ==> 0.901978	0.519373	65.378872	23.312508
# ARIMA(1,0,1)(1,1,1)[7] ==> 0.901098	0.517399	64.944311	22.93125
# ARIMA(1,0,0)(1,1,1)[7] ==> 0.901256	0.517574	64.99313	22.962299
# ARIMA(1,0,1)(1,1,0)[7] ==> 0.901879	0.519276	65.342213	23.30976
# ARIMA(1,0,1)(2,1,0)[7]==> 0.901052	0.517548	64.931938	22.989602

In [18]:
pca_sarimax = PcaSarimax(n_components = 3,
                       endogen_variable = 'Global_active_power(t-1)',
                       order=[1,0,1],
                       seasonal_order=[2,1,1,7])

## Sliding Window PyFTS: PCA-SARIMAX

In [19]:
result = {
     "window": [],
     "rmse": [],
     "mae": [],
     "mape": [],
     "smape": [],
     "nrmse":[]
}

tam = len(data)
n_windows = 30
windows_length = math.floor(tam / n_windows)
for ct, ttrain, ttest in Util.sliding_window(data, windows_length, 0.75, inc=1):
    if len(ttest) > 0:

        data_train = ttrain.loc[:,'Global_active_power(t-1)':'Sub_metering_3(t-1)']
        data_test = ttest.loc[:,'Global_reactive_power(t-1)':'Global_active_power(t)']
        
        #ALtera o nome da columa porque o modelo utiliza esse label como variável endogena 
        data_test.rename(columns = {'Global_active_power(t)': 'Global_active_power(t-1)'}, inplace = True)
        
        start = len(data_train)
        end = len(data_train)+ len(data_test) -1
        
        print('-' * 20)
        print(f'training window {(ct)}')
        model, sarimax, pca_reduced_train = pca_sarimax.run_train_model(data_train)
        forecast, pca_reduced_test = pca_sarimax.run_test_model(data_test,sarimax,start,end)
        
        print("[{0: %H:%M:%S}]".format(datetime.datetime.now()) + f" getting statistics")
        rmse = Measures.rmse(pca_reduced_test['Global_active_power(t-1)'],forecast)
        mape = Measures.mape(pca_reduced_test['Global_active_power(t-1)'],forecast)
        smape = Measures.smape(pca_reduced_test['Global_active_power(t-1)'],forecast)
        
#         forecast = pd.DataFrame(forecast)
#         forecast.fillna(forecast.mean(),inplace=True)
#         forecast = np.array(forecast).reshape(-1)
        mae = mean_absolute_error(pca_reduced_test['Global_active_power(t-1)'], forecast)
    
        # Grafico de comparação Original x Previsto para cada janela
#         fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 3])
#         ax.plot(pca_reduced_test['Global_active_power(t-1)'], label='Original')
#         ax.plot(forecast, label='Forecast')
#         handles, labels = ax.get_legend_handles_labels()
#         lgd = ax.legend(handles, labels, loc=2, bbox_to_anchor=(1, 1))
#         plt.show()
        
        nrmse = cal_nrmse(rmse, pca_reduced_test['Global_active_power(t-1)'])
                
        result["rmse"].append(round(rmse,3))
        result["mae"].append(round(mae,3))
        result["mape"].append(round(mape,3))
        result["smape"].append(round(smape,3))
        result["nrmse"].append(round(nrmse,3))
        result["window"].append(ct)
        
measures = pd.DataFrame(result)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


--------------------
training window 0




[ 10:52:01] getting statistics
--------------------
training window 68309


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 10:57:55] getting statistics
--------------------
training window 136618


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:04:37] getting statistics
--------------------
training window 204927


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:11:25] getting statistics
--------------------
training window 273236


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:17:38] getting statistics
--------------------
training window 341545


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:23:49] getting statistics
--------------------
training window 409854


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:30:21] getting statistics
--------------------
training window 478163


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:35:09] getting statistics
--------------------
training window 546472


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:40:00] getting statistics
--------------------
training window 614781


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 11:44:48] getting statistics
--------------------
training window 683090


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:07:27] getting statistics
--------------------
training window 751399


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:13:05] getting statistics
--------------------
training window 819708


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:18:11] getting statistics
--------------------
training window 888017


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:23:23] getting statistics
--------------------
training window 956326


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:31:12] getting statistics
--------------------
training window 1024635


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:36:41] getting statistics
--------------------
training window 1092944


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:41:57] getting statistics
--------------------
training window 1161253


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:47:12] getting statistics
--------------------
training window 1229562


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:52:17] getting statistics
--------------------
training window 1297871


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 12:57:51] getting statistics
--------------------
training window 1366180


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:03:22] getting statistics
--------------------
training window 1434489


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:08:23] getting statistics
--------------------
training window 1502798


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:13:22] getting statistics
--------------------
training window 1571107


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:18:39] getting statistics
--------------------
training window 1639416


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:23:43] getting statistics
--------------------
training window 1707725


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:29:03] getting statistics
--------------------
training window 1776034


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:34:41] getting statistics
--------------------
training window 1844343


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:40:44] getting statistics
--------------------
training window 1912652


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:46:31] getting statistics
--------------------
training window 1980961


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


[ 13:53:13] getting statistics


In [20]:
measures

Unnamed: 0,window,rmse,mae,mape,smape,nrmse
0,0,0.394,0.172,16.528,6.998,0.05
1,68309,0.323,0.119,8.533,3.983,0.041
2,136618,0.327,0.117,14.122,5.955,0.043
3,204927,0.265,0.093,15.215,6.619,0.037
4,273236,0.198,0.065,11.172,5.297,0.027
5,341545,0.283,0.101,11.32,5.283,0.038
6,409854,0.294,0.108,8.339,3.857,0.032
7,478163,0.33,0.129,8.473,3.87,0.035
8,546472,0.256,0.095,8.453,3.936,0.038
9,614781,0.289,0.102,8.795,4.078,0.043


In [11]:
#measures.to_csv (r'win_pca_sarimax_hpc_30min.csv', index = False, header=True)

In [21]:
measures.to_csv (r'win_pca_sarimax_hpc_1min.csv', index = False, header=True)

In [22]:
### Mean Statistics PCA-SARIMAX

In [27]:
final_result = {
     "rmse": [],
     "mae": [],
     "mape": [],
     "smape": [],
     "nrmse":[],
     "rmse_std": [],
     "mae_std": [],
     "mape_std": [],
     "smape_std": [],
     "nrmse_std": []
}

final_result["rmse"].append(round(statistics.mean(measures['rmse']),3))
final_result["mape"].append(round(statistics.mean(measures['mape']),3))
final_result["smape"].append(round(statistics.mean(measures['smape']),3))
final_result["mae"].append(round(statistics.mean(measures['mae']),3))
final_result["nrmse"].append(round(statistics.mean(measures['nrmse']),3))

final_result["rmse_std"].append(round(statistics.stdev(measures['rmse']),3))
final_result["mape_std"].append(round(statistics.stdev(measures['mape']),3))
final_result["smape_std"].append(round(statistics.stdev(measures['smape']),3))
final_result["mae_std"].append(round(statistics.stdev(measures['mae']),3))
final_result["nrmse_std"].append(round(statistics.stdev(measures['nrmse']),3))
        
final_measures_pca = pd.DataFrame(final_result)

print("Mean Statistics PCA-SARIMAX (test): ")
final_measures_pca

Mean Statistics PCA-SARIMAX (test): 


Unnamed: 0,rmse,mae,mape,smape,nrmse,rmse_std,mae_std,mape_std,smape_std,nrmse_std
0,0.258,0.098,11.068,4.982,0.037,0.064,0.025,3.449,1.505,0.01


In [24]:
#final_measures_pca.to_csv (r'pca_sarimax_hpc_30mim.csv', index = False, header=True)

In [26]:
final_measures_pca.to_csv (r'pca_sarimax_hpc_1mim.csv', index = False, header=True)