In [1]:
import sys
sys.path.append("/home/hugo/projetos-doutorado/Appliance-Energy-Prediction")
import pandas as pd
from pca_fts.PcaWeightedMVFTS import PcaWeightedMVFTS
from pyFTS.benchmarks import Measures
import matplotlib.pyplot as plt
from pyFTS.common import Util
import datetime
import statistics
import math
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

## Auxiliary functions

In [2]:
def sample_first_prows(data, perc=0.75):
    """Return the leading ``perc`` fraction of the rows of ``data``.

    The number of rows kept is ``int(len(data) * perc)``, so a fractional
    row count is truncated.
    """
    n_rows = int(len(data) * perc)
    return data.head(n_rows)

In [3]:
# convert series to supervised learning
def series_to_supervised_miso(data, n_in, n_out, endog_var='Global_active_power', dropnan=True):
    """
    Frame a multivariate time series as a multiple-input / single-output
    supervised learning dataset.

    Arguments:
        data: Observations as a pandas DataFrame (or anything accepted by
            ``pd.DataFrame``) that contains a column labelled ``endog_var``.
        n_in: Number of lag observations of every column used as input (X).
        n_out: Number of observations of ``endog_var`` used as output (y).
        endog_var: Label of the column to forecast.
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame whose columns are ``<col>(t-n_in)`` ... ``<col>(t-1)``
        for every input column, followed by ``<endog_var>(t)``,
        ``<endog_var>(t+1)``, ... (``n_out`` target columns in total).
    """
    df = pd.DataFrame(data)
    cols, names = [], []
    # input sequence (t-n, ... t-1): every column shifted down by `lag` rows.
    # Labels are formatted rather than concatenated so that non-string column
    # labels (e.g. the integer labels of a NumPy array) work as well.
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend(f'{col}(t-{lag})' for col in df.columns)
    # forecast sequence (t, t+1, ... t+n): only the endogenous column
    for step in range(n_out):
        cols.append(df[endog_var].shift(-step))
        names.append(f'{endog_var}(t)' if step == 0 else f'{endog_var}(t+{step})')
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values (the shifts leave NaN at the series borders)
    if dropnan:
        agg = agg.dropna()
    return agg

In [4]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    Arguments:
        df: pandas DataFrame whose remaining values are convertible to float.
    Returns:
        A new DataFrame (dtype float64) containing only the rows of ``df`` in
        which every value is finite. ``df`` itself is left unmodified.
    Raises:
        AssertionError: if ``df`` is not a pandas DataFrame.
    """
    assert isinstance(df, pd.DataFrame)
    # Work on a new frame instead of ``dropna(inplace=True)`` so the caller's
    # object is not silently altered.
    without_nan = df.dropna()
    # ``axis`` is passed by keyword: newer pandas releases make the arguments
    # of ``DataFrame.any`` keyword-only, so the positional ``any(1)`` fails.
    has_invalid = without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_invalid].astype(np.float64)

## Dataset Household

In [5]:
# Load the ';'-separated household power consumption file, discard the
# 'Time' and 'Date' columns, keep every 30th row (30 minutes, per the original
# note) and remove incomplete or non-finite rows.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory.
filepath = '/home/hugo/projetos-doutorado/Appliance-Energy-Prediction/data/household_power_consumption.csv'
data = (
    pd.read_csv(filepath, sep=";")
    .drop(columns=['Time', 'Date'])
    .loc[0::30]
    .dropna()
)
data = clean_dataset(data)
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0
30,2.72,0.0,235.06,11.6,0.0,0.0,17.0
60,3.452,0.0,235.2,15.2,0.0,1.0,17.0
90,4.298,0.0,232.39,18.4,0.0,1.0,16.0
120,3.262,0.052,232.64,14.0,0.0,0.0,17.0


In [6]:
# Coerce every measurement column to a numeric dtype; entries that cannot be
# parsed become NaN (errors='coerce').
numeric_columns = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68308 entries, 0 to 2075250
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Global_active_power    68308 non-null  float64
 1   Global_reactive_power  68308 non-null  float64
 2   Voltage                68308 non-null  float64
 3   Global_intensity       68308 non-null  float64
 4   Sub_metering_1         68308 non-null  float64
 5   Sub_metering_2         68308 non-null  float64
 6   Sub_metering_3         68308 non-null  float64
dtypes: float64(7)
memory usage: 4.2 MB


In [7]:
# Summary statistics of every column, rounded to three decimals.
data.describe().round(3)

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,68308.0,68308.0,68308.0,68308.0,68308.0,68308.0,68308.0
mean,1.092,0.124,240.853,4.629,1.132,1.293,6.459
std,1.059,0.113,3.219,4.451,6.211,5.785,8.428
min,0.078,0.0,225.32,0.2,0.0,0.0,0.0
25%,0.308,0.048,239.05,1.4,0.0,0.0,0.0
50%,0.6,0.1,241.03,2.6,0.0,0.0,1.0
75%,1.53,0.194,242.87,6.4,0.0,1.0,17.0
max,10.29,1.094,253.36,44.6,87.0,79.0,31.0


### Dataset Household: Preprocessing, Lag = 1, t+1

In [8]:
# frame as supervised learning
# frame as supervised learning: one lag of every column as input and
# 'Global_active_power' at time t as the single target.
# NOTE(review): this rebinds `data`, so the cell is not idempotent -- running
# it a second time without reloading would look up 'Global_active_power' among
# the already renamed '...(t-1)' / '...(t)' columns and fail.
data = series_to_supervised_miso(data, 1, 1,endog_var='Global_active_power')
data.head()

Unnamed: 0,Global_active_power(t-1),Global_reactive_power(t-1),Voltage(t-1),Global_intensity(t-1),Sub_metering_1(t-1),Sub_metering_2(t-1),Sub_metering_3(t-1),Global_active_power(t)
30,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2.72
60,2.72,0.0,235.06,11.6,0.0,0.0,17.0,3.452
90,3.452,0.0,235.2,15.2,0.0,1.0,17.0,4.298
120,4.298,0.0,232.39,18.4,0.0,1.0,16.0,3.262
150,3.262,0.052,232.64,14.0,0.0,0.0,17.0,3.214


In [26]:
# NOTE(review): execution count In [26] is out of sequence with the
# neighbouring cells; this appears to be a later ad-hoc inspection of the
# framed dataset.
data

Unnamed: 0,Global_active_power(t-1),Global_reactive_power(t-1),Voltage(t-1),Global_intensity(t-1),Sub_metering_1(t-1),Sub_metering_2(t-1),Sub_metering_3(t-1),Global_active_power(t)
30,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2.720
60,2.720,0.000,235.06,11.6,0.0,0.0,17.0,3.452
90,3.452,0.000,235.20,15.2,0.0,1.0,17.0,4.298
120,4.298,0.000,232.39,18.4,0.0,1.0,16.0,3.262
150,3.262,0.052,232.64,14.0,0.0,0.0,17.0,3.214
...,...,...,...,...,...,...,...,...
2075130,1.716,0.128,238.42,7.2,0.0,0.0,0.0,1.556
2075160,1.556,0.064,237.56,6.6,0.0,0.0,0.0,1.120
2075190,1.120,0.082,238.91,4.6,0.0,0.0,0.0,1.468
2075220,1.468,0.134,238.44,6.2,0.0,0.0,0.0,1.082


## Model: PcaWeightedMVFTS - PCA-MVFTS

In [9]:
# Stand-alone model instance with 2 PCA components and n_part = 50.
# NOTE(review): sliding_window_pca below builds its own model for every
# window, so this instance does not take part in the grid search.
pca_wmvfts = PcaWeightedMVFTS(
    n_components=2,
    endogen_variable='Global_active_power(t-1)',
    n_part=50,
)

In [10]:
def sliding_window_pca(data,n_windows,train_size,dim,fs):
    """
    Evaluate PcaWeightedMVFTS over consecutive sliding windows of ``data``.

    Arguments:
        data: Supervised-learning frame; it must contain the column
            'Global_active_power(t-1)', which is used as the model's
            endogenous variable.
        n_windows: Number of windows the series is divided into; each window
            spans ``floor(len(data) / n_windows)`` rows.
        train_size: Forwarded to ``Util.sliding_window`` as the training
            fraction of each window.
        dim: Number of PCA components (``n_components`` of the model).
        fs: Value passed to the model as ``n_part``.
    Returns:
        Pandas DataFrame with one row per evaluated window and the columns
        window, rmse, mae, mape and smape (errors rounded to 3 decimals).
    Raises:
        ValueError: if ``n_windows`` is smaller than 1.
    """
    if n_windows < 1:
        raise ValueError("n_windows must be a positive integer")

    result = {
         "window": [],
         "rmse": [],
         "mae": [],
         "mape": [],
         "smape": []
    }

    endogenous = 'Global_active_power(t-1)'

    # Honour the caller's n_windows / train_size instead of overriding them
    # with hard-coded values.
    windows_length = math.floor(len(data) / n_windows)
    for ct, ttrain, ttest in Util.sliding_window(data, windows_length, train_size, inc=1):
        if len(ttest) == 0:
            continue

        # A fresh model is built for every window.
        window_model = PcaWeightedMVFTS(n_components = dim,
                   endogen_variable = endogenous,
                   n_part = fs)

        model, _ = window_model.run_train_model(ttrain)
        forecast, _, pca_reduced_test = window_model.run_test_model(model,ttest)

        # Test and computation of the forecast errors.
        # Per the original author's note, the '(t-1)' label is used as the
        # reference here because the model works with that label.
        # NOTE(review): confirm against PcaWeightedMVFTS.run_test_model that
        # this column is aligned with the forecast horizon.
        actual = pca_reduced_test[endogenous]
        rmse = Measures.rmse(actual,forecast)
        mape = Measures.mape(actual,forecast)
        smape = Measures.smape(actual,forecast)

        # Missing forecasts are replaced by the mean forecast before the MAE
        # is computed (the three measures above use the unfilled forecast).
        forecast = pd.DataFrame(forecast)
        forecast.fillna(forecast.mean(),inplace=True)
        forecast = np.array(forecast).reshape(-1)
        mae = mean_absolute_error(actual, forecast)

        result["rmse"].append(round(rmse,3))
        result["mape"].append(round(mape,3))
        result["smape"].append(round(smape,3))
        result["mae"].append(round(mae,3))
        result["window"].append(ct)

    measures = pd.DataFrame(result)
    return measures



In [11]:
# Hyper-parameter grid: number of PCA components x number of fuzzy sets.
dimensions = [2,3,4]
fuzzy_sets = [10,20,30,40,50]

# Sliding-window configuration shared by every grid point.
n_windows = 30
train_size = 0.75

final_result = {
    key: []
    for key in ("dimensions", "fuzzy_sets", "rmse", "mae", "mape", "smape")
}

for dim in dimensions:
    for fs in fuzzy_sets:
        measures = sliding_window_pca(data,n_windows,train_size,dim,fs)

        # Average each per-window error over all evaluated windows.
        rmse, mape, smape, mae = (
            round(statistics.mean(measures[metric]),3)
            for metric in ("rmse", "mape", "smape", "mae")
        )

        # The tuple follows the key order of final_result.
        for key, value in zip(final_result, (dim, fs, rmse, mae, mape, smape)):
            final_result[key].append(value)

        print(f'Results: {(dim,fs,rmse,mae,mape,smape)}')


final_measures_pca = pd.DataFrame(final_result)

print("Statistics PCA-WMVFTS (test): ")
final_measures_pca

Results: (2, 10, 0.549, 0.418, 88.727, 24.827)
Results: (2, 20, 0.511, 0.355, 56.347, 17.858)
Results: (2, 30, 0.469, 0.297, 51.887, 15.694)
Results: (2, 40, 0.41, 0.245, 43.124, 13.525)
Results: (2, 50, 0.366, 0.204, 37.303, 11.529)
Results: (3, 10, 0.517, 0.374, 78.947, 22.804)
Results: (3, 20, 0.368, 0.24, 42.782, 13.813)
Results: (3, 30, 0.276, 0.155, 31.755, 10.163)
Results: (3, 40, 0.204, 0.108, 22.335, 7.564)
Results: (3, 50, 0.162, 0.079, 17.528, 5.807)
Results: (4, 10, 0.421, 0.309, 70.581, 20.532)
Results: (4, 20, 0.236, 0.147, 28.93, 9.689)
Results: (4, 30, 0.138, 0.071, 15.501, 5.438)
Results: (4, 40, 0.084, 0.039, 8.63, 3.147)
Results: (4, 50, 0.061, 0.023, 5.24, 1.941)
Statistics PCA-WMVFTS (test): 


Unnamed: 0,dimensions,fuzzy_sets,rmse,mae,mape,smape
0,2,10,0.549,0.418,88.727,24.827
1,2,20,0.511,0.355,56.347,17.858
2,2,30,0.469,0.297,51.887,15.694
3,2,40,0.41,0.245,43.124,13.525
4,2,50,0.366,0.204,37.303,11.529
5,3,10,0.517,0.374,78.947,22.804
6,3,20,0.368,0.24,42.782,13.813
7,3,30,0.276,0.155,31.755,10.163
8,3,40,0.204,0.108,22.335,7.564
9,3,50,0.162,0.079,17.528,5.807


In [13]:
# NOTE(review): executed as In [13], i.e. after the CSV export cell (In [12])
# that follows it in the file; redisplays the grid-search summary table.
final_measures_pca

Unnamed: 0,dimensions,fuzzy_sets,rmse,mae,mape,smape
0,2,10,0.549,0.418,88.727,24.827
1,2,20,0.511,0.355,56.347,17.858
2,2,30,0.469,0.297,51.887,15.694
3,2,40,0.41,0.245,43.124,13.525
4,2,50,0.366,0.204,37.303,11.529
5,3,10,0.517,0.374,78.947,22.804
6,3,20,0.368,0.24,42.782,13.813
7,3,30,0.276,0.155,31.755,10.163
8,3,40,0.204,0.108,22.335,7.564
9,3,50,0.162,0.079,17.528,5.807


In [12]:
# Write the grid-search summary to a CSV file in the current working
# directory, with a header row and without the index column.
final_measures_pca.to_csv(
    r'pca_wmvfts_dim_fsets_hpc.csv',
    header=True,
    index=False,
)