In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.metrics import mean_squared_error
from math import sqrt

In [18]:
train = pd.read_csv("../data/schemas/warm_up/TrainSet.csv", sep=",")
validation = pd.read_csv("../data/schemas/warm_up/ValidationSet.csv", sep=",")
test = pd.read_csv("../data/schemas/warm_up/TestSet.csv", sep=",")
carbon_mlp = pd.read_csv("../data/results/carbon_mlp.csv", sep=",", header=None)
carbon_lstm = pd.read_csv("../data/results/carbon_lstm.csv", sep=",", header=None)
carbon_gru = pd.read_csv("../data/results/carbon_gru.csv", sep=",", header=None)
solar_mlp = pd.read_csv("../data/results/solar_mlp.csv", sep=",", header=None)
solar_lstm = pd.read_csv("../data/results/solar_lstm.csv", sep=",", header=None)
solar_gru = pd.read_csv("../data/results/solar_gru.csv", sep=",", header=None)

In [19]:
# Global variables

NUM_OF_TIMESTEPS_INPUT = 48
NUM_OF_TIMESTEPS_OUTPUT = 24

THRESHOLD = 0.4  # For feature selection
columns_to_predict = ["kg_CO2/kWh", "Avg solar generation"]

independent_variables = []

for column in train:
    if abs(train[column].corr(train[columns_to_predict[
        0]])) > THRESHOLD:  #or abs(train[column].corr(train[columns_to_predict[1]])) > THRESHOLD:
        independent_variables.append(column)

independent_variables = [var for var in independent_variables if var not in columns_to_predict]

if "Index" in independent_variables:
    independent_variables.remove("Index")
if "Solar Generation (W/kW)_1" in independent_variables:
    independent_variables.remove("Solar Generation (W/kW)_1")
if "Solar Generation (W/kW)_2" in independent_variables:
    independent_variables.remove("Solar Generation (W/kW)_2")
if "Solar Generation (W/kW)_3" in independent_variables:
    independent_variables.remove("Solar Generation (W/kW)_3")
if "Hour_2" in independent_variables:
    independent_variables.remove("Hour_2")
if "Hour_3" in independent_variables:
    independent_variables.remove("Hour_3")

for i in independent_variables:
    print(f"{i}, corr: {'%.3f' % abs(train[i].corr(train[columns_to_predict[0]]))}")


# Split the X and Y for all sets

# Train set
X_train_default = train[independent_variables]
Y_train_default = train[columns_to_predict]

# Validation set, also include the data from train that was used only as output to get more datapoints
X_val_default = pd.concat([X_train_default.tail(NUM_OF_TIMESTEPS_OUTPUT), validation[independent_variables]],
                          ignore_index=True)
Y_val_default = pd.concat([Y_train_default.tail(NUM_OF_TIMESTEPS_OUTPUT), validation[columns_to_predict]],
                          ignore_index=True)

# Test set, also include the data from train that was used only as output to get more datapoints
X_test_default = pd.concat([X_val_default.tail(NUM_OF_TIMESTEPS_OUTPUT), test[independent_variables]],
                           ignore_index=True)
Y_test_default = pd.concat([Y_val_default.tail(NUM_OF_TIMESTEPS_OUTPUT), test[columns_to_predict]], ignore_index=True)

NUM_OF_ROWS_TRAIN, NUM_OF_FEATURES = X_train_default.shape

print(X_train_default.shape)
print(X_val_default.shape)
print(X_test_default.shape)


# Function to prepare the data into batches that will be passed into the model

def create_sequences(input_data, output_data, timesteps_input, timesteps_output):
    sequences, targets = [], []
    for i in range(len(input_data) - timesteps_input - timesteps_output + 1):
        seq = input_data[i:i + timesteps_input]
        target = output_data[i + timesteps_input: i + timesteps_input + timesteps_output]
        sequences.append(seq)
        targets.append(target)

    return np.array(sequences), np.array(targets)


X_train, Y_train = create_sequences(X_train_default, Y_train_default, NUM_OF_TIMESTEPS_INPUT, NUM_OF_TIMESTEPS_OUTPUT)
X_val, Y_val = create_sequences(X_val_default, Y_val_default, NUM_OF_TIMESTEPS_INPUT, NUM_OF_TIMESTEPS_OUTPUT)
X_test, Y_test = create_sequences(X_test_default, Y_test_default, NUM_OF_TIMESTEPS_INPUT, NUM_OF_TIMESTEPS_OUTPUT)

print(f"X_train = {X_train.shape}, Y_train = {Y_train.shape}\n"
      f"X_val = {X_val.shape}, Y_val = {Y_val.shape}\n"
      f"X_test = {X_test.shape}, Y_test = {Y_test.shape}")

Hour_1, corr: 0.432
Occupant Count (people)_3, corr: 0.458
12h Outdoor Drybulb Temperature (C), corr: 0.605
24h Outdoor Drybulb Temperature (C), corr: 0.401
12h Outdoor Relative Humidity (%), corr: 0.525
6h Direct Solar Radiation (W/m2), corr: 0.532
12h Direct Solar Radiation (W/m2), corr: 0.415
(576, 7)
(96, 7)
(96, 7)
X_train = (505, 48, 7), Y_train = (505, 24, 2)
X_val = (25, 48, 7), Y_val = (25, 24, 2)
X_test = (25, 48, 7), Y_test = (25, 24, 2)


In [20]:
carbon_lstm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.322625,0.283766,0.261381,0.243292,0.240131,0.256516,0.289512,0.317089,0.356495,0.389055,...,0.518135,0.537164,0.545459,0.547887,0.561278,0.592277,0.611521,0.618488,0.588946,0.53417
1,0.238519,0.202941,0.19424,0.199984,0.223099,0.259735,0.316785,0.364941,0.419226,0.458326,...,0.534192,0.548919,0.564307,0.582493,0.600838,0.620133,0.60884,0.572113,0.511107,0.448314
2,0.178182,0.150409,0.14818,0.158964,0.188033,0.238968,0.316949,0.374696,0.434091,0.468899,...,0.525302,0.548098,0.576293,0.602475,0.619514,0.625151,0.587685,0.529091,0.457201,0.399275
3,0.136572,0.125149,0.135721,0.157006,0.194024,0.250439,0.331598,0.392885,0.455356,0.493568,...,0.515632,0.534751,0.565099,0.591332,0.601343,0.599173,0.559462,0.502129,0.441595,0.397045
4,0.151637,0.154228,0.175821,0.207623,0.254581,0.316213,0.389154,0.430791,0.470192,0.488937,...,0.493127,0.520971,0.560836,0.587448,0.588878,0.574784,0.531011,0.490313,0.471905,0.47995
5,0.134088,0.154953,0.195047,0.244055,0.304842,0.375709,0.444708,0.47783,0.506198,0.517845,...,0.538089,0.555551,0.570292,0.565229,0.54438,0.519605,0.487252,0.469959,0.483313,0.522411
6,0.173486,0.202228,0.243107,0.288117,0.341906,0.396623,0.451326,0.479276,0.501756,0.509452,...,0.523737,0.533021,0.531523,0.511831,0.485229,0.459925,0.439929,0.4335,0.450034,0.489062
7,0.163781,0.198779,0.247647,0.298702,0.359472,0.423058,0.478388,0.49539,0.509304,0.508302,...,0.510747,0.508485,0.482941,0.435237,0.384079,0.353417,0.355105,0.390441,0.450288,0.519194
8,0.201804,0.248477,0.305061,0.360474,0.419579,0.471522,0.505777,0.511326,0.512781,0.507688,...,0.496248,0.484738,0.459513,0.425553,0.401341,0.398026,0.426545,0.477275,0.543543,0.611395
9,0.257886,0.303544,0.352489,0.395507,0.438567,0.481604,0.506774,0.51342,0.518598,0.52438,...,0.489629,0.460023,0.423861,0.389726,0.384132,0.408113,0.467827,0.537062,0.59959,0.645028


In [21]:
rmse_mlp_carbon = []
rmse_lstm_carbon = []
rmse_gru_carbon = []
rmse_mlp_solar = []
rmse_lstm_solar = []
rmse_gru_solar = []

rmse = 0


for i in range(len(carbon_mlp)):
    current_rmse = sqrt(mean_squared_error(carbon_mlp[i], Y_test[i, :, 0]))
    rmse_mlp_carbon.append(current_rmse)
    rmse += current_rmse
     
rmse /= len(carbon_mlp)    
print(f"RMSE for carbon_mlp = {rmse}")

rmse = 0
for i in range(len(carbon_lstm)):
    current_rmse = sqrt(mean_squared_error(carbon_lstm[i], Y_test[i, :, 0]))
    rmse_lstm_carbon.append(current_rmse)
    rmse += current_rmse

rmse /= len(carbon_lstm)    
print(f"RMSE for carbon_lstm = {rmse}")

rmse = 0
for i in range(len(carbon_gru)):
    current_rmse = sqrt(mean_squared_error(carbon_gru[i], Y_test[i, :, 0]))
    rmse_gru_carbon.append(current_rmse)
    rmse += current_rmse

rmse /= len(carbon_gru)    
print(f"RMSE for carbon_gru = {rmse}")


rmse = 0
for i in range(len(solar_mlp)):
    current_rmse = sqrt(mean_squared_error(solar_mlp[i], Y_test[i, :, 1]))
    rmse_mlp_solar.append(current_rmse)
    rmse += current_rmse
     
rmse /= len(solar_mlp)    
print(f"RMSE for solar_mlp = {rmse}")

rmse = 0
for i in range(len(solar_lstm)):
    current_rmse = sqrt(mean_squared_error(solar_lstm[i], Y_test[i, :, 1]))
    rmse_lstm_solar.append(current_rmse)
    rmse += current_rmse

rmse /= len(solar_lstm)    
print(f"RMSE for solar_lstm = {rmse}")

rmse = 0
for i in range(len(solar_gru)):
    current_rmse = sqrt(mean_squared_error(solar_gru[i], Y_test[i, :, 1]))
    rmse_gru_solar.append(current_rmse)
    rmse += current_rmse

rmse /= len(solar_gru)    
print(f"RMSE for solar_gru = {rmse}")

RMSE for carbon_mlp = 0.13551121057457657
RMSE for carbon_lstm = 0.12928722844756912
RMSE for carbon_gru = 0.12489182755708626
RMSE for solar_mlp = 0.0571350922464098
RMSE for solar_lstm = 0.0594736639187891
RMSE for solar_gru = 0.040305307458519034


In [22]:
from scipy import stats

In [26]:
print(stats.shapiro(rmse_mlp_carbon))
print(stats.shapiro(rmse_lstm_carbon))
print(stats.shapiro(rmse_gru_carbon))
print(stats.shapiro(rmse_mlp_solar))
print(stats.shapiro(rmse_lstm_solar))
print(stats.shapiro(rmse_gru_solar))

ShapiroResult(statistic=0.8778755068778992, pvalue=0.007533067371696234)
ShapiroResult(statistic=0.8310264945030212, pvalue=0.0009912558598443866)
ShapiroResult(statistic=0.8453090190887451, pvalue=0.0017911724280565977)
ShapiroResult(statistic=0.8888959884643555, pvalue=0.012612108141183853)
ShapiroResult(statistic=0.9219475388526917, pvalue=0.06450442224740982)
ShapiroResult(statistic=0.9586084485054016, pvalue=0.4111086130142212)


In [24]:
print(stats.wilcoxon(rmse_mlp_carbon, rmse_lstm_carbon))
print(stats.wilcoxon(rmse_lstm_carbon, rmse_gru_carbon))
print(stats.wilcoxon(rmse_mlp_carbon, rmse_gru_carbon))

print(stats.wilcoxon(rmse_mlp_solar, rmse_lstm_solar))
print(stats.wilcoxon(rmse_lstm_solar, rmse_gru_solar))
print(stats.wilcoxon(rmse_mlp_solar, rmse_gru_solar))

WilcoxonResult(statistic=77.0, pvalue=0.036649227142333984)
WilcoxonResult(statistic=89.0, pvalue=0.08392000198364258)
WilcoxonResult(statistic=74.0, pvalue=0.029131293296813965)
WilcoxonResult(statistic=133.0, pvalue=0.6431422233581543)
WilcoxonResult(statistic=15.0, pvalue=1.633167266845703e-05)
WilcoxonResult(statistic=59.0, pvalue=0.007920503616333008)


In [25]:
print(stats.ttest_rel(rmse_mlp_carbon, rmse_lstm_carbon))
print(stats.ttest_rel(rmse_lstm_carbon, rmse_gru_carbon))
print(stats.ttest_rel(rmse_mlp_carbon, rmse_gru_carbon))

print(stats.ttest_rel(rmse_mlp_solar, rmse_lstm_solar))
print(stats.ttest_rel(rmse_lstm_solar, rmse_gru_solar))
print(stats.ttest_rel(rmse_mlp_solar, rmse_gru_solar))

TtestResult(statistic=2.4387133290976757, pvalue=0.022873259792795926, df=23)
TtestResult(statistic=1.9972082152728636, pvalue=0.05776706373655365, df=23)
TtestResult(statistic=2.7048786667607216, pvalue=0.012638223353874205, df=23)
TtestResult(statistic=-0.3380126363013858, pvalue=0.7384195264161315, df=23)
TtestResult(statistic=5.657976516086405, pvalue=9.262427648351478e-06, df=23)
TtestResult(statistic=3.2595719308877897, pvalue=0.0034496796457513996, df=23)
