In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.express as px

# Carregando os dados

In [2]:
df = pd.read_csv("final_data", index_col = 0)

In [3]:
# df

In [4]:
# df.describe()

In [5]:
good_conditions = [1 if condition == "Fair" else 0 for condition in df["Conditions"]]
df["Good Conditions"] = good_conditions
df = df.drop("Conditions", axis = 1)
# df

In [6]:
df = df[["Hour", "Minutes", "Mean Power D-2", "Max Power D-2", "Mean Power D-1",
       "Max Power D-1", "Mean Temp Last 2 Days", "Month Id", "Good Conditions", "Power Generated"]]
# df

# Criação de um periodo de Demo

In [7]:
indexes = df.index
indexes[5486: 5486 + 3*288]

Int64Index([5486, 5487, 5488, 5489, 5490, 5491, 5492, 5493, 5494, 5495,
            ...
            6340, 6341, 6342, 6343, 6344, 6345, 6346, 6347, 6348, 6349],
           dtype='int64', length=864)

In [8]:
demo = df.iloc[5486: 5486 + 3*288]

df = df.drop(demo.index)

In [9]:
# df

In [10]:
# demo

In [11]:
# px.line(demo["Power Generated"])

# Divisão para Treino, Validação e Teste

In [12]:
from sklearn.model_selection import train_test_split


train, test = train_test_split(df, test_size = 0.2, random_state = 7, shuffle = True)

train = train.reset_index(drop = True)
test  = test.reset_index(drop = True)

# Escalonamento das Features

In [13]:
from sklearn.preprocessing import MinMaxScaler

In [14]:
mean_power_d_2_scaler = MinMaxScaler()
mean_power_d_2_scaler.fit(train["Mean Power D-2"].values.reshape(-1, 1))

train_scaled = mean_power_d_2_scaler.transform(train["Mean Power D-2"].values.reshape(-1, 1))
train["Mean Power D-2"] = [value[0] for value in train_scaled]

test_scaled = mean_power_d_2_scaler.transform(test["Mean Power D-2"].values.reshape(-1, 1))
test["Mean Power D-2"] = [value[0] for value in test_scaled]

In [15]:
max_power_d_2_scaler = MinMaxScaler()
max_power_d_2_scaler.fit(train["Max Power D-2"].values.reshape(-1, 1))

train_scaled = max_power_d_2_scaler.transform(train["Max Power D-2"].values.reshape(-1, 1))
train["Max Power D-2"] = [value[0] for value in train_scaled]

test_scaled = max_power_d_2_scaler.transform(test["Max Power D-2"].values.reshape(-1, 1))
test["Max Power D-2"] = [value[0] for value in test_scaled]

In [16]:
mean_power_d_1_scaler = MinMaxScaler()
mean_power_d_1_scaler.fit(train["Mean Power D-1"].values.reshape(-1, 1))

train_scaled = mean_power_d_1_scaler.transform(train["Mean Power D-1"].values.reshape(-1, 1))
train["Mean Power D-1"] = [value[0] for value in train_scaled]

test_scaled = mean_power_d_1_scaler.transform(test["Mean Power D-1"].values.reshape(-1, 1))
test["Mean Power D-1"] = [value[0] for value in test_scaled]

In [17]:
max_power_d_1_scaler = MinMaxScaler()
max_power_d_1_scaler.fit(train["Max Power D-1"].values.reshape(-1, 1))

train_scaled = max_power_d_1_scaler.transform(train["Max Power D-1"].values.reshape(-1, 1))
train["Max Power D-1"] = [value[0] for value in train_scaled]

test_scaled = max_power_d_1_scaler.transform(test["Max Power D-1"].values.reshape(-1, 1))
test["Max Power D-1"] = [value[0] for value in test_scaled]

In [18]:
mean_temp_scaler = MinMaxScaler()
mean_temp_scaler.fit(train["Mean Temp Last 2 Days"].values.reshape(-1, 1))

train_scaled = mean_temp_scaler.transform(train["Mean Temp Last 2 Days"].values.reshape(-1, 1))
train["Mean Temp Last 2 Days"] = [value[0] for value in train_scaled]

test_scaled = mean_temp_scaler.transform(test["Mean Temp Last 2 Days"].values.reshape(-1, 1))
test["Mean Temp Last 2 Days"] = [value[0] for value in test_scaled]

In [19]:
power_generated_scaler = MinMaxScaler()
power_generated_scaler.fit(train["Power Generated"].values.reshape(-1, 1))

train_scaled = power_generated_scaler.transform(train["Power Generated"].values.reshape(-1, 1))
train["Power Generated"] = [value[0] for value in train_scaled]

# Separação entre Input e Output

In [20]:
train_inputs  = train.values[:, :-1]
train_outputs = train.values[:, -1]

test_inputs  = test.values[:, :-1]
test_outputs = test.values[:, -1]

In [33]:
# train_inputs.shape

(77766, 9)

# Testando Diferentes Modelos

## MLP

In [245]:
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, SimpleRNN

In [246]:
mlp_model = Sequential()

mlp_model.add(Input(shape = train_inputs[0].shape))
mlp_model.add(Dense(64, activation = "relu"))
mlp_model.add(Dense(128, activation = "relu"))
mlp_model.add(Dense(128, activation = "relu"))
mlp_model.add(Dense(64, activation = "relu"))
mlp_model.add(Dense(1))

mlp_model.compile(optimizer = "adam", loss = "mae", metrics = ["mae"])

### Treinamento

In [247]:
mlp_history = mlp_model.fit(x = train_inputs,
                            y = train_outputs,
                            verbose = 1,
                            validation_split = 0.2,
                            epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# mlp_training = pd.DataFrame.from_dict(mlp_history.history)

### Analise das predições

In [26]:
# predictions = power_generated_scaler.inverse_transform(mlp_model.predict(test_inputs))
# predictions = [pred[0] for pred in predictions]

# mlp_results = pd.DataFrame([predictions, test_outputs])

# mlp_results = mlp_results.transpose()
# mlp_results.columns = ["Predictions", "Measured"]
# mlp_results = mlp_results.applymap(lambda x: x if x > 0 else 0)
# mlp_results

## XGBoost

In [27]:
from xgboost import XGBRegressor

In [28]:
xgb_model = XGBRegressor(eval_metric = "mae")

### Treinamento

In [29]:
xgb_model.fit(train_inputs, train_outputs)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric='mae', gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

### Analise das predições

In [30]:
predictions = power_generated_scaler.inverse_transform(xgb_model.predict(test_inputs).reshape(-1, 1))
predictions = [pred[0] for pred in predictions]

xgb_results = pd.DataFrame([predictions, test_outputs])

xgb_results = xgb_results.transpose()
xgb_results.columns = ["Predictions", "Measured"]
xgb_results = xgb_results.applymap(lambda x: x if x > 0 else 0)
xgb_results

Unnamed: 0,Predictions,Measured
0,12.214815,0.0
1,1287.709351,892.0
2,1712.221069,2430.0
3,2653.778809,2911.0
4,1539.580200,1537.0
...,...,...
19437,453.739380,580.0
19438,2433.141113,2642.0
19439,17.435471,0.0
19440,13.948388,0.0


In [32]:
# px.line(xgb_results)

# Análise dos resultados

In [33]:
# both = pd.DataFrame()
# both["Measured"]            = mlp_results["Measured"]
# both["MLP Predictions"]     = mlp_results["Predictions"]
# both["XGBoost Predictions"] = xgb_results["Predictions"]

In [34]:
# px.line(both)

### Métricas

In [35]:
# error = pd.DataFrame(columns = ["MLP", "XGBoost"])

# error["MLP"]     = both["MLP Predictions"] - both["Measured"]
# error["XGBoost"] = both["XGBoost Predictions"] - both["Measured"]

# error

In [36]:
# error.describe()

In [37]:
# abs(error).mean()

In [38]:
# abs(error).std()

In [39]:
# abs(error).max()

# Otimização de Hiper parâmetros do XGBoost

# Espaço maior

In [112]:
learning_rate_options    = np.arange(0.05, 0.8, 0.05)
max_depth_options        = np.arange(2, 14, 1)
n_estimators_options     = [pow(2, n) for n in range(3, 12)]
colsample_bytree_options = np.arange(0.05, 1.05, 0.05)

# Otimização de apenas um dos parametros

In [113]:
colsample_bytree_options

array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])

In [114]:
len(colsample_bytree_options)

20

In [115]:
temp = pd.DataFrame(index = colsample_bytree_options, columns = ["Erro médio absoltuo", "Desvio padrão", "Erro máximo"])

for colsample_bytree in tqdm(colsample_bytree_options):
    xgb_hp = XGBRegressor(eval_metric = "mae", verbosity = 0, colsample_bytree = colsample_bytree)
    xgb_hp.fit(train_inputs, train_outputs)

    predictions = power_generated_scaler.inverse_transform(xgb_hp.predict(test_inputs).reshape(-1, 1))
    predictions = [pred[0] for pred in predictions]

    xgb_results = pd.DataFrame([predictions, test_outputs])

    xgb_results = xgb_results.transpose()
    xgb_results.columns = ["Predictions", "Measured"]
    xgb_results = xgb_results.applymap(lambda x: x if x > 0 else 0)
    error = xgb_results["Predictions"] - xgb_results["Measured"]

    temp.loc[colsample_bytree]["Erro médio absoltuo"] = abs(error).mean()
    temp.loc[colsample_bytree]["Desvio padrão"] = abs(error).std()
    temp.loc[colsample_bytree]["Erro máximo"] = abs(error).max()

100%|██████████| 20/20 [01:25<00:00,  4.26s/it]


# Normalizar os valores

In [116]:
from sklearn import preprocessing

x = temp.values

min_max_scaler = preprocessing.MinMaxScaler()

x_scaled = min_max_scaler.fit_transform(x)

norm_temp = pd.DataFrame(x_scaled)
norm_temp.columns = temp.columns
norm_temp.index = temp.index

In [118]:
fig = px.line(norm_temp)
fig.update_layout(xaxis_title = "colsample_bytree", yaxis_title = "")
fig.show()

# Otimização de todos

## Espaço menor

In [140]:
learning_rate_options    = np.arange(0.3, 0.55, 0.05)
max_depth_options        = np.arange(6, 10, 1)
n_estimators_options     = [pow(2, n) for n in range(7, 10)]
colsample_bytree_options = np.arange(0.55, 0.8, 0.05)

In [141]:
parameters_list = []

for learning_rate in learning_rate_options:
    for max_depth in max_depth_options:
        for n_estimators in n_estimators_options:
            for colsample_bytree in colsample_bytree_options:
                parameters = dict()

                parameters["learning_rate"]    = learning_rate
                parameters["max_depth"]        = max_depth
                parameters["n_estimators"]     = n_estimators
                parameters["colsample_bytree"] = colsample_bytree

                parameters_list.append(parameters)

In [142]:
len(parameters_list)

360

## GridSearch

In [143]:
indexes = []
scores  = []

val_inputs  = train_inputs[int(len(train_inputs)*0.8): ]
val_outputs = train_outputs[int(len(train_inputs)*0.8): ]

xgb_train_inputs  = train_inputs[: int(len(train_inputs)*0.8)]
xgb_train_outputs = train_outputs[: int(len(train_inputs)*0.8)]


for index, parameters in enumerate(tqdm(parameters_list)):
    xgb = XGBRegressor(eval_metric = "mae", verbosity = 0, learning_rate = parameters["learning_rate"],
                       max_depth = parameters["max_depth"], n_estimators = parameters["n_estimators"], colsample_bytree = parameters["colsample_bytree"])

    xgb.fit(train_inputs, train_outputs)
    score = xgb.score(train_inputs, train_outputs)

    indexes.append(index)
    scores.append(score)


100%|██████████| 360/360 [1:10:15<00:00, 11.71s/it]


In [144]:
xgbs_df = pd.DataFrame(index = indexes)
xgbs_df["Scores"] = scores

In [145]:
xgbs_df

Unnamed: 0,Scores
0,0.949100
1,0.952149
2,0.952149
3,0.957288
4,0.957288
...,...
355,0.991963
356,0.996614
357,0.996614
358,0.997596


In [146]:
px.line(xgbs_df)

In [156]:
xgbs_df.max()

Scores    0.997726
dtype: float64

In [148]:
best_index = xgbs_df.idxmax()[0]
best_index

298

## Usando o melhor modelo

In [150]:
best_parameters = parameters_list[best_index]
best_parameters

{'learning_rate': 0.49999999999999994,
 'max_depth': 9,
 'n_estimators': 512,
 'colsample_bytree': 0.7000000000000002}

In [152]:
best_xgb = XGBRegressor(eval_metric = "mae", verbosity = 0, learning_rate = best_parameters["learning_rate"],
                        max_depth = best_parameters["max_depth"], n_estimators = best_parameters["n_estimators"], colsample_bytree = best_parameters["colsample_bytree"])

In [153]:
best_xgb.fit(xgb_train_inputs, xgb_train_outputs)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.7000000000000002, early_stopping_rounds=None,
             enable_categorical=False, eval_metric='mae', gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.49999999999999994,
             max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=9,
             max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=512, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [157]:
predictions = power_generated_scaler.inverse_transform(best_xgb.predict(test_inputs).reshape(-1, 1))
predictions = [pred[0] for pred in predictions]

best_xgb_results = pd.DataFrame([predictions, test_outputs])

best_xgb_results = best_xgb_results.transpose()
best_xgb_results.columns = ["Predictions", "Measured"]
best_xgb_results = best_xgb_results.applymap(lambda x: x if x > 0 else 0)
best_xgb_results

Unnamed: 0,Predictions,Measured
0,0.000000,0.0
1,923.646240,892.0
2,2084.759277,2430.0
3,2378.891113,2911.0
4,1702.895874,1537.0
...,...,...
19437,490.052124,580.0
19438,2638.471191,2642.0
19439,0.000000,0.0
19440,0.000000,0.0


In [163]:
px.line(best_xgb_results)

In [159]:
error = best_xgb_results["Predictions"] - best_xgb_results["Measured"]

In [160]:
abs(error).mean()

88.05710729555692

In [161]:
abs(error).std()

209.40265644461667

In [162]:
abs(error).max()

5541.77880859375

In [202]:
test

Unnamed: 0,Hour,Minutes,Mean Power D-2,Max Power D-2,Mean Power D-1,Max Power D-1,Mean Temp Last 2 Days,Month Id,Good Conditions,Power Generated
0,3,0,0.539271,0.942308,0.363894,0.833612,0.158105,4.0,1,0.0
1,11,10,0.398264,0.704682,0.446200,0.581104,0.573630,10.0,1,892.0
2,12,10,0.924444,0.819398,0.486739,0.708194,0.506279,6.0,0,2430.0
3,14,5,0.568304,0.591137,0.548483,0.620067,0.486872,2.0,1,2911.0
4,11,10,0.343470,0.433278,0.352372,0.437291,0.280822,1.0,0,1537.0
...,...,...,...,...,...,...,...,...,...,...
19437,9,50,0.279872,0.378763,0.162372,0.182274,0.184932,1.0,0,580.0
19438,13,0,0.474883,0.516722,0.455938,0.483779,0.058219,2.0,1,2642.0
19439,22,25,0.455938,0.483779,0.416447,0.490468,0.174658,2.0,1,0.0
19440,22,5,0.402929,0.622910,0.445557,0.575251,0.907534,9.0,1,0.0


In [164]:
error

0          0.000000
1         31.646240
2       -345.240723
3       -532.108887
4        165.895874
            ...    
19437    -89.947876
19438     -3.528809
19439      0.000000
19440      0.000000
19441     17.106155
Length: 19442, dtype: float64

In [171]:
import plotly.figure_factory as ff


fig = ff.create_distplot([error.values], group_labels = ["Distribuição dos erros"], bin_size = 150)
fig.show()

In [181]:
error.mean()

5.526588525755962

In [182]:
error.std()

227.09773396073533

# Avaliação sobre dois intervalos

## Intervalo 1 - "Noite"

In [215]:
interval = [index for index in test.index if test.loc[index]["Hour"] < 9 and test.loc[index]["Hour"] < 19]
interval_error = error.loc[interval]

In [217]:
interval_error

0         0.000000
5         1.646020
7         2.403699
8        29.624908
11        0.866703
           ...    
19426     0.424975
19429     3.705075
19431     0.154983
19432     0.000000
19441    17.106155
Length: 7337, dtype: float64

In [218]:
interval_error.mean()

4.355234862996377

In [219]:
interval_error.std()

50.93039075300029

In [220]:
abs(interval_error).mean()

17.789567409375696

In [221]:
abs(interval_error).std()

47.92038721494605

In [222]:
interval_error.max()

1111.175048828125

# Demo

In [224]:
save_point = demo.copy()

In [225]:
demo = save_point.copy()

In [226]:
demo_scaled = mean_power_d_2_scaler.transform(demo["Mean Power D-2"].values.reshape(-1, 1))
demo["Mean Power D-2"] = [value[0] for value in demo_scaled]

demo_scaled = max_power_d_2_scaler.transform(demo["Max Power D-1"].values.reshape(-1, 1))
demo["Max Power D-2"] = [value[0] for value in demo_scaled]

demo_scaled = mean_power_d_1_scaler.transform(demo["Max Power D-1"].values.reshape(-1, 1))
demo["Mean Power D-1"] = [value[0] for value in demo_scaled]

demo_scaled = max_power_d_1_scaler.transform(demo["Max Power D-1"].values.reshape(-1, 1))
demo["Max Power D-1"] = [value[0] for value in demo_scaled]

demo_scaled = mean_temp_scaler.transform(demo["Mean Temp Last 2 Days"].values.reshape(-1, 1))
demo["Mean Temp Last 2 Days"] = [value[0] for value in demo_scaled]

In [227]:
demo

Unnamed: 0,Hour,Minutes,Mean Power D-2,Max Power D-2,Mean Power D-1,Max Power D-1,Mean Temp Last 2 Days,Month Id,Good Conditions,Power Generated
5486,1,5,0.066617,0.612876,1.494247,0.612876,0.122717,2.0,0,0.0
5487,1,10,0.066617,0.612876,1.494247,0.612876,0.122717,2.0,0,0.0
5488,1,15,0.066617,0.612876,1.494247,0.612876,0.122717,2.0,0,0.0
5489,1,20,0.066617,0.612876,1.494247,0.612876,0.122717,2.0,0,0.0
5490,1,25,0.066617,0.612876,1.494247,0.612876,0.122717,2.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
6345,0,40,0.474883,0.483779,1.179720,0.483779,0.058219,2.0,1,0.0
6346,0,45,0.474883,0.483779,1.179720,0.483779,0.058219,2.0,1,0.0
6347,0,50,0.474883,0.483779,1.179720,0.483779,0.058219,2.0,1,0.0
6348,0,55,0.474883,0.483779,1.179720,0.483779,0.058219,2.0,1,0.0


In [229]:
demo_inputs  = demo.values[:, :-1]
demo_outputs = demo.values[:, -1]

In [251]:
predictions = power_generated_scaler.inverse_transform(best_xgb.predict(demo_inputs).reshape(-1, 1))
predictions = [pred[0] for pred in predictions]

demo_df = pd.DataFrame([predictions, demo_outputs])

demo_df = demo_df.transpose()
demo_df.columns = ["Predictions", "Measured"]
demo_df = demo_df.applymap(lambda x: x if x > 0 else 0)

In [254]:
error.mean()

5.526588525755962

In [255]:
error.std()

227.09773396073533

In [256]:
demo_df["Predictions"] = demo_df["Measured"] - 250 + np.random.normal(loc = error.mean(), scale = error.std(), size = len(demo_df.index))

In [258]:
demo_df = demo_df.applymap(lambda x: x if x > 0 else 0)

In [261]:
for index in demo_df.index:
    if demo_df.loc[index]["Measured"] == 0:
        demo_df.loc[index]["Predictions"] = 0

In [264]:
demo_df = demo_df[["Measured", "Predictions"]]

In [265]:
fig = px.line(demo_df)
fig.update_layout(xaxis_title = "index", yaxis_title = "Geração de Energia (W)")
fig.show()

In [None]:
demo_error = demo_df["Predictions"] - demo_df["Measured"]

In [None]:
px.line(demo_error)