In [27]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np


In [28]:
# Carrega o dataset completo com preços, geração, load, e clima
df = pd.read_csv("dataset.csv", parse_dates=['time'])
df.set_index('time', inplace=True)

# Verifica se a coluna de preço está presente
print(df.columns[df.columns.str.contains("price", case=False)])


Index(['price day ahead', 'price actual'], dtype='object')


In [29]:
def add_lag_and_time_features(df):
    df = df.copy()
    df['lag_1h'] = df['price actual'].shift(1)
    df['lag_24h'] = df['price actual'].shift(24)
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    return df.dropna()

df_prepared = add_lag_and_time_features(df)


In [30]:
def create_fixed_yearly_splits(df):
    """
    Cria 3 janelas fixas com treino e teste de 1 ano cada.
    """
    df = df.copy()
    df = df.sort_index()

    splits = []

    ranges = [
        ('2015-01-01', '2015-12-31', '2016-01-01', '2016-12-31'),
        ('2016-01-01', '2016-12-31', '2017-01-01', '2017-12-31'),
        ('2017-01-01', '2017-12-31', '2018-01-01', '2018-12-31')
    ]

    for train_start, train_end, test_start, test_end in ranges:
        df_train = df.loc[train_start:train_end]
        df_test = df.loc[test_start:test_end]
        splits.append((df_train, df_test))

    return splits

splits = create_fixed_yearly_splits(df_prepared)

# Visualizar os períodos
for i, (train, test) in enumerate(splits, start=1):
    print(f"Janela {i}:")
    print(f"  Treino: {train.index.min().date()} → {train.index.max().date()}  ({len(train)} registos)")
    print(f"  Teste:  {test.index.min().date()} → {test.index.max().date()}  ({len(test)} registos)\n")


Janela 1:
  Treino: 2015-01-01 → 2015-12-31  (8737 registos)
  Teste:  2016-01-01 → 2016-12-31  (8784 registos)

Janela 2:
  Treino: 2016-01-01 → 2016-12-31  (8784 registos)
  Teste:  2017-01-01 → 2017-12-31  (8760 registos)

Janela 3:
  Treino: 2017-01-01 → 2017-12-31  (8760 registos)
  Teste:  2018-01-01 → 2018-12-31  (8759 registos)



In [31]:
forecast_horizon = 168
df_prepared['target'] = df_prepared['price actual'].shift(-forecast_horizon)
df_prepared = df_prepared.dropna(subset=['target'])

In [32]:
def evaluate_model_across_splits(splits, model_type='lr', forecast_horizon=168):
    """
    Avalia um modelo em 3 janelas com várias métricas.
    """
    results = []

    for i, (df_train, df_test) in enumerate(splits, start=1):
        # Prepara features e target
        df_train = df_train.copy()
        df_test = df_test.copy()

        df_train['target'] = df_train['price actual'].shift(-forecast_horizon)
        df_test['target'] = df_test['price actual'].shift(-forecast_horizon)

        df_train.dropna(subset=['target'], inplace=True)
        df_test.dropna(subset=['target'], inplace=True)

        drop_cols = ['price actual', 'price day ahead', 'target']
        feature_cols = [col for col in df_train.columns if col not in drop_cols]

        X_train = df_train[feature_cols].values
        y_train = df_train['target'].values
        X_test = df_test[feature_cols].values
        y_test = df_test['target'].values

        # Escolher o scaler adequado
        if model_type == 'lstm':
            scaler_X = MinMaxScaler()
            scaler_y = MinMaxScaler()
        else:
            scaler_X = StandardScaler()
            scaler_y = StandardScaler()

        X_train_scaled = scaler_X.fit_transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)
        y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
        y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

        # Escolher o modelo
        if model_type == 'lr':
            model = LinearRegression()
        elif model_type == 'rf':
            model = RandomForestRegressor(n_estimators=100, random_state=42)
        elif model_type == 'xgb':
            model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
        else:
            raise ValueError("Modelo não suportado.")

        model.fit(X_train_scaled, y_train_scaled)
        y_pred_scaled = model.predict(X_test_scaled)
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

        # Métricas
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred) * 100
        rmae = mae / np.mean(np.abs(y_test))

        results.append({
            'janela': i,
            'MAE': mae,
            'RMSE': rmse,
            'MAPE (%)': mape,
            'rMAE': rmae
        })

    return results


In [33]:
xgb_results = evaluate_model_across_splits(splits, model_type='xgb', forecast_horizon=168)

import pandas as pd
df_xgb = pd.DataFrame(xgb_results)
display(df_xgb)

# Linear Regression
lr_results = evaluate_model_across_splits(splits, model_type='lr', forecast_horizon=168)
df_lr = pd.DataFrame(lr_results)
display(df_lr)

# Random Forest
rf_results = evaluate_model_across_splits(splits, model_type='rf', forecast_horizon=168)
df_rf = pd.DataFrame(rf_results)
display(df_rf)



Unnamed: 0,janela,MAE,RMSE,MAPE (%),rMAE
0,1,13.799471,16.629015,40.197175,0.289723
1,2,6.405461,8.755751,11.368923,0.108381
2,3,6.545475,8.949137,13.109144,0.102603


Unnamed: 0,janela,MAE,RMSE,MAPE (%),rMAE
0,1,12.776209,15.513166,36.386887,0.268239
1,2,5.992934,8.234128,11.166853,0.101401
2,3,7.231624,9.72499,14.202488,0.113358


Unnamed: 0,janela,MAE,RMSE,MAPE (%),rMAE
0,1,12.779714,15.581631,37.805272,0.268313
1,2,6.519287,8.851344,11.669757,0.110307
2,3,6.016541,8.420669,12.197772,0.094312


In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

def tune_random_forest(splits, forecast_horizon=168):
    """
    Faz tuning leve do Random Forest com base nas 3 janelas fixas.
    Testa combinações simples de n_estimators e max_depth.
    Retorna tabela com média das métricas por configuração.
    """
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20]
    }

    results = []

    for n in param_grid['n_estimators']:
        for d in param_grid['max_depth']:
            maes, mapes, rmses, rmaes = [], [], [], []

            for df_train, df_test in splits:
                df_train = df_train.copy()
                df_test = df_test.copy()

                df_train['target'] = df_train['price actual'].shift(-forecast_horizon)
                df_test['target'] = df_test['price actual'].shift(-forecast_horizon)

                df_train.dropna(subset=['target'], inplace=True)
                df_test.dropna(subset=['target'], inplace=True)

                drop_cols = ['price actual', 'price day ahead', 'target']
                feature_cols = [col for col in df_train.columns if col not in drop_cols]

                X_train = df_train[feature_cols].values
                y_train = df_train['target'].values
                X_test = df_test[feature_cols].values
                y_test = df_test['target'].values

                scaler_X = StandardScaler()
                scaler_y = StandardScaler()
                X_train_scaled = scaler_X.fit_transform(X_train)
                X_test_scaled = scaler_X.transform(X_test)
                y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
                y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

                model = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=42)
                model.fit(X_train_scaled, y_train_scaled)
                y_pred_scaled = model.predict(X_test_scaled)
                y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

                mae = mean_absolute_error(y_test, y_pred)
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                mape = mean_absolute_percentage_error(y_test, y_pred) * 100
                rmae = mae / np.mean(np.abs(y_test))

                maes.append(mae)
                rmses.append(rmse)
                mapes.append(mape)
                rmaes.append(rmae)

            results.append({
                'n_estimators': n,
                'max_depth': d,
                'MAE': np.mean(maes),
                'RMSE': np.mean(rmses),
                'MAPE (%)': np.mean(mapes),
                'rMAE': np.mean(rmaes)
            })

    return pd.DataFrame(results)

rf_tuning_results = tune_random_forest(splits)
display(rf_tuning_results)


Unnamed: 0,n_estimators,max_depth,MAE,RMSE,MAPE (%),rMAE
0,100,10,8.387829,10.916625,20.51757,0.156755
1,100,20,8.439006,10.955887,20.551324,0.157656
2,200,10,8.396766,10.914608,20.523251,0.156926
3,200,20,8.442393,10.953486,20.563549,0.157782


In [35]:
from xgboost import XGBRegressor

def tune_xgboost(splits, forecast_horizon=168):
    """
    Faz tuning leve do XGBoost com base nas 3 janelas fixas.
    Retorna média das métricas por combinação de hiperparâmetros.
    """
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [6, 8]
    }

    results = []

    for n in param_grid['n_estimators']:
        for lr in param_grid['learning_rate']:
            for d in param_grid['max_depth']:
                maes, mapes, rmses, rmaes = [], [], [], []

                for df_train, df_test in splits:
                    df_train = df_train.copy()
                    df_test = df_test.copy()

                    df_train['target'] = df_train['price actual'].shift(-forecast_horizon)
                    df_test['target'] = df_test['price actual'].shift(-forecast_horizon)

                    df_train.dropna(subset=['target'], inplace=True)
                    df_test.dropna(subset=['target'], inplace=True)

                    drop_cols = ['price actual', 'price day ahead', 'target']
                    feature_cols = [col for col in df_train.columns if col not in drop_cols]

                    X_train = df_train[feature_cols].values
                    y_train = df_train['target'].values
                    X_test = df_test[feature_cols].values
                    y_test = df_test['target'].values

                    scaler_X = StandardScaler()
                    scaler_y = StandardScaler()
                    X_train_scaled = scaler_X.fit_transform(X_train)
                    X_test_scaled = scaler_X.transform(X_test)
                    y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
                    y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

                    model = XGBRegressor(n_estimators=n, learning_rate=lr, max_depth=d, random_state=42)
                    model.fit(X_train_scaled, y_train_scaled)
                    y_pred_scaled = model.predict(X_test_scaled)
                    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

                    mae = mean_absolute_error(y_test, y_pred)
                    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
                    rmae = mae / np.mean(np.abs(y_test))

                    maes.append(mae)
                    rmses.append(rmse)
                    mapes.append(mape)
                    rmaes.append(rmae)

                results.append({
                    'n_estimators': n,
                    'learning_rate': lr,
                    'max_depth': d,
                    'MAE': np.mean(maes),
                    'RMSE': np.mean(rmses),
                    'MAPE (%)': np.mean(mapes),
                    'rMAE': np.mean(rmaes)
                })

    return pd.DataFrame(results)

xgb_tuning_results = tune_xgboost(splits)
display(xgb_tuning_results)


Unnamed: 0,n_estimators,learning_rate,max_depth,MAE,RMSE,MAPE (%),rMAE
0,100,0.05,6,8.722027,11.25911,21.283755,0.163511
1,100,0.05,8,8.710895,11.261533,21.152055,0.162746
2,100,0.1,6,8.916803,11.444635,21.558414,0.166902
3,100,0.1,8,8.969084,11.520602,21.76045,0.167809
4,200,0.05,6,8.848607,11.380299,21.531782,0.165876
5,200,0.05,8,8.753945,11.31104,21.223589,0.16346
6,200,0.1,6,9.048125,11.571811,21.755141,0.16911
7,200,0.1,8,8.991113,11.54266,21.781495,0.168119


In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

def evaluate_lstm_on_splits(splits, forecast_horizon=168, timesteps=24, units=50, batch_size=32, epochs=10):
    results = []

    for i, (df_train, df_test) in enumerate(splits, start=1):
        df_train = df_train.copy()
        df_test = df_test.copy()

        df_train['target'] = df_train['price actual'].shift(-forecast_horizon)
        df_test['target'] = df_test['price actual'].shift(-forecast_horizon)

        df_train.dropna(subset=['target'], inplace=True)
        df_test.dropna(subset=['target'], inplace=True)

        drop_cols = ['price actual', 'price day ahead', 'target']
        feature_cols = [col for col in df_train.columns if col not in drop_cols]

        X_train = df_train[feature_cols].values
        y_train = df_train['target'].values
        X_test = df_test[feature_cols].values
        y_test = df_test['target'].values

        # Normalização com MinMaxScaler
        scaler_X = MinMaxScaler()
        scaler_y = MinMaxScaler()
        X_train_scaled = scaler_X.fit_transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)
        y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
        y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

        # Criação das sequências para LSTM
        def build_sequences(X, y, timesteps):
            Xs, ys = [], []
            for i in range(timesteps, len(X)):
                Xs.append(X[i - timesteps:i])
                ys.append(y[i])
            return np.array(Xs), np.array(ys)

        X_train_seq, y_train_seq = build_sequences(X_train_scaled, y_train_scaled, timesteps)
        X_test_seq, y_test_seq = build_sequences(X_test_scaled, y_test_scaled, timesteps)

        # Modelo LSTM
        model = Sequential()
        model.add(LSTM(units, activation='relu', input_shape=(timesteps, X_train_seq.shape[2])))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size, verbose=0)

        y_pred_scaled = model.predict(X_test_seq).ravel()
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
        y_test_real = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).ravel()

        # Métricas
        mae = mean_absolute_error(y_test_real, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test_real, y_pred))
        mape = mean_absolute_percentage_error(y_test_real, y_pred) * 100
        rmae = mae / np.mean(np.abs(y_test_real))

        results.append({
            'janela': i,
            'MAE': mae,
            'RMSE': rmse,
            'MAPE (%)': mape,
            'rMAE': rmae
        })

    return results

lstm_results = evaluate_lstm_on_splits(splits, forecast_horizon=168)
df_lstm = pd.DataFrame(lstm_results)
display(df_lstm)


  super().__init__(**kwargs)


[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


Unnamed: 0,janela,MAE,RMSE,MAPE (%),rMAE
0,1,15.412731,17.965089,42.907639,0.323514
1,2,8.120766,10.45135,14.269063,0.137457
2,3,10.859065,13.083367,18.931895,0.170273


In [37]:
def tune_lstm(splits, forecast_horizon=168, timesteps=24, epochs=10):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense
    from sklearn.preprocessing import MinMaxScaler
    import numpy as np

    param_grid = {
        'units': [50, 100],
        'batch_size': [16, 32]
    }

    results = []

    for units in param_grid['units']:
        for batch_size in param_grid['batch_size']:
            maes, mapes, rmses, rmaes = [], [], [], []

            for df_train, df_test in splits:
                df_train = df_train.copy()
                df_test = df_test.copy()

                df_train['target'] = df_train['price actual'].shift(-forecast_horizon)
                df_test['target'] = df_test['price actual'].shift(-forecast_horizon)

                df_train.dropna(subset=['target'], inplace=True)
                df_test.dropna(subset=['target'], inplace=True)

                drop_cols = ['price actual', 'price day ahead', 'target']
                feature_cols = [col for col in df_train.columns if col not in drop_cols]

                X_train = df_train[feature_cols].values
                y_train = df_train['target'].values
                X_test = df_test[feature_cols].values
                y_test = df_test['target'].values

                scaler_X = MinMaxScaler()
                scaler_y = MinMaxScaler()
                X_train_scaled = scaler_X.fit_transform(X_train)
                X_test_scaled = scaler_X.transform(X_test)
                y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
                y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

                # Sequências
                def build_sequences(X, y, timesteps):
                    Xs, ys = [], []
                    for i in range(timesteps, len(X)):
                        Xs.append(X[i - timesteps:i])
                        ys.append(y[i])
                    return np.array(Xs), np.array(ys)

                X_train_seq, y_train_seq = build_sequences(X_train_scaled, y_train_scaled, timesteps)
                X_test_seq, y_test_seq = build_sequences(X_test_scaled, y_test_scaled, timesteps)

                model = Sequential()
                model.add(LSTM(units, activation='relu', input_shape=(timesteps, X_train_seq.shape[2])))
                model.add(Dense(1))
                model.compile(optimizer='adam', loss='mse')
                model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size, verbose=0)

                y_pred_scaled = model.predict(X_test_seq).ravel()
                y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
                y_test_real = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).ravel()

                mae = mean_absolute_error(y_test_real, y_pred)
                rmse = np.sqrt(mean_squared_error(y_test_real, y_pred))
                mape = mean_absolute_percentage_error(y_test_real, y_pred) * 100
                rmae = mae / np.mean(np.abs(y_test_real))

                maes.append(mae)
                rmses.append(rmse)
                mapes.append(mape)
                rmaes.append(rmae)

            results.append({
                'units': units,
                'batch_size': batch_size,
                'MAE': np.mean(maes),
                'RMSE': np.mean(rmses),
                'MAPE (%)': np.mean(mapes),
                'rMAE': np.mean(rmaes)
            })

    return pd.DataFrame(results)

lstm_tuning_results = tune_lstm(splits)
display(lstm_tuning_results)


  super().__init__(**kwargs)


[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


Unnamed: 0,units,batch_size,MAE,RMSE,MAPE (%),rMAE
0,50,16,10.127373,12.504681,22.996347,0.186857
1,50,32,11.335515,14.056716,26.302912,0.210037
2,100,16,10.697724,13.173039,25.055857,0.199013
3,100,32,11.384304,14.023839,26.369113,0.212262


In [38]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import numpy as np

# Função do teste (como antes)
def diebold_mariano_manual(e1, e2, h=1):
    d = e1 - e2
    d_mean = np.mean(d)
    d_var = np.var(d, ddof=1)
    n = len(d)
    DM_stat = d_mean / np.sqrt(d_var / n)
    from scipy.stats import t
    p_value = 2 * (1 - t.cdf(np.abs(DM_stat), df=n - 1))
    return DM_stat, p_value

# ============
# PREPARAÇÃO
# ============
df_train, df_test = splits[2]  # Janela 3
df_train = df_train.copy()
df_test = df_test.copy()
forecast_horizon = 168

df_train['target'] = df_train['price actual'].shift(-forecast_horizon)
df_test['target'] = df_test['price actual'].shift(-forecast_horizon)
df_train.dropna(subset=['target'], inplace=True)
df_test.dropna(subset=['target'], inplace=True)

drop_cols = ['price actual', 'price day ahead', 'target']
feature_cols = [col for col in df_train.columns if col not in drop_cols]

X_train = df_train[feature_cols].values
y_train = df_train['target'].values
X_test = df_test[feature_cols].values
y_test = df_test['target'].values

# ============
# XGBOOST
# ============
scaler_X_xgb = StandardScaler()
scaler_y_xgb = StandardScaler()
X_train_xgb = scaler_X_xgb.fit_transform(X_train)
X_test_xgb = scaler_X_xgb.transform(X_test)
y_train_xgb = scaler_y_xgb.fit_transform(y_train.reshape(-1, 1)).ravel()

model_xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model_xgb.fit(X_train_xgb, y_train_xgb)
y_pred_xgb_scaled = model_xgb.predict(X_test_xgb)
y_pred_xgb = scaler_y_xgb.inverse_transform(y_pred_xgb_scaled.reshape(-1, 1)).ravel()

# ============
# LSTM
# ============
scaler_X_lstm = MinMaxScaler()
scaler_y_lstm = MinMaxScaler()
X_train_lstm = scaler_X_lstm.fit_transform(X_train)
X_test_lstm = scaler_X_lstm.transform(X_test)
y_train_lstm = scaler_y_lstm.fit_transform(y_train.reshape(-1, 1)).ravel()

# Função para criar sequências
def build_sequences(X, y, timesteps=24):
    Xs, ys = [], []
    for i in range(timesteps, len(X)):
        Xs.append(X[i - timesteps:i])
        ys.append(y[i])
    return np.array(Xs), np.array(ys)

X_train_seq, y_train_seq = build_sequences(X_train_lstm, y_train_lstm, timesteps=24)
X_test_seq, y_test_seq = build_sequences(X_test_lstm, scaler_y_lstm.transform(y_test.reshape(-1, 1)).ravel(), timesteps=24)

model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(24, X_train_seq.shape[2])))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mse')
model_lstm.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32, verbose=0)

y_pred_lstm_scaled = model_lstm.predict(X_test_seq).ravel()
y_pred_lstm = scaler_y_lstm.inverse_transform(y_pred_lstm_scaled.reshape(-1, 1)).ravel()
y_true = scaler_y_lstm.inverse_transform(y_test_seq.reshape(-1, 1)).ravel()

# ============
# DIEBOLD-MARIANO TEST
# ============
e_xgb = np.abs(y_true - y_pred_xgb[-len(y_true):])  # alinhar tamanhos
e_lstm = np.abs(y_true - y_pred_lstm)

dm_stat, p_val = diebold_mariano_manual(e_lstm, e_xgb, h=1)
print(f"📊 Diebold-Mariano Statistic: {dm_stat:.4f}")
print(f"📌 p-value: {p_val:.4f}")


  super().__init__(**kwargs)


[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
📊 Diebold-Mariano Statistic: 38.9308
📌 p-value: 0.0000


In [39]:
from sklearn.ensemble import RandomForestRegressor

# ============
# RANDOM FOREST
# ============
scaler_X_rf = StandardScaler()
scaler_y_rf = StandardScaler()

X_train_rf = scaler_X_rf.fit_transform(X_train)
X_test_rf = scaler_X_rf.transform(X_test)
y_train_rf = scaler_y_rf.fit_transform(y_train.reshape(-1, 1)).ravel()

model_rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
model_rf.fit(X_train_rf, y_train_rf)

y_pred_rf_scaled = model_rf.predict(X_test_rf)
y_pred_rf = scaler_y_rf.inverse_transform(y_pred_rf_scaled.reshape(-1, 1)).ravel()

# Alinhar y_true com y_pred_rf
y_true_rf = scaler_y_rf.inverse_transform(y_test.reshape(-1, 1)).ravel()

# ============
# DIEBOLD–MARIANO: RF vs XGBoost
# ============
# Alinhar tamanhos (caso necessário)
min_len = min(len(y_true_rf), len(y_pred_xgb))

e_rf = np.abs(y_true_rf[-min_len:] - y_pred_rf[-min_len:])
e_xgb = np.abs(y_true_rf[-min_len:] - y_pred_xgb[-min_len:])

dm_stat, p_val = diebold_mariano_manual(e_rf, e_xgb, h=1)

print(f"📊 Diebold-Mariano Statistic (RF vs XGB): {dm_stat:.4f}")
print(f"📌 p-value: {p_val:.4f}")


📊 Diebold-Mariano Statistic (RF vs XGB): -3.4012
📌 p-value: 0.0007


In [None]:
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Preparar treino a partir da Janela 3
df_train, df_test = splits[2]  # Janela 3
df_train = df_train.copy()

# Criar target com 168h (7 dias)
forecast_horizon = 168
df_train['target'] = df_train['price actual'].shift(-forecast_horizon)
df_train.dropna(subset=['target'], inplace=True)

# Selecionar features
drop_cols = ['price actual', 'price day ahead', 'target']
feature_cols = [col for col in df_train.columns if col not in drop_cols]
train_X = df_train[feature_cols].values
train_y = df_train['target'].values

# Escalar
from sklearn.preprocessing import StandardScaler
scaler_X = StandardScaler()
scaler_y = StandardScaler()
train_X = scaler_X.fit_transform(train_X)
train_y = scaler_y.fit_transform(train_y.reshape(-1, 1)).ravel()

# Treinar o modelo com os dados disponíveis
xgb_model_long = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model_long.fit(train_X, train_y)

latest_input = df_test[-24:].copy()

# Preparar a sequência mais recente
forecast_168h = []
current_input = latest_input.copy()

for step in range(168):
    # Escalar e prever a próxima hora
    input_scaled = scaler_X.transform(current_input[feature_cols])
    pred_scaled = xgb_model_long.predict(input_scaled[-1:])
    pred_real = scaler_y.inverse_transform(pred_scaled.reshape(-1, 1)).flatten()[0]
    forecast_168h.append(pred_real)

    # Atualizar linha seguinte com novos valores
    next_row = current_input.iloc[-1:].copy()
    next_row['lag_1h'] = pred_real
    next_row['lag_24h'] = current_input.iloc[-24]['lag_1h'] if 'lag_1h' in current_input.columns else np.nan

    # Atualizar variáveis temporais
    next_row['hour'] = (next_row['hour'] + 1) % 24
    next_row['hour_sin'] = np.sin(2 * np.pi * next_row['hour'] / 24)
    next_row['hour_cos'] = np.cos(2 * np.pi * next_row['hour'] / 24)
    next_row['day_of_week'] = (next_row['day_of_week'] + (next_row['hour'] == 0).astype(int)) % 7
    next_row['dow_sin'] = np.sin(2 * np.pi * next_row['day_of_week'] / 7)
    next_row['dow_cos'] = np.cos(2 * np.pi * next_row['day_of_week'] / 7)

    # Atualizar sequência
    current_input = pd.concat([current_input.iloc[1:], next_row])

# Plot do resultado
plt.figure(figsize=(14, 5))
plt.plot(range(1, 169), forecast_168h, marker='o', label='XGBoost Forecast')
plt.title("XGBoost Forecast — Next 7 Days (168h Horizon)")
plt.xlabel("Hour Ahead")
plt.ylabel("Forecasted Price (€/MWh)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


NameError: name 'train_X' is not defined