1) Imports, Config & Setup

In [12]:
from __future__ import annotations
import os, math, json, warnings
import os
from typing import Optional, Tuple, Dict, List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
warnings.filterwarnings("ignore", category=FutureWarning)

# Reprodutibilidade
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Pasta de dados e de saída
DATA_DIR = os.path.join(os.getcwd(), "data")
OUT_DIR = os.path.join(os.getcwd(), "out")
os.makedirs(OUT_DIR, exist_ok=True)

# Hiperparâmetros globais (você pode ajustar depois)
CFG = {
    "apply_log": False,
    "split_ratios": (0.5, 0.25, 0.25),
    "freq": None,              # ex: "D","W","M" ou None p/ inferir
    "force_regular": False,    # True => reindexa pela frequência e interpola

    # Sazonalidade (se souber, informe; senão deixe None)
    "seasonal_period": None,   # ex: 7, 12, 24, 52

    # Busca ARIMA/SARIMA
    "arima_pdq_grid": [(0,1,1), (1,1,1), (2,1,1)],
    "arima_PDQ_grid": [(0,0,0), (1,1,1)],

    # KNN
    "lags_grid": [12],               # nº máximo de lags p/ features
    "rolling_feats": [None, 6],      # stats móveis
    "knn_n_neighbors": [5, 15, 25],
    "knn_weights": ["uniform", "distance"],
    "knn_p": [1, 2],

    # VGG-1D
    "use_vgg": True,
    "vgg_windows": [12, 24],
    "vgg_filters": [32, 64],
    "vgg_dropout": [0.0, 0.2],
    "vgg_epochs": 40,
    "vgg_batch": 32,
    "vgg_patience": 6,

    # Híbridos
    "hybrid_residual": True,
    "hybrid_ensemble": True,
    "ensemble_weight_step": 0.1,
}

2) Utilidades de E/S, métricas e gráficos

In [14]:
# %%
def mse(y, yhat):
    from sklearn.metrics import mean_squared_error
    return float(mean_squared_error(y, yhat))

def mape(y, yhat, eps=1e-8):
    y, yhat = np.asarray(y), np.asarray(yhat)
    return float(np.mean(np.abs((y - yhat) / np.maximum(np.abs(y), eps))) * 100)

def save_plot(name):
    path = os.path.join(OUT_DIR, f"{name}.png")
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

def plot_fit(idx, y_true, y_pred, title, name):
    plt.figure(figsize=(10,4))
    plt.plot(idx, y_true, label="Observado")
    plt.plot(idx, y_pred, label="Previsto")
    plt.title(title); plt.xlabel("Tempo"); plt.legend()
    save_plot(name)

def export_json(obj: dict, name: str):
    with open(os.path.join(OUT_DIR, f"{name}.json"), "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


3) Descobrir datasets e perguntar qual usar

In [15]:
# %%
def list_datasets(data_dir=DATA_DIR):
    files = []
    for fn in os.listdir(data_dir):
        if fn.lower().endswith((".xlsx", ".csv")):
            files.append(os.path.join(data_dir, fn))
    files.sort()
    return files

files = list_datasets()
assert files, "Nenhum arquivo .xlsx/.csv encontrado em ./data"

print("Datasets encontrados:\n")
for i, f in enumerate(files):
    print(f"{i}: {os.path.basename(f)}")

# Pergunta ao usuário qual usar
while True:
    try:
        choice = int(input("\nDigite o índice do dataset desejado: ").strip())
        assert 0 <= choice < len(files)
        DATA_PATH = files[choice]
        break
    except Exception:
        print("Índice inválido. Tente novamente.")

print(f"\nVocê escolheu: {os.path.basename(DATA_PATH)}")

Datasets encontrados:

0: chuva_fortaleza.xlsx
1: dengue_pernambuco.xlsx
2: solar france.xlsx

Você escolheu: solar france.xlsx


4) Detecção automática das colunas (data e alvo) + carregamento

In [16]:
# %%
from datetime import datetime

TARGET_HINTS = [
    # comuns em PT/EN
    "valor","value","target","y","serie","series","mm","chuva","precipit","precip","rain",
    "kwh","energia","power","load","demand","production","cases","casos","incid","count"
]

def guess_datetime_col(df: pd.DataFrame) -> Optional[str]:
    # heurística: 1ª coluna que converte para datetime com sucesso e com poucos NaNs
    best = None
    for c in df.columns:
        try:
            s = pd.to_datetime(df[c], errors="raise")
            if s.notna().mean() > 0.9:
                best = c
                break
        except:
            continue
    return best

def guess_target_col(df: pd.DataFrame, date_col: Optional[str]) -> str:
    # 1) preferir nomes que batem com TARGET_HINTS e sejam numéricos
    lower_map = {c: c.lower() for c in df.columns}
    for c in df.columns:
        lc = lower_map[c]
        if any(h in lc for h in TARGET_HINTS) and pd.api.types.is_numeric_dtype(df[c]):
            return c
    # 2) se não achou, pegar a coluna numérica com maior variância (ignora a de data)
    cand = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != date_col]
    if not cand:
        raise ValueError("Não há colunas numéricas para usar como alvo.")
    return max(cand, key=lambda col: np.nanvar(pd.to_numeric(df[col], errors="coerce")))

def load_series_auto(path: str) -> Tuple[pd.Series, str, str]:
    if path.lower().endswith(".csv"):
        df = pd.read_csv(path)
    else:
        df = pd.read_excel(path, sheet_name=0, engine="openpyxl")

    date_col = guess_datetime_col(df)
    if date_col is None:
        raise ValueError("Não consegui detectar a coluna de datas. Informe uma no arquivo ou converta previamente.")

    target_col = guess_target_col(df, date_col)
    s = df[[date_col, target_col]].copy()
    s[date_col] = pd.to_datetime(s[date_col])
    s = s.dropna().sort_values(date_col).set_index(date_col)[target_col].astype(float)

    # Regularização opcional
    freq = CFG["freq"]
    if freq is None:
        try: freq = pd.infer_freq(s.index)
        except: freq = None
    if CFG["force_regular"] and freq:
        idx = pd.date_range(s.index.min(), s.index.max(), freq=freq)
        s = s.reindex(idx).interpolate(limit_direction="both")

    return s, date_col, target_col

y_raw, detected_date_col, detected_target_col = load_series_auto(DATA_PATH)
print(f"Coluna de data detectada: {detected_date_col}")
print(f"Coluna alvo detectada:    {detected_target_col}")
print(f"N observações: {len(y_raw)}")
y_raw.head()


Coluna de data detectada: Date and Hour
Coluna alvo detectada:    Production
N observações: 1439


Date and Hour
2020-01-01 00:00:00+01:00    0.0
2020-01-01 01:00:00+01:00    0.0
2020-01-01 02:00:00+01:00    0.0
2020-01-01 03:00:00+01:00    0.0
2020-01-01 04:00:00+01:00    0.0
Name: Production, dtype: float64

5) Transformações, testes básicos e split 50/25/25

In [17]:
# %%
def apply_log_if_needed(y: pd.Series):
    if CFG["apply_log"]:
        return np.log1p(y.clip(lower=0)), "log1p"
    return y.copy(), None

def invert_transform(pred, trans):
    if trans == "log1p": return np.expm1(pred)
    return pred

def time_split(y: pd.Series, ratios=(0.5,0.25,0.25)):
    n = len(y)
    n_tr = int(n*ratios[0]); n_val = int(n*ratios[1])
    y_tr = y.iloc[:n_tr]
    y_va = y.iloc[n_tr:n_tr+n_val]
    y_te = y.iloc[n_tr+n_val:]
    return y_tr, y_va, y_te

y, trans = apply_log_if_needed(y_raw)
y_train, y_val, y_test = time_split(y, CFG["split_ratios"])

print(f"Split -> train={len(y_train)}, val={len(y_val)}, test={len(y_test)}")

Split -> train=719, val=359, test=361


6) ARIMA/SARIMA (Box–Jenkins)

In [18]:
# %%
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import acorr_ljungbox

class ArimaPick:
    def __init__(self, order, seas, aic, lb_pmin, model):
        self.order, self.seas = order, seas
        self.aic, self.lb_pmin, self.model = aic, lb_pmin, model

def fit_arima_grid(y_tr: pd.Series, s: Optional[int]):
    best = None
    for (p,d,q) in CFG["arima_pdq_grid"]:
        for (P,D,Q) in (CFG["arima_PDQ_grid"] if s else [(0,0,0)]):
            seas = (P,D,Q,s) if s else (0,0,0,0)
            try:
                m = SARIMAX(y_tr, order=(p,d,q), seasonal_order=seas).fit(disp=False)
                resid = m.resid.dropna()
                lb = acorr_ljungbox(resid, lags=[10,15,20], return_df=True)["lb_pvalue"].min()
                pick = ArimaPick((p,d,q), seas, float(m.aic), float(lb), m)
                if (best is None) or (pick.aic < best.aic - 1e-6) or \
                   (abs(pick.aic - best.aic) < 1e-6 and pick.lb_pmin > best.lb_pmin):
                    best = pick
            except:
                continue
    if best is None: raise RuntimeError("ARIMA/SARIMA não encontrado.")
    return best

def arima_walk(res, y_next: pd.Series):
    preds=[]
    for yt in y_next.values:
        pm = res.get_forecast(steps=1).predicted_mean.iloc[-1]
        preds.append(pm)
        res = res.append(endog=[yt], refit=False)
    return np.array(preds)

s = CFG["seasonal_period"]
ar_best = fit_arima_grid(y_train, s)
pred_train_in = ar_best.model.get_prediction().predicted_mean.loc[y_train.index].values
pred_val_ar = arima_walk(ar_best.model, y_val)
pred_test_ar = arima_walk(ar_best.model, y_test)

# escala original p/ métricas
ytr_p = invert_transform(y_train.values, trans)
yva_p = invert_transform(y_val.values, trans)
yte_p = invert_transform(y_test.values, trans)
ar_tr_p = invert_transform(pred_train_in, trans)
ar_va_p = invert_transform(pred_val_ar, trans)
ar_te_p = invert_transform(pred_test_ar, trans)

ar_metrics = {
    "train": {"mse": mse(ytr_p, ar_tr_p), "mape": mape(ytr_p, ar_tr_p)},
    "val":   {"mse": mse(yva_p, ar_va_p), "mape": mape(yva_p, ar_va_p)},
    "test":  {"mse": mse(yte_p, ar_te_p), "mape": mape(yte_p, ar_te_p)},
    "order": ar_best.order, "seasonal": ar_best.seas, "aic": ar_best.aic, "lb_pmin": ar_best.lb_pmin
}
print(ar_metrics)

plot_fit(y_train.index, y_train.values, pred_train_in, f"ARIMA {ar_best.order} {ar_best.seas} - Treino", "arima_train")
plot_fit(y_val.index, y_val.values, pred_val_ar, "ARIMA - Validação", "arima_val")
plot_fit(y_test.index, y_test.values, pred_test_ar, "ARIMA - Teste", "arima_test")


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


{'train': {'mse': 10732.015894803631, 'mape': 57511259964.18698}, 'val': {'mse': 19206.03420160553, 'mape': 126794100843.13876}, 'test': {'mse': 21511.921780258075, 'mape': 112705459456.25873}, 'order': (2, 1, 1), 'seasonal': (0, 0, 0, 0), 'aic': 8712.682757856215, 'lb_pmin': 1.3479302194018126e-25}


7) KNN (AM clássico) – lags + rolling

In [19]:
# %%
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid

def make_supervised(y: pd.Series, max_lag: int, rolling: Optional[int]):
    df = pd.DataFrame({"y": y})
    for L in range(1, max_lag+1):
        df[f"lag_{L}"] = df["y"].shift(L)
    if rolling and rolling>1:
        df[f"roll_mean_{rolling}"] = df["y"].shift(1).rolling(rolling).mean()
        df[f"roll_std_{rolling}"]  = df["y"].shift(1).rolling(rolling).std()
    return df.dropna()

def split_sup(df_sup, idx_splits):
    it_tr, it_val, it_te = idx_splits
    X, y = df_sup.drop(columns=["y"]), df_sup["y"]
    Xtr = X.loc[X.index.intersection(it_tr)]; ytr = y.loc[Xtr.index]
    Xva = X.loc[X.index.intersection(it_val)]; yva = y.loc[Xva.index]
    Xte = X.loc[X.index.intersection(it_te)]; yte = y.loc[Xte.index]
    return Xtr, ytr, Xva, yva, Xte, yte

def knn_search(y: pd.Series, idx_splits):
    best={"score": math.inf}
    for lags in CFG["lags_grid"]:
        for roll in CFG["rolling_feats"]:
            sup = make_supervised(y, lags, roll)
            Xtr,ytr,Xva,yva,Xte,yte = split_sup(sup, idx_splits)
            if len(Xva)==0 or len(Xte)==0: continue
            grid = ParameterGrid({
                "n_neighbors": CFG["knn_n_neighbors"],
                "weights": CFG["knn_weights"],
                "p": CFG["knn_p"],
            })
            for params in grid:
                model = Pipeline([
                    ("scaler", StandardScaler()),
                    ("knn", KNeighborsRegressor(**params))
                ])
                model.fit(Xtr, ytr)
                p_tr = model.predict(Xtr); p_va = model.predict(Xva); p_te = model.predict(Xte)
                score = mse(yva, p_va)
                if score < best["score"]:
                    best = {
                        "score": score, "lags": lags, "rolling": roll, "params": params,
                        "preds": {"train": p_tr, "val": p_va, "test": p_te},
                        "truth": {"train": ytr, "val": yva, "test": yte},
                        "model": model
                    }
    if best["score"]==math.inf: raise RuntimeError("KNN não válido.")
    return best

idx_splits = (y_train.index, y_val.index, y_test.index)
knn_best = knn_search(y, idx_splits)

plot_fit(knn_best["truth"]["train"].index, knn_best["truth"]["train"].values, knn_best["preds"]["train"],
         "KNN - Treino", "knn_train")
plot_fit(knn_best["truth"]["val"].index, knn_best["truth"]["val"].values, knn_best["preds"]["val"],
         "KNN - Validação", "knn_val")
plot_fit(knn_best["truth"]["test"].index, knn_best["truth"]["test"].values, knn_best["preds"]["test"],
         "KNN - Teste", "knn_test")


8) VGG-1D (AP) – CNN estilo VGG para séries

In [26]:
# === IMPORTS DIRETOS ===
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Dense, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error as mse

In [27]:
def build_windows(y: pd.Series, window: int):
    arr = y.values.astype("float32")
    X, Y, idx = [], [], []
    for i in range(window, len(arr)):
        X.append(arr[i-window:i]); Y.append(arr[i]); idx.append(y.index[i])
    X = np.array(X).reshape(-1, window, 1); Y = np.array(Y); idx = pd.DatetimeIndex(idx)
    return X, Y, idx

def vgg1d_block(model, Conv1D, filters):
    model.add(Conv1D(filters, 3, padding="same", activation="relu"))
    model.add(Conv1D(filters, 3, padding="same", activation="relu"))
    model.add(MaxPooling1D(2))

def vgg_search(y: pd.Series, idx_splits):
    if not CFG["use_vgg"]:
        return {"disabled": True}
    tfp = try_import_tf()
    if tfp is None:
        return {"disabled": True, "reason": "TensorFlow não encontrado (instale tensorflow>=2.12)."}
    tf, Sequential, Conv1D, MaxPooling1D, Dropout, Dense, Flatten, EarlyStopping = tfp

    best={"score": math.inf}
    for window in CFG["vgg_windows"]:
        X, Y, idx_all = build_windows(y, window)
        it_tr, it_val, it_te = idx_splits
        it_tr2 = idx_all.intersection(it_tr); it_val2 = idx_all.intersection(it_val); it_te2 = idx_all.intersection(it_te)
        def take(ix):
            m = np.isin(idx_all, ix)
            return X[m], Y[m]
        Xtr, ytr = take(it_tr2); Xva, yva = take(it_val2); Xte, yte = take(it_te2)
        if len(Xva)==0 or len(Xte)==0: continue

        for f0 in CFG["vgg_filters"]:
            for dr in CFG["vgg_dropout"]:
                model = Sequential()
                vgg1d_block(model, Conv1D, f0)
                vgg1d_block(model, Conv1D, f0*2)
                vgg1d_block(model, Conv1D, f0*2)
                if dr>0: model.add(Dropout(dr))
                model.add(Flatten()); model.add(Dense(64, activation="relu"))
                model.add(Dense(1))
                model.compile(optimizer="adam", loss="mse")
                es = EarlyStopping(monitor="val_loss", patience=CFG["vgg_patience"],
                                   restore_best_weights=True, verbose=0)
                model.fit(Xtr, ytr, validation_data=(Xva, yva),
                          epochs=CFG["vgg_epochs"], batch_size=CFG["vgg_batch"], verbose=0, callbacks=[es])
                p_tr = model.predict(Xtr, verbose=0).ravel()
                p_va = model.predict(Xva, verbose=0).ravel()
                p_te = model.predict(Xte, verbose=0).ravel()
                score = mse(yva, p_va)
                if score < best["score"]:
                    best = {"score": score, "window": window, "filters": f0, "dropout": dr,
                            "preds": {"train": p_tr, "val": p_va, "test": p_te},
                            "truth": {"train": ytr, "val": yva, "test": yte},
                            "idx": {"train": it_tr2, "val": it_val2, "test": it_te2}}
    if best["score"]==math.inf:
        return {"disabled": True, "reason": "Sem janela VGG válida."}
    return best

vgg_best = vgg_search(y, idx_splits)
if not vgg_best.get("disabled", False):
    # gráficos usando os índices do VGG
    idv = vgg_best["idx"]
    plot_fit(idv["train"], vgg_best["truth"]["train"], vgg_best["preds"]["train"], "VGG-1D - Treino", "vgg_train")
    plot_fit(idv["val"],   vgg_best["truth"]["val"],   vgg_best["preds"]["val"],   "VGG-1D - Validação", "vgg_val")
    plot_fit(idv["test"],  vgg_best["truth"]["test"],  vgg_best["preds"]["test"],  "VGG-1D - Teste", "vgg_test")
else:
    print(vgg_best.get("reason"))


9) Modelos Híbridos (Residual Stacking + Ensemble)

In [34]:
# %% [markdown]
# ## Setor 9 — Sistema Híbrido (Residual Stacking + Ensemble)
# Combina previsões do ARIMA, KNN e VGG para melhorar o desempenho final.
# Inclui correções automáticas de alinhamento entre séries com janelas diferentes.

from sklearn.ensemble import GradientBoostingRegressor

# === Função de alinhamento entre modelos base ===
def align_for_ensemble(ar_val, ar_test, knn_best, vgg_best):
    cand = {"arima": {"val": ar_val, "test": ar_test}}
    cand["knn"] = {"val": knn_best["preds"]["val"], "test": knn_best["preds"]["test"]}
    if not vgg_best.get("disabled", False):
        cand["vgg"] = {"val": vgg_best["preds"]["val"], "test": vgg_best["preds"]["test"]}
    return cand


# === Função do modelo híbrido Residual Stacking ===
def residual_stacking(y_val, y_test, base_val, base_test, extra_dict):
    """
    Modelo híbrido por empilhamento residual (Residual Stacking).
    Treina um regressor para aprender os resíduos do modelo base (ARIMA),
    usando previsões adicionais (KNN, VGG) como features.
    """

    # --- Criação dos DataFrames ---
    Xv = pd.DataFrame({k: v for k, v in extra_dict.items()})
    Xt = pd.DataFrame({k: v for k, v in extra_dict.items()})

    # --- Alinhamento automático de tamanhos ---
    min_len_v = min(len(y_val), *[len(v) for v in extra_dict.values()], len(base_val))
    y_val = y_val.iloc[-min_len_v:]
    base_val = base_val[-min_len_v:]
    Xv = Xv.iloc[-min_len_v:]

    min_len_t = min(len(y_test), *[len(v) for v in extra_dict.values()], len(base_test))
    y_test = y_test.iloc[-min_len_t:]
    base_test = base_test[-min_len_t:]
    Xt = Xt.iloc[-min_len_t:]

    # --- Treinamento do modelo de resíduos ---
    res_val = y_val.values - base_val
    gbr = GradientBoostingRegressor(random_state=RANDOM_STATE)
    gbr.fit(Xv, res_val)

    # --- Predição dos resíduos e reconstrução final ---
    res_te = gbr.predict(Xt)
    out = base_test[-len(res_te):] + res_te
    return out


# === Função de busca de pesos para Ensemble ===
def ensemble_search(y_val, y_test, cand):
    names = list(cand.keys())
    step = CFG["ensemble_weight_step"]
    best = {"score": math.inf}

    if len(names) == 2:
        wgrid = np.arange(0, 1 + step, step)
        for w in wgrid:
            pv = w * cand[names[0]]["val"] + (1 - w) * cand[names[1]]["val"]
            sc = mse(y_val.values, pv)
            if sc < best["score"]:
                pt = w * cand[names[0]]["test"] + (1 - w) * cand[names[1]]["test"]
                best = {"score": sc,
                        "weights": {names[0]: float(w), names[1]: float(1 - w)},
                        "test_pred": pt}

    elif len(names) == 3:
        wgrid = np.arange(0, 1 + step, step)
        for w1 in wgrid:
            for w2 in wgrid:
                w3 = 1 - w1 - w2
                if w3 < -1e-9:
                    continue
                pv = (w1 * cand[names[0]]["val"] +
                      w2 * cand[names[1]]["val"] +
                      w3 * cand[names[2]]["val"])
                sc = mse(y_val.values, pv)
                if sc < best["score"]:
                    pt = (w1 * cand[names[0]]["test"] +
                          w2 * cand[names[1]]["test"] +
                          w3 * cand[names[2]]["test"])
                    best = {"score": sc,
                            "weights": {names[0]: float(w1),
                                        names[1]: float(w2),
                                        names[2]: float(w3)},
                            "test_pred": pt}
    else:
        raise ValueError("Ensemble implementado apenas para 2 ou 3 modelos.")

    return best


# === Execução do sistema híbrido ===
cand = align_for_ensemble(ar_va_p, ar_te_p, knn_best, vgg_best if not vgg_best.get("disabled", False) else {})
hybrid = {}

# --- Residual Stacking ---
if CFG["hybrid_residual"]:
    extra = {"knn": cand["knn"]["val"]}
    if "vgg" in cand:
        extra["vgg"] = cand["vgg"]["val"]

    rs_test = residual_stacking(pd.Series(yva_p, index=y_val.index),
                                pd.Series(yte_p, index=y_test.index),
                                cand["arima"]["val"], cand["arima"]["test"], extra)

    # alinhamento final para métricas
    min_len_eval = min(len(yte_p), len(rs_test))
    yte_aligned = yte_p[-min_len_eval:]
    rs_aligned = rs_test[-min_len_eval:]

    hybrid["residual"] = {
        "test_pred": rs_aligned,
        "mse": mse(yte_aligned, rs_aligned),
        "mape": mape(yte_aligned, rs_aligned)
    }

    plot_fit(y_test.index[-min_len_eval:], yte_aligned, rs_aligned,
             "Híbrido (Residual Stacking) - Teste", "hybrid_residual_test")


# --- Ensemble de modelos ---
if CFG["hybrid_ensemble"]:
    ens = ensemble_search(pd.Series(yva_p, index=y_val.index),
                          pd.Series(yte_p, index=y_test.index), cand)

    min_len_eval = min(len(yte_p), len(ens["test_pred"]))
    yte_aligned = yte_p[-min_len_eval:]
    ens_aligned = ens["test_pred"][-min_len_eval:]

    hybrid["ensemble"] = {
        "weights": ens["weights"],
        "mse": mse(yte_aligned, ens_aligned),
        "mape": mape(yte_aligned, ens_aligned),
        "test_pred": ens_aligned
    }

    plot_fit(y_test.index[-min_len_eval:], yte_aligned, ens_aligned,
             "Híbrido (Ensemble) - Teste", "hybrid_ensemble_test")

hybrid

{'residual': {'test_pred': array([-3.49274249e+00, -3.48417283e+00, -3.48610018e+00, -3.48566671e+00,
         -3.48576420e+00, -3.48574227e+00, -3.48574720e+00,  1.49283135e+01,
          2.23018523e+01,  7.34581029e+02,  2.65583523e+03,  2.71942760e+03,
          2.96924873e+03,  3.10993918e+03,  2.71601893e+03,  2.01190206e+03,
          1.11999549e+03,  8.52406565e+01, -9.83975632e+01,  3.11134079e+02,
          5.00610663e+01, -7.20857161e+01,  1.12884986e+00, -4.52358578e+00,
         -1.43033644e+00, -3.53824193e+00, -3.47393984e+00, -3.48840161e+00,
         -3.48514911e+00, -3.48588061e+00, -3.48571609e+00,  1.49283065e+01,
          1.73861612e+01,  3.02567036e+02,  1.19109318e+03,  1.57461905e+03,
          2.00074428e+03,  1.79603009e+03,  1.73348392e+03,  1.39818102e+03,
          7.54271298e+02,  2.99223954e+02, -1.28895803e+02,  2.25262105e+02,
          1.63222819e+01, -1.86345563e+01, -7.87237581e-02, -4.25199807e+00,
         -3.31341348e+00, -3.52450457e+00, -3.47702

10) Relatório consolidado + apresentação

In [38]:
# %% [markdown]
# ## Setor 10 — Relatório de Métricas e Exportação (CSV + Markdown)

rows = [
    ("ARIMA",
     ar_metrics["train"]["mse"], ar_metrics["train"]["mape"],
     ar_metrics["val"]["mse"], ar_metrics["val"]["mape"],
     ar_metrics["test"]["mse"], ar_metrics["test"]["mape"]),

    ("KNN",
     mse(knn_best["truth"]["train"], knn_best["preds"]["train"]),
     mape(knn_best["truth"]["train"], knn_best["preds"]["train"]),
     mse(knn_best["truth"]["val"], knn_best["preds"]["val"]),
     mape(knn_best["truth"]["val"], knn_best["preds"]["val"]),
     mse(knn_best["truth"]["test"], knn_best["preds"]["test"]),
     mape(knn_best["truth"]["test"], knn_best["preds"]["test"])),
]

# --- Adiciona VGG, se ativo ---
if not vgg_best.get("disabled", False):
    rows.append((
        f"VGG1D(w={vgg_best['window']}, f={vgg_best['filters']}, d={vgg_best['dropout']})",
        mse(vgg_best["truth"]["train"], vgg_best["preds"]["train"]),
        mape(vgg_best["truth"]["train"], vgg_best["preds"]["train"]),
        mse(vgg_best["truth"]["val"], vgg_best["preds"]["val"]),
        mape(vgg_best["truth"]["val"], vgg_best["preds"]["val"]),
        mse(vgg_best["truth"]["test"], vgg_best["preds"]["test"]),
        mape(vgg_best["truth"]["test"], vgg_best["preds"]["test"]),
    ))

# --- Adiciona híbridos (Residual e Ensemble, se existirem) ---
for k, v in hybrid.items():
    rows.append((
        f"HÍBRIDO-{k.upper()}",
        None, None, None, None,  # híbridos só avaliam no teste
        v.get("mse", np.nan),
        v.get("mape", np.nan)
    ))

# --- Monta DataFrame final ---
report = pd.DataFrame(rows, columns=[
    "Modelo", "MSE_train", "MAPE_train",
    "MSE_val", "MAPE_val", "MSE_test", "MAPE_test"
])

# exibe em notebook
display(
    report.style.format(
        lambda v: f"{v:.4f}" if isinstance(v, (int, float, np.floating)) and pd.notnull(v) else "-"
    ).set_caption("Resumo comparativo de modelos")
)

# --- Salva CSV ---
os.makedirs(OUT_DIR, exist_ok=True)
report.to_csv(os.path.join(OUT_DIR, "metrics_summary.csv"), index=False)

# --- Gera relatório Markdown ---
md = [
    "# Projeto – Análise de Séries Temporais e Regressão",
    "## Série utilizada",
    f"- Arquivo: `{os.path.basename(DATA_PATH)}`",
    f"- Coluna temporal detectada: `{detected_date_col}`",
    f"- Coluna alvo detectada: `{detected_target_col}`",
    "## Metodologia",
    "- Split temporal: **50% / 25% / 25%** (ordem preservada)",
    "- **ARIMA/SARIMA (Box–Jenkins)** – Seleção via AIC e teste de Ljung–Box",
    "- **KNN** – Lags + estatísticas móveis; parâmetros ajustados por validação",
    "- **VGG-1D** – CNN 1D com janelas deslizantes; early stopping via validação",
    "- **Híbridos** – Residual Stacking e Ensemble ponderado (pesos via validação)",
    "## Comparação (MSE / MAPE)",
    report.to_markdown(index=False)
]

with open(os.path.join(OUT_DIR, "presentation.md"), "w", encoding="utf-8") as f:
    f.write("\n\n".join(md))

print("✅ Relatórios gerados em:", os.path.abspath(OUT_DIR))

Unnamed: 0,Modelo,MSE_train,MAPE_train,MSE_val,MAPE_val,MSE_test,MAPE_test
0,-,10732.0159,57511259964.1870,19206.0342,126794100843.1388,21511.9218,112705459456.2587
1,-,22.5075,5417256013.2430,49825.9291,4891943272.8344,144299.9599,1092183326.9251
2,-,2457.5396,18244069376.0000,8206.3711,20543780864.0000,23071.3789,21014245376.0
3,-,-,-,-,-,28466.4976,145634499899.8284
4,-,-,-,-,-,13460.558,44136515304.4726


✅ Relatórios gerados em: C:\Users\Felipe Oliveira\Desktop\CIÊNCIA DE DADOS\SÉRIES TEMPORAIS\PROJETO DA DISCIPLINA\out
