## **RQ2**

> _Um modelo treinado em múltiplas células espaciais generaliza melhor para células não vistas do que modelos treinados localmente?_

Para garantir rigor metodológico e evitar vazamentos espaciais, a generalização será avaliada por protocolos explicitamente blindados, incluindo:
* Split por blocos espaciais, evitando treino e teste em células geograficamente adjacentes;
* Dois cenários complementares:
    * Leave-region-out: regiões inteiras são excluídas do treino e usadas apenas para teste;
    * Leave-cell-out: células individuais não vistas são usadas para teste, respeitando separação espacial mínima.
* Esses protocolos permitem distinguir entre interpolação espacial e generalização genuína.

## Libraries

In [23]:
import os
os.environ['NIXTLA_ID_AS_COL'] = '1'

import optuna
import itertools
import shutil
import time
import functools
import gc
import requests

import pandas as pd
import numpy as np
np.random.seed(1)

import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots
import plotly.io as pio
from graphmodex import plotlymodex
pio.renderers.default = 'notebook'

import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib as mpl

import joblib
import pickle
from IPython.display import clear_output

In [24]:
import neuralforecast
import mlforecast
import statsforecast
import utilsforecast
import coreforecast

from statsforecast import StatsForecast
from statsforecast.models import (
    Naive, SeasonalNaive, 
    AutoARIMA, AutoCES, AutoETS, AutoTheta,
)

from mlforecast import MLForecast
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

from neuralforecast import NeuralForecast
from neuralforecast.models import (
    NBEATS, NHITS,
    GRU, Informer, LSTM
)
from neuralforecast.losses.pytorch import MSE, SMAPE, MAE

from mlforecast.utils import PredictionIntervals

from pytorch_lightning import Trainer
# Stand-alone Lightning trainer limited to 4 steps, with logging, the progress
# bar and the model summary all switched off.
# NOTE(review): `trainer` is not referenced by any other cell visible here;
# the NeuralForecast models below receive their own `max_steps` — confirm
# whether this object is still needed.
trainer = Trainer(
    max_steps=4,
    logger=False,
    enable_progress_bar=False,
    enable_model_summary=False
)

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="optuna")


# ==================================================
# REPRODUCIBILITY
# ==================================================
import random
import torch

# Single seed shared by every random number generator used in the notebook.
SEED = 1

# Seed Python, NumPy and PyTorch (CPU + all CUDA devices) in that order.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(SEED)

# Disable the cuDNN autotuner and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
💡 Tip: For seamless cloud logging and experiment tracking, try installing [litlogger](https://pypi.org/project/litlogger/) to enable LitLogger, which logs metrics and artifacts automatically to the Lightning Experiments platform.


### Results Storage

In [25]:
from pathlib import Path

# Root folder for every RQ2 artefact; created eagerly so later cells can rely on it.
BASE_RESULTS = Path("Results/RQ2")
BASE_RESULTS.mkdir(parents=True, exist_ok=True)

def save_results(df, model_family, pollutant, horizon_label, fold=None):
    """Write one result table as CSV under ``BASE_RESULTS/<model_family>/``.

    The file is named ``<pollutant>_<horizon_label>.csv``. Callers that need
    the fold in the file name must encode it in ``pollutant`` (the training
    cells pass ``"<pollutant>_<fold>"``), because ``fold`` itself does not
    take part in the name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to persist; written without its index.
    model_family : str
        Sub-folder name (e.g. ``"ml"``, ``"dl"``, ``"neural"``).
    pollutant : str
        First part of the file name.
    horizon_label : str
        Second part of the file name.
    fold : optional
        Accepted for backward compatibility only; it never influenced the
        file that was written (the previous fold-aware name was computed and
        then discarded).

    Returns
    -------
    pathlib.Path
        Location of the CSV file that was written.
    """
    out_dir = BASE_RESULTS / model_family
    out_dir.mkdir(parents=True, exist_ok=True)

    out_path = out_dir / f"{pollutant}_{horizon_label}.csv"
    df.to_csv(out_path, index=False)
    return out_path

## Data

In [26]:
# ===============================
# DATA
# ===============================
# Load the pre-processed table from a path relative to the notebook folder.
# NOTE(review): the path uses Windows separators — confirm before running on
# another OS.
df = pd.read_parquet(r'..\Data\CAMS\processed\eac4_era5_2010_2024_brasil_enhanced.parquet')


# ===============================
# COASTAL ADJUSTMENT
# ===============================
# Distinct (unique_id, latitude, longitude, is_coastal) combinations found in `df`.
cells = df[['unique_id', 'latitude', 'longitude', 'is_coastal']].drop_duplicates()

# Split the cell table by the `is_coastal` flag; rows whose flag is neither
# True nor False (e.g. missing) end up in neither table.
coastal_cells = cells[cells['is_coastal'] == True]
interior_cells = cells[cells['is_coastal'] == False]

def min_distance_to_coast(row, coastal_coords):
    """Return the smallest straight-line distance from ``row`` to any coastal point.

    ``row`` must expose ``'latitude'`` and ``'longitude'`` by key;
    ``coastal_coords`` is a 2-D array whose column 0 is latitude and column 1
    is longitude. The distance is a plain Euclidean norm in the same units as
    the coordinates (no geodesic correction).
    """
    lat_gap = coastal_coords[:, 0] - row['latitude']
    lon_gap = coastal_coords[:, 1] - row['longitude']
    gaps = np.sqrt(lat_gap**2 + lon_gap**2)
    return gaps.min()

# (latitude, longitude) pairs of every coastal cell, as a 2-D array.
coastal_coords = coastal_cells[['latitude','longitude']].values

# `interior_cells` is a filtered slice of `cells`; assigning a column to it
# in place triggers pandas' SettingWithCopy warning. `assign` returns a new
# frame instead, so the distance column is attached without touching the slice.
interior_cells = interior_cells.assign(
    dist_to_coast=interior_cells.apply(
        lambda row: min_distance_to_coast(row, coastal_coords),
        axis=1
    )
)

# Minimum distance to the coast for an interior cell to be used for training.
# Same units as the latitude/longitude columns (presumably degrees — confirm).
buffer_threshold = 1.5

deep_interior = interior_cells[
    interior_cells['dist_to_coast'] > buffer_threshold
]

# Coastal fold: train on deep-interior cells, test on coastal cells; interior
# cells inside the buffer belong to neither side.
train_ids = deep_interior['unique_id']
test_ids = coastal_cells['unique_id']

df['train_coastal'] = df['unique_id'].isin(train_ids).astype(int)
df['test_coastal']  = df['unique_id'].isin(test_ids).astype(int)


# ===============================
# POLLUTANTS
# ===============================
# Columns kept in every per-pollutant frame.
_POLLUTANT_FRAME_COLS = [
    'unique_id', 'ds', 'y',
    'latitude', 'longitude', 'state',
    'name_region', 'test_coastal',
    'train_coastal',
]


def _pollutant_frame(pollutant_col):
    """Return a copy of ``df`` in long forecasting layout for one pollutant.

    ``pollutant_col`` is renamed to ``'y'`` and ``'valid_time'`` to ``'ds'``;
    only the columns in ``_POLLUTANT_FRAME_COLS`` are kept.
    """
    renamed = df.copy().rename(columns={
        pollutant_col: 'y',
        'valid_time': 'ds'
    })
    return renamed[_POLLUTANT_FRAME_COLS]


# Disabled pollutants are kept as comments so they can be switched back on.
# pm10 = _pollutant_frame('pm10')

pm2p5 = _pollutant_frame('pm2p5')

go3 = _pollutant_frame('go3')

# no2 = _pollutant_frame('no2')

In [27]:
def _build_folds(frame, scaler):
    """Build the fold dictionary of one pollutant.

    Returns a dict with the keys ``'sudeste'``, ``'sul'``, ``'centro_oeste'``
    and ``'coastal'`` (each holding ``'train_df'`` / ``'test_df'`` restricted
    to ``unique_id``, ``ds``, ``y``) followed by ``'scaler'``.

    Training frames only contain rows with ``ds <= '2022-12-31'``; test frames
    carry no date filter.
    """
    keep = ['unique_id', 'ds', 'y']
    region_folds = (
        ('sudeste', 'Sudeste'),
        ('sul', 'Sul'),
        ('centro_oeste', 'Centro Oeste'),
    )

    folds = {}

    # Leave-region-out folds: train on every other region, test on the region itself.
    for fold_key, region in region_folds:
        folds[fold_key] = {
            'train_df': frame.query(f"name_region != '{region}' and ds <= '2022-12-31'")[keep],
            'test_df': frame.query(f"name_region == '{region}'")[keep],
        }

    # Coastal fold: driven by the train_coastal / test_coastal flag columns.
    folds['coastal'] = {
        'train_df': frame.query("train_coastal == 1 and ds <= '2022-12-31'")[keep],
        'test_df': frame.query("test_coastal == 1")[keep],
    }

    # Multiplicative factor applied to `y` by the neural / deep-learning cells.
    folds['scaler'] = scaler
    return folds


pollutant_dict = {
    'go3': _build_folds(go3, scaler=1e8),
    # Disabled pollutants (re-enable together with their frames above):
    # 'no2': _build_folds(no2, scaler=1e10),
    # 'pm10': _build_folds(pm10, scaler=1e9),
    'pm2p5': _build_folds(pm2p5, scaler=1e9),
}

### Setup

In [28]:
# ===============================
# CONFIG
# ===============================
# 3-hourly series -> 8 observations per day.
steps_per_day = 8
two_year_steps = steps_per_day * 365 * 2
target_windows = 30

FREQ = '3h'
SEASON_LENGTH = 8

# One entry per forecasting horizon. `horizon` and `step_size` are expressed
# in time steps, `windows` is the number of evaluation windows.
experiments_dict = {}

# Disabled horizons, kept for reference:
#   '1 days':  horizon=8*1,  step_size=max(8*1,  two_year_steps // target_windows), windows=target_windows
#   '14 days': horizon=8*14, step_size=max(8*14, two_year_steps // target_windows), windows=target_windows

# 7-day horizon: stride is at least one horizon, otherwise the two-year span
# divided evenly across the target number of windows.
experiments_dict['7 days'] = dict(
    horizon=steps_per_day * 7,
    step_size=max(steps_per_day * 7, two_year_steps // target_windows),
    windows=target_windows,
)

# 30-day horizon: back-to-back windows (stride == horizon), as many as fit
# into two years (24).
experiments_dict['30 days'] = dict(
    horizon=steps_per_day * 30,
    step_size=steps_per_day * 30,
    windows=two_year_steps // (steps_per_day * 30),
)

# **Models**

### Neural Forecasters

In [None]:
# N-BEATS / N-HiTS evaluation on held-out cells.
# For every pollutant, fold and horizon: fit on the fold's training cells only,
# then forecast the held-out cells from each cutoff using their own history.
#
# The loop variable is called `pollutant_cfg` on purpose: naming it
# `pollutant_dict` would overwrite the global `pollutant_dict` with one of its
# own inner dictionaries and break every later cell that iterates over it.
for pollutant_name, pollutant_cfg in pollutant_dict.items():

    scaler = pollutant_cfg['scaler']

    for fold_name, fold_data in pollutant_cfg.items():

        # 'scaler' shares the dictionary with the folds but is not a fold.
        if fold_name == 'scaler':
            continue

        train_df = fold_data['train_df'].copy()
        test_df  = fold_data['test_df'].copy()

        # Scale the target
        train_df['y'] *= scaler
        test_df['y']  *= scaler

        # Sort by series and time
        train_df = train_df.sort_values(['unique_id','ds'])
        test_df  = test_df.sort_values(['unique_id','ds'])

        for horizon_label, exp_cfg in experiments_dict.items():

            try:

                print(f"Running RQ2 | {pollutant_name} | {fold_name} | {horizon_label}")

                # -------------------------------
                # MODELS
                # -------------------------------
                # NOTE(review): NBEATS-I is the only model without
                # `logger=False` — confirm whether that is intentional.
                models = [
                    NBEATS(
                        h=exp_cfg['horizon'],
                        input_size=14*8,
                        stack_types=["identity", "trend", "seasonality"],
                        n_blocks=[1, 1, 1],
                        mlp_units=3 * [[256, 256]],
                        basis='polynomial',
                        n_basis=2,
                        n_harmonics=2,
                        shared_weights=True,
                        activation='ReLU',
                        max_steps=1000,
                        learning_rate=1e-3,
                        batch_size=32,
                        windows_batch_size=1024,
                        random_seed=SEED,
                        alias='NBEATS-I',
                        loss=MAE(),
                    ),
                    NBEATS(
                        h=exp_cfg['horizon'],
                        input_size=14*8,
                        stack_types=['identity'] * 3,
                        n_blocks=[2, 2, 2],
                        mlp_units=3 * [[256, 256]],
                        shared_weights=False,
                        activation='ReLU',
                        max_steps=1000,
                        learning_rate=1e-3,
                        batch_size=32,
                        windows_batch_size=1024,
                        random_seed=SEED,
                        alias='NBEATS-G',
                        logger=False,
                        loss=MAE(),
                    ),
                    NHITS(
                        h=exp_cfg['horizon'],
                        input_size=14*8,
                        n_blocks=[1, 1, 1],
                        mlp_units=3 * [[256, 256]],
                        n_pool_kernel_size=[2, 2, 1],
                        n_freq_downsample=[4, 2, 1],
                        activation='ReLU',
                        dropout_prob_theta=0.1,
                        max_steps=1000,
                        learning_rate=1e-3,
                        batch_size=32,
                        windows_batch_size=1024,
                        random_seed=SEED,
                        alias='NHITS',
                        logger=False,
                        loss=MAE(),
                    ),
                ]

                nf = NeuralForecast(models=models, freq=FREQ)

                # -------------------------
                # FIT (TRAINING DATA ONLY)
                # -------------------------
                nf.fit(train_df)

                # -------------------------
                # BUILD CUTOFFS
                # -------------------------
                h = exp_cfg['horizon']
                step_size = exp_cfg['step_size']
                n_windows = exp_cfg['windows']

                start_test = pd.Timestamp('2023-01-01')
                end_test   = test_df['ds'].max()

                # The cutoffs are read from the RQ1 go3 ML results for every
                # pollutant. NOTE(review): presumably the RQ1 cutoffs are the
                # same for all pollutants — confirm.
                # The label is cleaned outside the f-string: reusing the same
                # quote type inside an f-string only parses on Python >= 3.12.
                horizon_clean = horizon_label.replace(' ', '')
                rq1_cutoffs = pd.read_csv(
                    rf'.\Results\RQ1\ml\go3_{horizon_clean}.csv'
                )['cutoff'].unique()

                rq1_cutoffs = pd.to_datetime(rq1_cutoffs)
                rq1_cutoffs = sorted(rq1_cutoffs)

                cutoffs = rq1_cutoffs[:n_windows]

                results_list = []

                # Loop-invariant: one time step expressed as a pandas offset.
                freq_offset = pd.tseries.frequencies.to_offset(FREQ)

                # -------------------------
                # CUTOFF LOOP
                # -------------------------
                for cutoff in cutoffs:

                    # History of the held-out cells up to and including the cutoff.
                    hist_test = test_df[test_df['ds'] <= cutoff]

                    preds = nf.predict(
                        h=h,
                        df=hist_test
                    )

                    # Forecast window: the h steps that follow the cutoff.
                    start_forecast = cutoff + freq_offset
                    end_forecast = cutoff + h * freq_offset

                    preds = preds[
                        (preds['ds'] >= start_forecast) &
                        (preds['ds'] <= end_forecast)
                    ]

                    real = test_df[
                        (test_df['ds'] >= start_forecast) &
                        (test_df['ds'] <= end_forecast)
                    ]

                    # Left join keeps every prediction even when no
                    # observation exists for that timestamp.
                    merged = preds.merge(real, on=['unique_id','ds'], how='left')
                    merged['cutoff'] = cutoff
                    merged['fold'] = fold_name

                    results_list.append(merged)

                results_ = pd.concat(results_list)

                # ---------------------------------------
                # Undo the scaling for the target and every model column
                # ---------------------------------------
                non_model_cols = ['unique_id', 'ds', 'y', 'cutoff', 'fold']
                model_cols = [col for col in results_.columns if col not in non_model_cols]
                results_['y'] /= scaler

                for col in model_cols:
                    results_[col] /= scaler

                # The fold is encoded in `pollutant` because save_results
                # names the file from `pollutant` and `horizon_label` only.
                save_results(
                    results_,
                    model_family="neural",
                    pollutant=pollutant_name + f'_{fold_name}',
                    horizon_label=horizon_clean,
                    fold=fold_name
                )

                clear_output(wait = True)

            except Exception as e:
                # Report which combination failed, then abort the run.
                print(f"Erro em {pollutant_name} | {fold_name} | {horizon_label}")
                print(e)
                raise

### Deep Learners

In [None]:
# LSTM / GRU evaluation on held-out cells.
# For every pollutant, fold and horizon: fit on the fold's training cells only,
# then forecast the held-out cells from each cutoff using their own history.
#
# The loop variable is called `pollutant_cfg` on purpose: naming it
# `pollutant_dict` would overwrite the global `pollutant_dict` with one of its
# own inner dictionaries and break every later cell that iterates over it.

# Combinations that raised; reported once all runs are done because
# clear_output() would wipe a message printed in the middle of the loop.
failed_runs = []

for pollutant_name, pollutant_cfg in pollutant_dict.items():

    scaler = pollutant_cfg['scaler']

    for fold_name, fold_data in pollutant_cfg.items():

        # 'scaler' shares the dictionary with the folds but is not a fold.
        if fold_name == 'scaler':
            continue

        train_df = fold_data['train_df'].copy()
        test_df  = fold_data['test_df'].copy()

        # Scale the target
        train_df['y'] *= scaler
        test_df['y']  *= scaler

        # Sort by series and time
        train_df = train_df.sort_values(['unique_id','ds'])
        test_df  = test_df.sort_values(['unique_id','ds'])

        for horizon_label, exp_cfg in experiments_dict.items():

            try:

                print(f"Running RQ2 | {pollutant_name} | {fold_name} | {horizon_label}")

                # -------------------------------
                # MODELS
                # -------------------------------
                models = [
                    LSTM(
                        h=exp_cfg['horizon'],
                        input_size=14*8,
                        max_steps=1000,  # maximum training steps
                        learning_rate=1e-3,  # learning rate
                        batch_size=32,
                        windows_batch_size=1024,
                        random_seed=SEED,
                        alias='LSTM',  # column name of the forecasts
                        loss=MAE(),  # loss function
                        logger=False,
                    ),
                    GRU(
                        h=exp_cfg['horizon'],
                        input_size=14*8,
                        max_steps=1000,  # maximum training steps
                        learning_rate=1e-3,  # learning rate
                        batch_size=32,
                        windows_batch_size=1024,
                        random_seed=SEED,
                        alias='GRU',  # column name of the forecasts
                        loss=MAE(),  # loss function
                        logger=False,
                    ),
                    # Informer(
                    #     h=exp_cfg['horizon'],
                    #     input_size=14*8,
                    #     max_steps=1500,  # maximum training steps
                    #     learning_rate=1e-3,  # learning rate
                    #     batch_size=32,
                    #     windows_batch_size=1024,
                    #     random_seed=SEED,
                    #     alias='Informer',
                    #     loss=MAE(),  # loss function
                    #     logger=False,
                    # )
                ]

                nf = NeuralForecast(models=models, freq=FREQ)

                # -------------------------
                # FIT (TRAINING DATA ONLY)
                # -------------------------
                nf.fit(train_df)

                # -------------------------
                # BUILD CUTOFFS
                # -------------------------
                h = exp_cfg['horizon']
                step_size = exp_cfg['step_size']
                n_windows = exp_cfg['windows']

                start_test = pd.Timestamp('2023-01-01')
                end_test   = test_df['ds'].max()

                # The cutoffs are read from the RQ1 go3 ML results for every
                # pollutant. NOTE(review): presumably the RQ1 cutoffs are the
                # same for all pollutants — confirm.
                # The label is cleaned outside the f-string: reusing the same
                # quote type inside an f-string only parses on Python >= 3.12.
                horizon_clean = horizon_label.replace(' ', '')
                rq1_cutoffs = pd.read_csv(
                    rf'.\Results\RQ1\ml\go3_{horizon_clean}.csv'
                )['cutoff'].unique()

                rq1_cutoffs = pd.to_datetime(rq1_cutoffs)
                rq1_cutoffs = sorted(rq1_cutoffs)

                cutoffs = rq1_cutoffs[:n_windows]

                results_list = []

                # Loop-invariant: one time step expressed as a pandas offset.
                freq_offset = pd.tseries.frequencies.to_offset(FREQ)

                # -------------------------
                # CUTOFF LOOP
                # -------------------------
                for cutoff in cutoffs:

                    # History of the held-out cells up to and including the cutoff.
                    hist_test = test_df[test_df['ds'] <= cutoff]

                    preds = nf.predict(
                        h=h,
                        df=hist_test
                    )

                    # Forecast window: the h steps that follow the cutoff.
                    start_forecast = cutoff + freq_offset
                    end_forecast = cutoff + h * freq_offset

                    preds = preds[
                        (preds['ds'] >= start_forecast) &
                        (preds['ds'] <= end_forecast)
                    ]

                    real = test_df[
                        (test_df['ds'] >= start_forecast) &
                        (test_df['ds'] <= end_forecast)
                    ]

                    # Left join keeps every prediction even when no
                    # observation exists for that timestamp.
                    merged = preds.merge(real, on=['unique_id','ds'], how='left')
                    merged['cutoff'] = cutoff
                    merged['fold'] = fold_name

                    results_list.append(merged)

                results_ = pd.concat(results_list)

                # ---------------------------------------
                # Undo the scaling for the target and every model column
                # ---------------------------------------
                non_model_cols = ['unique_id', 'ds', 'y', 'cutoff', 'fold']
                model_cols = [col for col in results_.columns if col not in non_model_cols]
                results_['y'] /= scaler

                for col in model_cols:
                    results_[col] /= scaler

                # The fold is encoded in `pollutant` because save_results
                # names the file from `pollutant` and `horizon_label` only.
                save_results(
                    results_,
                    model_family="dl",
                    pollutant=pollutant_name + f'_{fold_name}',
                    horizon_label=horizon_clean,
                    fold=fold_name
                )

                clear_output(wait = True)

            except Exception as e:
                # Best effort: keep going with the remaining combinations, but
                # remember the failure instead of discarding it silently.
                failed_runs.append(
                    (pollutant_name, fold_name, horizon_label, repr(e))
                )
                continue

# Summary of everything that was skipped because of an exception.
for failed_pollutant, failed_fold, failed_horizon, failed_error in failed_runs:
    print(f"Erro em {failed_pollutant} | {failed_fold} | {failed_horizon}: {failed_error}")

### Machine Learners

In [None]:
# RandomForest / XGBoost / LightGBM evaluation on held-out cells.
# For every pollutant, fold and horizon: fit on the fold's training cells only,
# then forecast the held-out cells from each cutoff using their own history.
#
# The loop variable is called `pollutant_cfg` on purpose: naming it
# `pollutant_dict` would overwrite the global `pollutant_dict` with one of its
# own inner dictionaries and break every later cell that iterates over it.
for pollutant_name, pollutant_cfg in pollutant_dict.items():

    # Read for parity with the other training cells; the target is NOT scaled
    # in this cell, so predictions are saved without any rescaling.
    scaler = pollutant_cfg['scaler']

    for fold_name, fold_data in pollutant_cfg.items():

        # 'scaler' shares the dictionary with the folds but is not a fold.
        if fold_name == 'scaler':
            continue

        train_df = fold_data['train_df'].copy()
        test_df  = fold_data['test_df'].copy()

        # Sort by series and time
        train_df = train_df.sort_values(['unique_id','ds'])
        test_df  = test_df.sort_values(['unique_id','ds'])

        for horizon_label, exp_cfg in experiments_dict.items():

            # Skips go3/sudeste (every horizon) and go3/sul at '7 days'.
            # NOTE(review): presumably these were already produced by an
            # earlier, interrupted run — confirm before removing the guard.
            if (
                (pollutant_name == 'go3') and (fold_name == 'sudeste') 
            ) or (
                (pollutant_name == 'go3') and (fold_name == 'sul') and (horizon_label == '7 days')
            ):
                continue

            print(f"Running RQ2 | {pollutant_name} | {fold_name} | {horizon_label}")

            # -------------------------------
            # MODEL 
            # -------------------------------
            lgb_model = lgb.LGBMRegressor(
                n_estimators=500,
                learning_rate=0.05,
                num_leaves=31,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=SEED,
                n_jobs=3,
            )
            rf_model = RandomForestRegressor(
                n_estimators=500,
                max_depth=10,
                random_state=SEED,
                n_jobs=3,
            )
            xgb_model = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=SEED,
                n_jobs=3,
            )

            # -------------------------------
            # MLForecast (lags + calendar)
            # -------------------------------
            mlf = MLForecast(
                models={
                    'RandomForest': rf_model,
                    'XGBoost': xgb_model,
                    'LightGBM': lgb_model,
                },
                freq=FREQ,
                lags=[1,2,4,8,16,24,56,112],
                lag_transforms={
                    8:  [RollingMean(8), ExpandingMean()],
                    56: [RollingMean(56)],
                },
                date_features=['hour','dayofweek','month','dayofyear'],
            )

            # -------------------------
            # FIT (TRAINING DATA ONLY)
            # -------------------------
            mlf.fit(train_df)

            # -------------------------
            # BUILD CUTOFFS
            # -------------------------
            h = exp_cfg['horizon']
            step_size = exp_cfg['step_size']
            n_windows = exp_cfg['windows']

            start_test = pd.Timestamp('2023-01-01')
            end_test   = test_df['ds'].max()

            # The cutoffs are read from the RQ1 go3 ML results for every
            # pollutant. NOTE(review): presumably the RQ1 cutoffs are the
            # same for all pollutants — confirm.
            # The label is cleaned outside the f-string: reusing the same
            # quote type inside an f-string only parses on Python >= 3.12.
            horizon_clean = horizon_label.replace(' ', '')
            rq1_cutoffs = pd.read_csv(
                rf'.\Results\RQ1\ml\go3_{horizon_clean}.csv'
            )['cutoff'].unique()

            rq1_cutoffs = pd.to_datetime(rq1_cutoffs)
            rq1_cutoffs = sorted(rq1_cutoffs)

            cutoffs = rq1_cutoffs[:n_windows]

            results_list = []

            # Loop-invariant: one time step expressed as a pandas offset.
            freq_offset = pd.tseries.frequencies.to_offset(FREQ)

            # -------------------------
            # CUTOFF LOOP
            # -------------------------
            for cutoff in cutoffs:

                # History of the held-out cells up to and including the cutoff.
                hist_test = test_df[test_df['ds'] <= cutoff]

                preds = mlf.predict(
                    h=h,
                    new_df=hist_test
                )

                # Forecast window: the h steps that follow the cutoff.
                start_forecast = cutoff + freq_offset
                end_forecast = cutoff + h * freq_offset

                preds = preds[
                    (preds['ds'] >= start_forecast) &
                    (preds['ds'] <= end_forecast)
                ]

                real = test_df[
                    (test_df['ds'] >= start_forecast) &
                    (test_df['ds'] <= end_forecast)
                ]

                # Left join keeps every prediction even when no observation
                # exists for that timestamp.
                merged = preds.merge(real, on=['unique_id','ds'], how='left')
                merged['cutoff'] = cutoff
                merged['fold'] = fold_name

                results_list.append(merged)

            results_ml = pd.concat(results_list)

            # The fold is encoded in `pollutant` because save_results names
            # the file from `pollutant` and `horizon_label` only.
            save_results(
                results_ml,
                model_family="ml",
                pollutant=pollutant_name + f'_{fold_name}',
                horizon_label=horizon_clean,
                fold=fold_name
            )

            clear_output(wait = True)
            # Release the large objects before the next combination is fitted
            # (xgb_model included: it was previously left alive).
            del mlf
            del lgb_model
            del rf_model
            del xgb_model
            del results_ml
            del results_list

            gc.collect()

Running RQ2 | go3 | centro_oeste | 7 days


## Merge

In [None]:
BASE_RESULTS = Path("Results/RQ2")
FULL_DIR = BASE_RESULTS / "full"
FULL_DIR.mkdir(parents=True, exist_ok=True)

# Model families whose per-family CSVs are combined, in output column order.
families = ["ml", "dl", "neural"]

def build_full_results(pollutant, fold, horizon_label):
    """Combine the per-family result files of one experiment into one table.

    Reads ``BASE_RESULTS/<family>/<pollutant>_<fold>_<horizon_label>.csv`` for
    every entry of ``families`` (missing files are reported and skipped) and
    places the model columns of all families side by side.

    When every family holds exactly the same (unique_id, ds, cutoff, fold)
    rows, the tables are glued positionally after sorting. When the row sets
    differ, a positional glue would attach predictions to the wrong rows, so
    the families are joined on those key columns instead (outer join: no row
    is lost, absent predictions become NaN).

    Parameters
    ----------
    pollutant, fold, horizon_label : str
        Components of the file name to look up in every family folder.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``unique_id, ds, cutoff, y, fold`` followed by the model
        columns of each family; ``None`` when no family file exists.
    """
    key_cols = ["unique_id", "ds", "cutoff", "fold"]
    lead_cols = ["unique_id", "ds", "cutoff", "y", "fold"]

    frames = []

    for family in families:
        fpath = BASE_RESULTS / family / f"{pollutant}_{fold}_{horizon_label}.csv"

        if not fpath.exists():
            print(f"‚ö†Ô∏è Missing: {fpath}")
            continue

        frame = pd.read_csv(fpath)

        # Timing information is not a model prediction.
        if "fit_time_seconds" in frame.columns:
            frame = frame.drop(columns=["fit_time_seconds"])

        # Same ordering for every family so rows can be compared and aligned.
        frames.append(frame.sort_values(key_cols).reset_index(drop=True))

    if not frames:
        return None

    first = frames[0]
    same_rows = all(
        len(other) == len(first) and other[key_cols].equals(first[key_cols])
        for other in frames[1:]
    )

    if same_rows:
        # Identical keys everywhere: positional concatenation is safe.
        pieces = [first[lead_cols]]
        pieces.extend(
            frame[[c for c in frame.columns if c not in lead_cols]]
            for frame in frames
        )
        return pd.concat(pieces, axis=1)

    # Row sets differ: align on the key columns instead of on row position.
    combined = first
    for other in frames[1:]:
        combined = combined.merge(
            other, on=key_cols, how="outer", suffixes=("", "__other")
        )
        # Keep a single observed-value column, filled from whichever family has it.
        if "y__other" in combined.columns:
            combined["y"] = combined["y"].fillna(combined.pop("y__other"))

    model_cols = [c for c in combined.columns if c not in lead_cols]
    return (
        combined[lead_cols + model_cols]
        .sort_values(key_cols)
        .reset_index(drop=True)
    )


# Build and store the combined table for every pollutant / fold / horizon.
# The loop variable is called `pollutant_cfg` on purpose: naming it
# `pollutant_dict` would overwrite the global `pollutant_dict` with one of its
# own inner dictionaries and break every later cell that reads it.
for pollutant_name, pollutant_cfg in pollutant_dict.items():

    for fold_name in pollutant_cfg:

        # 'scaler' shares the dictionary with the folds but is not a fold.
        if fold_name == "scaler":
            continue

        for horizon_label in experiments_dict:

            # File names use the horizon label without spaces.
            horizon_clean = horizon_label.replace(" ", "")

            print(f"Building FULL | {pollutant_name} | {fold_name} | {horizon_clean}")

            df_full = build_full_results(
                pollutant=pollutant_name,
                fold=fold_name,
                horizon_label=horizon_clean
            )

            # No family produced a file for this combination.
            if df_full is None:
                continue

            df_full.to_csv(
                FULL_DIR / f"{pollutant_name}_{fold_name}_{horizon_clean}.csv",
                index=False
            )

## **Estatísticas**

In [None]:
# ==================================================
# METRICS
# ==================================================

def mae(y, yhat):
    """Mean absolute error between observations ``y`` and forecasts ``yhat``."""
    abs_errors = np.abs(y - yhat)
    return np.mean(abs_errors)

def mse(y, yhat):
    """Mean squared error between observations ``y`` and forecasts ``yhat``."""
    squared_errors = (y - yhat) ** 2
    return np.mean(squared_errors)

def rmse(y, yhat):
    """Root mean squared error: the square root of :func:`mse`."""
    mean_squared = mse(y, yhat)
    return np.sqrt(mean_squared)

def smape(y, yhat):
    """Symmetric MAPE as a fraction (not multiplied by 100).

    Points where ``(|y| + |yhat|) / 2`` equals zero are left out; returns NaN
    when no point remains.
    """
    half_sum = (np.abs(y) + np.abs(yhat)) / 2
    usable = half_sum != 0
    if not usable.any():
        return np.nan
    ratios = np.abs(y[usable] - yhat[usable]) / half_sum[usable]
    return np.mean(ratios)

def mae_conditional(y, yhat, threshold):
    """Mean absolute error restricted to points where ``y >= threshold``.

    Returns NaN when no observation reaches the threshold.
    """
    selected = y >= threshold
    if not selected.any():
        return np.nan
    abs_errors = np.abs(y[selected] - yhat[selected])
    return np.mean(abs_errors)

def bias_conditional(y, yhat, threshold):
    """Mean signed error (``yhat - y``) restricted to points where ``y >= threshold``.

    Negative values mean the forecasts are below the observations on those
    points. Returns NaN when no observation reaches the threshold.
    """
    selected = y >= threshold
    if not selected.any():
        return np.nan
    signed_errors = yhat[selected] - y[selected]
    return np.mean(signed_errors)

def skill_score(model_err, baseline_err):
    """Relative improvement over a baseline: ``1 - model_err / baseline_err``.

    Returns NaN when the baseline error is zero or NaN.
    """
    baseline_usable = not (baseline_err == 0 or np.isnan(baseline_err))
    if baseline_usable:
        return 1 - model_err / baseline_err
    return np.nan

def extreme_event_metrics(y, yhat, threshold):
    """Return ``(precision, recall, f1)`` for detecting values ``>= threshold``.

    An observed event is ``y >= threshold`` and a predicted event is
    ``yhat >= threshold``.

    * precision is NaN when no event was predicted;
    * recall is NaN when no event was observed;
    * f1 is NaN when either of the two is NaN, 0.0 when both are defined
      but there is no true positive, and their harmonic mean otherwise.
    """
    y_event = y >= threshold
    yhat_event = yhat >= threshold

    tp = np.sum(y_event & yhat_event)
    fp = np.sum(~y_event & yhat_event)
    fn = np.sum(y_event & ~yhat_event)

    precision = tp / (tp + fp) if (tp + fp) > 0 else np.nan
    recall = tp / (tp + fn) if (tp + fn) > 0 else np.nan

    if np.isnan(precision) or np.isnan(recall):
        f1 = np.nan
    elif tp == 0:
        # Events were both observed and predicted but never matched: this is
        # a genuine F1 of 0, not a missing value, so it must not be dropped
        # by NaN-skipping aggregations.
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1


# ==================================================
# CONFIG
# ==================================================

# Directories holding the per-(pollutant, fold, horizon) prediction tables.
RQ2_BASE = Path("Results/RQ2/full")
RQ1_BASE = Path("Results/RQ1/full")

# Column used as the reference forecast for the skill scores.
baseline_name = "Naive"
WINDOW_P95 = 365 * 8  # one year of history at a 3-hour step (8 samples per day)

# One dict per (file, unique_id, cutoff, model); turned into a DataFrame at the end.
records = []


# ==================================================
# LOOP RQ2 FILES
# ==================================================

for file in RQ2_BASE.glob("*.csv"):

    # File stems look like "go3_sudeste_1days": the first token is the
    # pollutant, the last one the horizon, everything in between the fold.
    parts = file.stem.split("_")

    pollutant = parts[0]
    horizon = parts[-1]
    fold = "_".join(parts[1:-1])

    print(f"Evaluating | {pollutant} | {fold} | {horizon}")

    df_pred = pd.read_csv(file)
    df_pred["ds"] = pd.to_datetime(df_pred["ds"])
    df_pred["cutoff"] = pd.to_datetime(df_pred["cutoff"])

    # --------------------------------------
    # GROUND TRUTH (taken from pollutant_dict)
    # --------------------------------------
    test_df = pollutant_dict[pollutant][fold]["test_df"].copy()
    test_df["ds"] = pd.to_datetime(test_df["ds"])

    # Split the observed history per series once per file instead of
    # re-filtering the whole frame for every (unique_id, cutoff) group.
    # Sorting by time first guarantees that the trailing WINDOW_P95 rows
    # taken below really are the most recent ones.
    history_by_id = {
        series_id: series_df
        for series_id, series_df in test_df.sort_values("ds").groupby("unique_id")
    }

    # --------------------------------------
    # LOAD RQ1 (may not exist for some IDs)
    # --------------------------------------
    rq1_path = RQ1_BASE / f"{pollutant}_{horizon}.csv"

    if rq1_path.exists():
        df_rq1 = pd.read_csv(rq1_path)
        df_rq1["ds"] = pd.to_datetime(df_rq1["ds"])
        df_rq1["cutoff"] = pd.to_datetime(df_rq1["cutoff"])

        # Left join: every RQ2 row is kept; columns that also exist in the
        # RQ1 table come in with the "_RQ1" suffix and are NaN where RQ1 has
        # no matching row.
        df_pred = df_pred.merge(
            df_rq1,
            on=["unique_id", "ds", "cutoff"],
            suffixes=("", "_RQ1"),
            how="left"
        )

    # --------------------------------------
    # MODELS
    # --------------------------------------
    # Every float column that is neither an identifier/target column nor an
    # RQ1 counterpart is treated as a model forecast.
    model_cols = [
        c for c in df_pred.columns
        if (
            c not in ["unique_id", "ds", "cutoff", "y", "fold"]
            and not c.endswith("_RQ1")
            and df_pred[c].dtype in [np.float64, np.float32]
        )
    ]

    # --------------------------------------
    # LOOP OVER UNIQUE_ID + CUTOFF
    # --------------------------------------
    for (uid, cutoff), df_fold in df_pred.groupby(["unique_id", "cutoff"]):

        history = history_by_id.get(uid)
        if history is None:
            continue

        y_train_full = history.loc[history["ds"] <= cutoff, "y"].values

        # The extreme-event threshold needs a full window of history.
        if len(y_train_full) < WINDOW_P95:
            continue

        p95 = np.percentile(y_train_full[-WINDOW_P95:], 95)

        y_full = df_fold["y"].values
        y_base_full = df_fold[baseline_name].values

        for model in model_cols:

            yhat_full = df_fold[model].values

            # Score only the steps for which this model produced a forecast.
            mask_valid = ~np.isnan(yhat_full)
            if mask_valid.sum() == 0:
                continue

            y_valid = y_full[mask_valid]
            yhat = yhat_full[mask_valid]
            y_base_valid = y_base_full[mask_valid]

            # ================= RQ2 =================
            mae_model = mae(y_valid, yhat)
            rmse_model = rmse(y_valid, yhat)
            smape_model = smape(y_valid, yhat)
            mae_p95 = mae_conditional(y_valid, yhat, p95)

            mae_base = mae(y_valid, y_base_valid)
            rmse_base = rmse(y_valid, y_base_valid)
            smape_base = smape(y_valid, y_base_valid)
            mae_base_p95 = mae_conditional(y_valid, y_base_valid, p95)

            skill_mae = skill_score(mae_model, mae_base)
            skill_rmse = skill_score(rmse_model, rmse_base)
            skill_smape = skill_score(smape_model, smape_base)
            skill_p95 = skill_score(mae_p95, mae_base_p95)

            bias_p95 = bias_conditional(y_valid, yhat, p95)
            precision, recall, f1 = extreme_event_metrics(y_valid, yhat, p95)

            # ================= RQ1 =================
            # Default to NaN; overwritten only when an RQ1 counterpart with
            # at least one jointly valid step exists.
            mae_rq1 = rmse_rq1 = smape_rq1 = mae_p95_rq1 = np.nan
            skill_mae_vs_rq1 = skill_rmse_vs_rq1 = np.nan
            skill_smape_vs_rq1 = skill_p95_vs_rq1 = np.nan

            model_rq1_col = model + "_RQ1"

            if model_rq1_col in df_fold.columns:
                yhat_rq1 = df_fold[model_rq1_col].values
                mask_joint = mask_valid & ~np.isnan(yhat_rq1)

                if mask_joint.any():
                    y_joint = y_full[mask_joint]
                    yhat_joint = yhat_full[mask_joint]
                    yhat_rq1_joint = yhat_rq1[mask_joint]

                    mae_rq1 = mae(y_joint, yhat_rq1_joint)
                    rmse_rq1 = rmse(y_joint, yhat_rq1_joint)
                    smape_rq1 = smape(y_joint, yhat_rq1_joint)
                    mae_p95_rq1 = mae_conditional(y_joint, yhat_rq1_joint, p95)

                    # Both sides of the comparison are scored on the same
                    # (jointly valid) steps, so the skill is not distorted by
                    # steps that only one of the two models forecast.
                    skill_mae_vs_rq1 = skill_score(
                        mae(y_joint, yhat_joint), mae_rq1
                    )
                    skill_rmse_vs_rq1 = skill_score(
                        rmse(y_joint, yhat_joint), rmse_rq1
                    )
                    skill_smape_vs_rq1 = skill_score(
                        smape(y_joint, yhat_joint), smape_rq1
                    )
                    skill_p95_vs_rq1 = skill_score(
                        mae_conditional(y_joint, yhat_joint, p95), mae_p95_rq1
                    )

            # ================= SAVE =================
            records.append({
                "pollutant": pollutant,
                "fold": fold,
                "horizon": horizon,
                "unique_id": uid,
                "cutoff": cutoff,
                "model": model,

                "MAE": mae_model,
                "RMSE": rmse_model,
                "sMAPE": smape_model,
                "MAE_p95": mae_p95,

                "Skill_MAE": skill_mae,
                "Skill_RMSE": skill_rmse,
                "Skill_sMAPE": skill_smape,
                "Skill_p95": skill_p95,

                "Bias_p95": bias_p95,
                "Precision_p95": precision,
                "Recall_p95": recall,
                "F1_p95": f1,

                "MAE_RQ1": mae_rq1,
                "RMSE_RQ1": rmse_rq1,
                "sMAPE_RQ1": smape_rq1,
                "MAE_p95_RQ1": mae_p95_rq1,

                "Skill_MAE_vs_RQ1": skill_mae_vs_rq1,
                "Skill_RMSE_vs_RQ1": skill_rmse_vs_rq1,
                "Skill_sMAPE_vs_RQ1": skill_smape_vs_rq1,
                "Skill_p95_vs_RQ1": skill_p95_vs_rq1,
            })


# ==================================================
# SAVE
# ==================================================

metrics_df = pd.DataFrame(records)
metrics_df.to_csv("Results/RQ2/metrics.csv", index=False)

print("Done.")