## **RQ2**

> _Um modelo treinado em múltiplas células espaciais generaliza melhor para células não vistas do que modelos treinados localmente?_

Para garantir rigor metodológico e evitar vazamentos espaciais, a generalização será avaliada por protocolos explicitamente blindados, incluindo:
* Split por blocos espaciais, evitando treino e teste em células geograficamente adjacentes;
* Dois cenários complementares:
    * Leave-region-out: regiões inteiras são excluídas do treino e usadas apenas para teste;
    * Leave-cell-out: células individuais não vistas são usadas para teste, respeitando separação espacial mínima.
* Esses protocolos permitem distinguir entre interpolação espacial e generalização genuína.

## Libraries

In [1]:
import os
# Set before any Nixtla package is imported (those imports live in the next cell).
# NOTE(review): presumably makes the Nixtla libraries return `unique_id` as a
# regular column instead of the index — confirm against the Nixtla docs.
os.environ['NIXTLA_ID_AS_COL'] = '1'

import optuna
import itertools
import shutil
import time
import functools
import gc
import requests

import pandas as pd
import numpy as np
# Seed NumPy's global RNG as early as possible.
np.random.seed(1)

import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots
import plotly.io as pio
from graphmodex import plotlymodex
# Default Plotly renderer for every figure shown in this notebook.
pio.renderers.default = 'notebook'

import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib as mpl

import joblib
import pickle
from IPython.display import clear_output

In [2]:
import neuralforecast
import mlforecast
import statsforecast
import utilsforecast
import coreforecast

from statsforecast import StatsForecast
from statsforecast.models import (
    Naive, SeasonalNaive, 
    AutoARIMA, AutoCES, AutoETS, AutoTheta,
)

from mlforecast import MLForecast
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences

from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATS, NHITS
from neuralforecast.losses.pytorch import MSE, SMAPE, MAE

from mlforecast.utils import PredictionIntervals

from pytorch_lightning import Trainer
# Module-level Lightning trainer capped at 4 steps, with the logger,
# the progress bar and the model summary all switched off.
_trainer_options = dict(
    max_steps=4,
    logger=False,
    enable_progress_bar=False,
    enable_model_summary=False,
)
trainer = Trainer(**_trainer_options)

import warnings
import random
import torch

# Hide UserWarnings raised from the optuna package.
warnings.filterwarnings("ignore", category=UserWarning, module="optuna")


# ==================================================
# REPRODUCIBILITY
# ==================================================
SEED = 1

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed every random number generator used in the notebook with the same value:
# Python's `random`, NumPy's global RNG, torch (CPU) and torch (all CUDA devices).
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
💡 Tip: For seamless cloud logging and experiment tracking, try installing [litlogger](https://pypi.org/project/litlogger/) to enable LitLogger, which logs metrics and artifacts automatically to the Lightning Experiments platform.


### Results Storage

In [3]:
from pathlib import Path

# Root folder for every RQ2 artefact; created here if it does not exist yet.
BASE_RESULTS = Path("Results/RQ2")
BASE_RESULTS.mkdir(parents=True, exist_ok=True)


def save_results(df, model_family, pollutant, horizon_label):
    """Write `df` as a CSV file under the results folder.

    The file is stored at
    ``BASE_RESULTS/<model_family>/<pollutant>_<horizon_label>.csv``;
    the model-family folder is created on demand and the DataFrame index
    is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to persist.
    model_family : str
        Name of the sub-folder that groups results of one model family.
    pollutant : str
        First part of the file name.
    horizon_label : str
        Second part of the file name.
    """
    family_dir = BASE_RESULTS.joinpath(model_family)
    family_dir.mkdir(parents=True, exist_ok=True)

    csv_name = "{}_{}.csv".format(pollutant, horizon_label)
    df.to_csv(family_dir.joinpath(csv_name), index=False)

## Data

In [4]:
# ===============================
# DATA
# ===============================
# Forward-slash Path instead of a backslash raw string: the same relative
# location now resolves on Windows, Linux and macOS.
DATA_PATH = Path("../Data/CAMS/processed/eac4_era5_2010_2024_brasil_enhanced.parquet")
df = pd.read_parquet(DATA_PATH)

# Per-cell descriptor columns carried alongside every target series.
CELL_COLUMNS = [
    'latitude', 'longitude', 'state',
    'name_region', 'is_coastal',
    'urban_proxy_no2', 'industrial_proxy_so2',
]


def build_pollutant_frame(source, pollutant):
    """Return a long-format frame for one pollutant.

    The pollutant column is renamed to ``y`` and ``valid_time`` to ``ds``;
    the result keeps ``unique_id``, ``ds``, ``y`` and the columns listed in
    ``CELL_COLUMNS``, in that order.

    Parameters
    ----------
    source : pandas.DataFrame
        Wide table holding one column per pollutant.
    pollutant : str
        Name of the column to expose as the target ``y``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``source`` is left untouched.
    """
    # Selecting the needed columns first avoids duplicating the whole wide
    # table (the earlier `.copy()` was redundant: `rename` already returns a
    # new frame).
    return (
        source[['unique_id', 'valid_time', pollutant, *CELL_COLUMNS]]
        .rename(columns={pollutant: 'y', 'valid_time': 'ds'})
    )


pm10 = build_pollutant_frame(df, 'pm10')
pm2p5 = build_pollutant_frame(df, 'pm2p5')
go3 = build_pollutant_frame(df, 'go3')
no2 = build_pollutant_frame(df, 'no2')

In [5]:
# Leave-region-out splits: for each pollutant and each held-out region, the
# training frame is every row outside the region and the test frame is every
# row inside it.

# (frame, scaler) per pollutant. A single scaler per pollutant keeps every
# held-out region of the same pollutant on the same scale
# (pm2p5/centro_oeste previously carried 1e8 while the other two pm2p5
# regions used 1e9).
# NOTE(review): how the scaler is consumed is not visible in this section —
# confirm that 1e9 is the intended value for pm2p5.
_POLLUTANT_SOURCES = {
    'go3': (go3, 1e8),
    'no2': (no2, 1e9),
    'pm10': (pm10, 1e8),
    'pm2p5': (pm2p5, 1e9),
}

# Dictionary key -> value of `name_region` that is held out for testing.
_HELD_OUT_REGIONS = {
    'sudeste': 'Sudeste',
    'sul': 'Sul',
    'centro_oeste': 'Centro Oeste',
}

pollutants_global_dict = {
    pollutant: {
        region_key: {
            'train_df': frame[frame['name_region'] != region_name],
            'test_df': frame[frame['name_region'] == region_name],
            'scaler': scaler,
        }
        for region_key, region_name in _HELD_OUT_REGIONS.items()
    }
    for pollutant, (frame, scaler) in _POLLUTANT_SOURCES.items()
}

In [6]:
# One row per distinct combination of cell id, coordinates, state and proxy flags.
_meta_columns = [
    'unique_id', 'latitude', 'longitude', 'state',
    'urban_proxy_no2', 'industrial_proxy_so2', 'is_coastal',
]
cells_meta = df.loc[:, _meta_columns].drop_duplicates()

# Grid indices: rank of each coordinate among the sorted distinct values.
lat_vals = np.sort(cells_meta['latitude'].unique())
lon_vals = np.sort(cells_meta['longitude'].unique())

lat_to_i = dict(zip(lat_vals, range(len(lat_vals))))
lon_to_j = dict(zip(lon_vals, range(len(lon_vals))))

# `i` indexes latitude, `j` indexes longitude.
cells_meta = cells_meta.assign(
    i=cells_meta['latitude'].map(lat_to_i),
    j=cells_meta['longitude'].map(lon_to_j),
)

def spatial_buffer(cell, cells_meta, r=1):
    """List the ids of all cells inside the square grid window around `cell`.

    Parameters
    ----------
    cell : mapping or pandas.Series
        Must expose the grid indices under the keys ``'i'`` and ``'j'``.
    cells_meta : pandas.DataFrame
        Table with the columns ``'i'``, ``'j'`` and ``'unique_id'``.
    r : int, default 1
        Half-width of the window, in grid steps; both bounds are inclusive.

    Returns
    -------
    list
        ``unique_id`` values of every row whose ``i`` lies in ``[i-r, i+r]``
        and whose ``j`` lies in ``[j-r, j+r]`` (the centre cell is included
        when it is present in `cells_meta`).
    """
    centre_i = cell['i']
    centre_j = cell['j']

    rows = cells_meta['i']
    cols = cells_meta['j']
    in_rows = rows.ge(centre_i - r) & rows.le(centre_i + r)
    in_cols = cols.ge(centre_j - r) & cols.le(centre_j + r)

    return cells_meta.loc[in_rows & in_cols, 'unique_id'].tolist()

# Assign one category per cell. `np.select` keeps the first matching
# condition, so the precedence is coastal > industrial > urban; cells matching
# none of the three are labelled 'rural'.
_cell_type = np.select(
    [
        cells_meta.is_coastal,
        cells_meta.industrial_proxy_so2,
        cells_meta.urban_proxy_no2,
    ],
    ['coastal', 'industrial', 'urban'],
    default='rural',
)


def _sample_up_to_15(group):
    """Draw at most 15 rows from one category, always seeded with SEED."""
    n_rows = min(15, len(group))
    return group.sample(n=n_rows, random_state=SEED)


# Test cells for the leave-cell-out protocol: up to 15 per category.
TEST_CELLS = (
    cells_meta
    .assign(cell_type=_cell_type)
    .groupby('cell_type', group_keys=False)
    .apply(_sample_up_to_15)
)

# Leave-cell-out split: for every test cell, train on all cells outside its
# r=1 spatial buffer and test on the cell itself.
# NOTE(review): the split is hard-wired to `pm10`, and nothing is stored or
# evaluated inside the loop — after it finishes, `train_df` / `test_df` hold
# only the split of the last test cell.
_all_uids = set(cells_meta.unique_id)

for _row_label, test_cell in TEST_CELLS.iterrows():

    test_uid = test_cell.unique_id
    buffer_uids = spatial_buffer(test_cell, cells_meta, r=1)

    # The buffer contains the test cell and its grid neighbours.
    train_uids = _all_uids.difference(buffer_uids)

    train_df = pm10[pm10['unique_id'].isin(train_uids)]
    test_df = pm10[pm10['unique_id'] == test_uid]






In [7]:
# Per-cell metadata with short coordinate names (`lat` / `lon`).
# NOTE(review): this rebinds `cells_meta` from `df`, so the grid-index columns
# `i` / `j` added earlier are no longer present after this cell runs.
_plot_columns = [
    'unique_id', 'latitude', 'longitude', 'state',
    'urban_proxy_no2', 'industrial_proxy_so2', 'is_coastal',
]
cells_meta = (
    df.loc[:, _plot_columns]
    .drop_duplicates()
    .rename(columns={'latitude': 'lat', 'longitude': 'lon'})
)

In [8]:
# =========================
# STATE BOUNDARIES (GEOJSON)
# =========================
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson"
# Bounded wait plus an explicit HTTP status check: a network problem fails
# fast with a clear error instead of hanging the kernel or surfacing later as
# a JSON decoding failure.
response = requests.get(url, timeout=30)
response.raise_for_status()
states_geojson = response.json()

# =========================
# FIGURE
# =========================
fig = go.Figure()

# =========================
# STATES (OUTLINE)
# =========================
state_names = [feature['properties']['name'] for feature in states_geojson['features']]
fig.add_trace(
    go.Choropleth(
        geojson=states_geojson,
        locations=state_names,
        featureidkey="properties.name",
        z=[0] * len(state_names),  # dummy value: only the outline matters
        colorscale=[[0, 'white'], [1, 'white']],
        showscale=False,
        marker_line_color='black',
        marker_line_width=1.2,
        hoverinfo='skip',
        name='States'
    )
)

# =========================
# CELL SUBSETS
# =========================
industrial = cells_meta[cells_meta['industrial_proxy_so2']]
urban = cells_meta[cells_meta['urban_proxy_no2']]
coastal = cells_meta[cells_meta['is_coastal']]

# Cells that carry none of the three flags, so the 'Non-proxy cells' legend
# entry shows exactly what its name says.
has_proxy = (
    cells_meta['industrial_proxy_so2']
    | cells_meta['urban_proxy_no2']
    | cells_meta['is_coastal']
)
non_proxy = cells_meta[~has_proxy]

# =========================
# NON-PROXY (BACKGROUND)
# =========================
fig.add_trace(
    go.Scattergeo(
        lon=non_proxy['lon'],
        lat=non_proxy['lat'],
        mode='markers',
        marker=dict(
            size=6,
            color='rgba(100, 100, 100, 1)',
            opacity=0.4
        ),
        name='Non-proxy cells'
    )
)

# =========================
# PROXY LAYERS (INDUSTRIAL, URBAN, COASTAL)
# =========================
# Drawn in this order, so later layers sit on top of earlier ones.
proxy_layers = [
    (
        industrial,
        'Industrial proxy (SO₂ p75)',
        dict(size=9, color='#6f4e37', symbol='square',
             line=dict(color='black', width=0.7)),
    ),
    (
        urban,
        'Urban proxy (NO₂ p75)',
        dict(size=7, color='#36c064', symbol='circle',
             line=dict(color='black', width=0.6)),
    ),
    (
        coastal,
        'Coastal cells',
        dict(size=10, color='#6ec4cf', symbol='triangle-up',
             line=dict(color='black', width=0.8)),
    ),
]

for layer_cells, layer_name, layer_marker in proxy_layers:
    fig.add_trace(
        go.Scattergeo(
            lon=layer_cells['lon'],
            lat=layer_cells['lat'],
            mode='markers',
            marker=layer_marker,
            name=layer_name
        )
    )

# =========================
# LAYOUT
# =========================
fig.update_layout(
    width=800, height=800,
    title=dict(
        text='Spatial distribution of CAMS grid cells and proxy categories',
        x=0.5,
        font=dict(
            family='Times New Roman',
            size=18
        )
    ),
    legend=dict(
        x=0.86,          # close to the right edge
        y=0.02,          # close to the bottom edge
        xanchor='right',
        yanchor='bottom',
        bgcolor='white',
        bordercolor='black',
        borderwidth=1,
        font=dict(
            family='Times New Roman',
            size=12
        )
    ),
    geo=dict(
        scope='south america',
        projection_type='mercator',

        # COUNTRY / STATE OUTLINES
        showcountries=True,
        countrycolor='black',
        showsubunits=True,
        subunitcolor='black',
        subunitwidth=1,

        # BACKGROUND
        showland=True,
        landcolor='white',
        showocean=True,
        oceancolor='rgba(230,230,230,0.4)',

        # ZOOM ON THE STUDY DOMAIN
        lataxis=dict(range=[-35, -15]),
        lonaxis=dict(range=[-58, -38]),
        resolution=50
    ),
    margin=dict(l=20, r=20, t=60, b=20)
)

fig.show()

In [12]:
# Number of rows with a non-null `unique_id` per state, largest first.
(
    df
    .groupby('state')['unique_id']
    .count()
    .sort_values(ascending=False)
)

state
Minas Gerais          3988712
Mato Grosso Do Sul    2410760
Rio Grande Do Sul     1840944
São Paulo             1621784
Paraná                1446456
Santa Catarina         701312
Espírito Santo         350656
Rio De Janeiro         262992
Name: unique_id, dtype: int64

### Setup

In [9]:
# ===============================
# CONFIG
# ===============================
steps_per_day = 8
two_year_steps = 2 * 365 * steps_per_day
target_windows = 30

FREQ = '3h'
SEASON_LENGTH = 8

# Stride that spreads `target_windows` windows over two years of steps.
_spread_stride = two_year_steps // target_windows

# 1-, 7- and 14-day horizons: a fixed number of windows, spaced by the larger
# of the horizon itself and the spreading stride.
experiments_dict = {}
for _n_days in (1, 7, 14):
    _horizon = steps_per_day * _n_days
    experiments_dict[f'{_n_days} days'] = {
        'horizon': _horizon,
        'step_size': max(_horizon, _spread_stride),
        'windows': target_windows,
    }

# 30-day horizon: back-to-back windows (step equals horizon), as many as fit
# into two years of steps.
_monthly_horizon = steps_per_day * 30
experiments_dict['30 days'] = {
    'horizon': _monthly_horizon,
    'step_size': _monthly_horizon,
    'windows': two_year_steps // _monthly_horizon,  # 24
}

In [10]:
# Number of grid cells per state: first collapse to one row per
# (unique_id, state) pair, then count those rows within each state.
_rows_per_cell = (
    df
    .groupby(['unique_id', 'state'])['latitude'].count()
    .reset_index()
)
_cells_per_state = _rows_per_cell.groupby('state')['latitude'].count()

print(_cells_per_state)

state
Espírito Santo         8
Mato Grosso Do Sul    55
Minas Gerais          91
Paraná                33
Rio De Janeiro         6
Rio Grande Do Sul     42
Santa Catarina        16
São Paulo             37
Name: latitude, dtype: int64


In [11]:
# Number of cells per state whose proxy flag sums to a non-zero value.
proxy_ = 'is_coastal'

_flag_total_per_cell = (
    df
    .groupby(['unique_id', 'state'])[proxy_].sum()
    .reset_index()
)
_flagged_cells = _flag_total_per_cell[_flag_total_per_cell[proxy_] != 0]
_flagged_per_state = _flagged_cells.groupby('state')[proxy_].count()

print(proxy_, '\n\n', _flagged_per_state)

is_coastal 

 state
Espírito Santo       4
Paraná               2
Rio De Janeiro       4
Rio Grande Do Sul    4
Santa Catarina       4
São Paulo            4
Name: is_coastal, dtype: int64
