# **COBEQ 2025**

## Libs

In [1]:
import pandas as pd
import numpy as np
np.random.seed(1)
import optuna
import itertools
import shutil
from functools import partial

import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots
import time

from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATS, NHITS
import sklearn.metrics as metrics

# Load both datasets. The 'o3' column of the Ponte dos Remedios frame is
# dropped so that the 'o3' series used downstream is the one from Guarulhos.
ponte = pd.read_pickle(r'Data\Data_Ponte_dos_Remedios.pkl')
del ponte['o3']
guarulhos = pd.read_pickle(r'Data\Data_Guarulhos.pkl')
guarulhos = guarulhos[['date','o3']]

# Outer join on 'date' keeps every date that appears in either frame.
data = ponte.merge(guarulhos, on='date', how='outer')
# reset_index returns a new frame; the result has to be assigned to take effect.
data = data.reset_index(drop=True)

# joblib is used further down to dump and load the Optuna study objects.
import joblib
import pickle
# clear_output wipes the cell output between trials / studies.
from IPython.display import clear_output
import os
# NOTE(review): presumably makes the Nixtla libraries return the series id as a
# regular 'unique_id' column (later cells filter that column out) — confirm.
os.environ['NIXTLA_ID_AS_COL'] = '1'

from pytorch_lightning import Trainer
# Lightning trainer capped at 4 steps with logger, progress bar and model
# summary switched off. It is not referenced again in the visible cells.
trainer = Trainer(
    max_steps=4,
    logger=False,
    enable_progress_bar=False,
    enable_model_summary=False  # Disable model summary
)

import warnings
# Silence UserWarning messages originating from optuna modules.
warnings.filterwarnings("ignore", category=UserWarning, module="optuna")

# Project-local helper used by every modelling cell below.
from TimeObjectModule import TimeObject

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


## **Experimental Planning**

## **Optuna**

### N-HiTS

In [None]:
# Define the objective function
def objective(trial, pollutant, horizon):
    """Optuna objective: fit N-HiTS on weekly data and score it on hold-outs.

    A configuration is sampled from ``trial`` and one model is trained per
    hold-out offset in ``[1, 184, 366]``; each offset drops that many trailing
    rows of the module-level ``data`` before the ``TimeObject`` is built. The
    per-offset metrics are averaged and one summary record is appended to the
    module-level ``results`` list.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        pollutant: Name of the column of ``data`` to forecast.
        horizon: Forecast horizon; passed as ``h`` to the model and as
            ``split`` to ``TimeObject.NIXTLA_train_test``.

    Returns:
        Tuple ``(mean sMAPE, mean MAE)`` over the hold-out offsets.
    """
    # Hyperparameter search space
    input_size = trial.suggest_int('input_size', 4, 156, step=1)
    n_stacks = trial.suggest_int('n_stacks', 3, 7, step=1)
    n_blocks = trial.suggest_int('n_blocks', 1, 7, step=1)
    max_steps = trial.suggest_int('max_steps', 10, 700, step=1)
    local_scalar_type = trial.suggest_categorical('local_scalar_type', [None, 'standard', 'boxcox', 'minmax'])
    n_pool_kernel_size = trial.suggest_categorical(
        'n_pool_kernel_size',
        [list(combination) for combination in itertools.product([1, 2, 3], repeat=3)],
    )
    n_freq_downsample = trial.suggest_categorical(
        'n_freq_downsample',
        [list(combination) for combination in itertools.product([1, 4, 12, 52], repeat=3)],
    )

    # One list of per-split values for every metric exposed by TimeObject.
    # Kept in a dict so no local has to shadow the builtin ``max``.
    scores = {name: [] for name in ('mape', 'smape', 'max', 'mae', 'mse')}

    # Split for cross validation
    for split in [1, 184, 366]:
        print(f'\nPollutant = {pollutant} \nh = {horizon} \nTrial = {trial.number+1}\n')
        # Instantiate TimeObject and prepare training data
        obj = TimeObject(df=data[:-split], column=pollutant, agg_freq='W')
        obj.NIXTLA_train_test(split=horizon)

        # Define the model. The sampled triples only cover three stacks, so
        # they are padded with 1s up to ``n_stacks``: at the end for the
        # downsampling factors, at the front for the pooling kernels.
        model = NHITS(
            h=horizon,
            input_size=input_size,
            stack_types=n_stacks*['identity'],
            n_freq_downsample=n_freq_downsample+(n_stacks-len(n_freq_downsample))*[1],
            n_blocks=n_stacks*[n_blocks],
            n_pool_kernel_size=(n_stacks-len(n_pool_kernel_size))*[1]+n_pool_kernel_size,
            pooling_mode="MaxPool1d",
            activation="ReLU",
            interpolation_mode='linear',
            max_steps=max_steps,
            val_check_steps=10,
            early_stop_patience_steps=int(np.round(max_steps/(20),0)),
        )

        # Initialize NeuralForecast and fit the model
        fcst = NeuralForecast(
            models=[model],
            freq='W',
            local_scaler_type=local_scalar_type
        )
        fcst.fit(df=obj.Y_train, verbose=False, val_size=horizon+1)
        prediction = fcst.predict(df=obj.Y_train, verbose=False)

        # Evaluate metrics
        obj.metrics_(forecast_df=prediction, method='NHITS')
        for name, values in scores.items():
            values.append(obj.metrics[name])

        clear_output(wait=True)

    # Best-effort cleanup of the Lightning log directory. ignore_errors covers
    # a missing directory and OS-level failures without also swallowing
    # KeyboardInterrupt the way a bare ``except`` would.
    shutil.rmtree("lightning_logs", ignore_errors=True)

    averaged = {name: np.mean(values) for name, values in scores.items()}

    # Collect the results
    results.append({
        'poll': pollutant,
        'freq': 'W',
        # NOTE: the loop variable after the loop, i.e. the last hold-out offset.
        'split': split,
        'h': horizon,
        'input_size': input_size,
        'n_stacks': n_stacks,
        'n_blocks': n_blocks,
        'max_steps': max_steps,
        'local_scalar_type': local_scalar_type,
        'n_pool_kernel_size': n_pool_kernel_size,
        'n_freq_downsample': n_freq_downsample,
        **averaged,
    })

    # Two objective values are returned: mean sMAPE and mean MAE.
    return averaged['smape'], averaged['mae']

# Run one N-HiTS study per (pollutant, horizon) pair and persist its outcome.
# Iterating a DataFrame yields its column labels, so `pollutant` is 'pm10'.
for pollutant, h in itertools.product(data[['pm10']], [12, 26, 52, 78]):
    # `objective` appends one summary record per trial to this list.
    results = []

    # Two objective values (sMAPE, MAE), both minimised.
    study_nhits = optuna.create_study(directions=['minimize','minimize'])
    study_nhits.optimize(
        partial(objective, pollutant=pollutant, horizon=h),
        n_trials=300,
    )
    clear_output(wait=True)

    # Make sure the destination folder exists before anything is written.
    output_dir = fr'Results COBEQ\NHITS (W)\{pollutant}'
    os.makedirs(output_dir, exist_ok=True)

    # Per-trial summary table first, then the full study object.
    NHITS_W = pd.DataFrame(results)
    NHITS_W.to_pickle(fr'Results COBEQ\NHITS (W)\{pollutant}\{h}W_Df.pkl')
    joblib.dump(study_nhits, fr"Results COBEQ\NHITS (W)\{pollutant}\{h}W_Study.pkl")

[I 2025-03-25 10:13:56,508] Trial 299 finished with values: [21.40198, 5.75624] and parameters: {'input_size': 67, 'n_stacks': 3, 'n_blocks': 7, 'max_steps': 624, 'local_scalar_type': 'standard', 'n_pool_kernel_size': [1, 1, 1], 'n_freq_downsample': [12, 1, 12]}.


### NBEATS

In [None]:
# Define the objective function
def objective(trial, pollutant, horizon):
    """Optuna objective: fit N-BEATS on weekly data and score one hold-out.

    A configuration is sampled from ``trial`` and a model is trained for each
    hold-out offset in ``[1]`` (the offset is the number of trailing rows of
    the module-level ``data`` dropped before the ``TimeObject`` is built).
    The metrics are averaged over the offsets and one summary record is
    appended to the module-level ``results`` list.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        pollutant: Name of the column of ``data`` to forecast.
        horizon: Forecast horizon; passed as ``h`` to the model and as
            ``split`` to ``TimeObject.NIXTLA_train_test``.

    Returns:
        Tuple ``(mean sMAPE, mean MAE)`` over the hold-out offsets.
    """
    # Hyperparameter search space
    input_size = trial.suggest_int('input_size', 4, 156, step=1)
    n_stacks = trial.suggest_int('n_stacks', 2, 7, step=1)
    n_blocks = trial.suggest_int('n_blocks', 1, 5, step=1)
    max_steps = trial.suggest_int('max_steps', 10, 700, step=1)
    local_scalar_type = trial.suggest_categorical('local_scalar_type', [None, 'standard', 'boxcox', 'minmax'])
    interpretability = trial.suggest_categorical(
        'interpretability',
        [list(combination) for combination in itertools.product(['seasonality', 'trend', 'identity'], repeat=2)],
    )

    # One list of per-split values for every metric exposed by TimeObject.
    # Kept in a dict so no local has to shadow the builtin ``max``.
    scores = {name: [] for name in ('mape', 'smape', 'max', 'mae', 'mse')}

    # Split for cross validation
    for split in [1]:
        print(f'\nPollutant = {pollutant} \nh = {horizon} \nTrial = {trial.number+1}\n')
        # Instantiate TimeObject and prepare training data
        obj = TimeObject(df=data[:-split], column=pollutant, agg_freq='W')
        obj.NIXTLA_train_test(split=horizon)

        # Define the model. The sampled pair fixes the first two stack types;
        # any remaining stacks up to ``n_stacks`` are 'identity'.
        model = NBEATS(
            h=horizon,
            input_size=input_size,
            stack_types=interpretability+(n_stacks-len(interpretability))*['identity'],
            n_blocks=n_stacks * [n_blocks],
            max_steps=max_steps,
            learning_rate=1e-3,
            val_check_steps=10,
            early_stop_patience_steps=int(np.round(max_steps/(20),0)),
        )

        # Initialize NeuralForecast and fit the model
        fcst = NeuralForecast(
            models=[model],
            freq='W',
            local_scaler_type=local_scalar_type
        )
        fcst.fit(df=obj.Y_train, verbose=False, val_size=horizon+1)
        prediction = fcst.predict(df=obj.Y_train, verbose=False)

        # Evaluate metrics
        obj.metrics_(forecast_df=prediction, method='NBEATS')
        for name, values in scores.items():
            values.append(obj.metrics[name])

        clear_output(wait=True)

        # Best-effort cleanup of the Lightning log directory. ignore_errors
        # covers a missing directory and OS-level failures without also
        # swallowing KeyboardInterrupt the way a bare ``except`` would.
        shutil.rmtree("lightning_logs", ignore_errors=True)

    averaged = {name: np.mean(values) for name, values in scores.items()}

    # Collect the results
    results.append({
        'pollutant': pollutant,
        'freq': 'W',
        # NOTE: the loop variable after the loop, i.e. the last hold-out offset.
        'split': split,
        'h': horizon,
        'input_size': input_size,
        'n_stacks': n_stacks,
        'n_blocks': n_blocks,
        'max_steps': max_steps,
        'local_scalar_type': local_scalar_type,
        'interpretability': interpretability,
        **averaged,
    })

    # Two objective values are returned: mean sMAPE and mean MAE.
    return averaged['smape'], averaged['mae']

# Run one N-BEATS study per (pollutant, horizon) pair and persist its outcome.
# Iterating a DataFrame yields its column labels, so `pollutant` is 'pm10'.
for pollutant, h in itertools.product(data[['pm10']], [12, 26, 52, 78]):
    # `objective` appends one summary record per trial to this list.
    results = []

    # Two objective values (sMAPE, MAE), both minimised.
    study_nbeats = optuna.create_study(directions=['minimize','minimize'])
    study_nbeats.optimize(
        partial(objective, pollutant=pollutant, horizon=h),
        n_trials=300,
    )
    clear_output(wait=True)

    # Make sure the destination folder exists before anything is written.
    output_dir = fr'Results COBEQ\NBEATS (W)\{pollutant}'
    os.makedirs(output_dir, exist_ok=True)

    # Per-trial summary table first, then the full study object.
    NBEATS_W = pd.DataFrame(results)
    NBEATS_W.to_pickle(fr'Results COBEQ\NBEATS (W)\{pollutant}\{h}W_Df.pkl')
    joblib.dump(study_nbeats, fr"Results COBEQ\NBEATS (W)\{pollutant}\{h}W_Study.pkl")

[I 2025-03-25 13:53:03,863] Trial 299 finished with values: [21.49116, 5.70076] and parameters: {'input_size': 86, 'n_stacks': 5, 'n_blocks': 3, 'max_steps': 616, 'local_scalar_type': 'boxcox', 'interpretability': ['trend', 'trend']}.


## **Statistical**

In [None]:
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, AutoCES, AutoETS, AutoTheta

def _weekly_metrics_frame(obj, pollutant, method, h):
    """Return a one-row frame with the metrics currently stored on ``obj``."""
    return pd.DataFrame({
        'pollutant': [pollutant],
        'method': [method],
        'freq': ['W'],
        'h': [h],
        'mape': [obj.metrics['mape']],
        'smape': [obj.metrics['smape']],
        'max': [obj.metrics['max']],
        'mae': [obj.metrics['mae']],
        'mse': [obj.metrics['mse']]
    })


# Benchmark the statistical models and the tuned N-BEATS / N-HiTS models on
# weekly data, writing one metrics table per pollutant and horizon.
for pollutant in ['pm10']:
    for h in [12, 26, 52, 78]:

        obj = TimeObject(data, pollutant, agg_freq='W')
        obj.NIXTLA_train_test(split=h)

        season_length = 52  # weekly data: 52 periods per seasonal cycle
        horizon = len(obj.Y_train)  # length of the training frame; not used below

        models = [
            # AutoARIMA(season_length=season_length, alias='AutoARIMA'),
            AutoCES(season_length=season_length, model='Z', alias='AutoCES-Z'),
            AutoCES(season_length=season_length, model='S', alias='AutoCES-S'),
            AutoCES(season_length=season_length, model='P', alias='AutoCES-P'),
            AutoCES(season_length=season_length, model='N', alias='AutoCES-N'),
            AutoTheta(season_length=season_length, decomposition_type="multiplicative", alias='AutoTheta-Multi'),
            AutoTheta(season_length=season_length, decomposition_type="additive", alias='AutoTheta-Add'),
        ]
        # ETS specifications 'Z' + trend + season, trend varying slowest.
        models = models + [
            AutoETS(season_length=season_length, model=f"Z{t}{s}", alias=f'AutoETS-Z{t}{s}')
            for t in ['Z', 'A', 'N']
            for s in ['Z', 'A', 'M', 'N']
        ]

        frct = StatsForecast(models=models, freq='W')
        frct.fit(df=obj.Y_train)
        predicted = frct.predict(h=h)

        # Every column except the time stamp and the series id is a model.
        columns = predicted.columns
        columns = columns[(columns != 'ds') & (columns != 'unique_id')]

        # One single-row frame per method, concatenated once at the end.
        frames = []
        for method in columns:
            obj.metrics_(predicted, method=method)
            frames.append(_weekly_metrics_frame(obj, pollutant, method, h))

        # ======================================================================================================

        # The studies are multi-objective, so `best_trials` is the Pareto
        # front; its first member is taken as the configuration to refit.
        nbeats = joblib.load(fr"Results COBEQ\NBEATS (W)\{pollutant}\{h}W_Study.pkl")
        nbeats_params = nbeats.best_trials[0].params
        model = NBEATS(
            h=h,
            input_size=nbeats_params['input_size'],
            stack_types=nbeats_params['interpretability']+(nbeats_params['n_stacks']-len(nbeats_params['interpretability']))*['identity'],
            n_blocks=nbeats_params['n_stacks'] * [nbeats_params['n_blocks']],
            max_steps=nbeats_params['max_steps'],
            learning_rate=1e-3,
            val_check_steps=10,
        )
        fcst = NeuralForecast(
            models=[model],
            freq='W',
            local_scaler_type=nbeats_params['local_scalar_type']
        )
        fcst.fit(df=obj.Y_train, verbose=False)
        predicted = fcst.predict(df=obj.Y_train, verbose=False)
        obj.metrics_(predicted, method='NBEATS')
        frames.append(_weekly_metrics_frame(obj, pollutant, 'NBEATS', h))

        # ======================================================================================================

        nhits = joblib.load(fr"Results COBEQ\NHITS (W)\{pollutant}\{h}W_Study.pkl")
        nhits_params = nhits.best_trials[0].params
        model = NHITS(
            h=h,
            input_size=nhits_params['input_size'],
            stack_types=nhits_params['n_stacks']*['identity'],
            n_freq_downsample=nhits_params['n_freq_downsample']+(nhits_params['n_stacks']-len(nhits_params['n_freq_downsample']))*[1],
            n_blocks=nhits_params['n_stacks']*[nhits_params['n_blocks']],
            n_pool_kernel_size=(nhits_params['n_stacks']-len(nhits_params['n_pool_kernel_size']))*[1]+nhits_params['n_pool_kernel_size'],
            pooling_mode="MaxPool1d",
            activation="ReLU",
            interpolation_mode='linear',
            max_steps=nhits_params['max_steps'],
            val_check_steps=10,
        )
        fcst = NeuralForecast(
            models=[model],
            freq='W',
            local_scaler_type=nhits_params['local_scalar_type']
        )
        fcst.fit(df=obj.Y_train, verbose=False)
        predicted = fcst.predict(df=obj.Y_train, verbose=False)
        obj.metrics_(predicted, method='NHITS')
        frames.append(_weekly_metrics_frame(obj, pollutant, 'NHITS', h))

        # ======================================================================================================

        # Row labels are left as produced by concat (no ignore_index), which
        # matches what the row-by-row concatenation used to store.
        results_stats = pd.concat(frames)
        # display(results_stats)

        output_dir = fr'Results COBEQ\Stats (W)\{pollutant}'
        os.makedirs(output_dir, exist_ok=True)
        results_stats.to_pickle(fr'Results COBEQ\Stats (W)\{pollutant}\{h}W_Df.pkl')

# **ANALYSIS**

In [None]:
# For every horizon, rank all benchmarked methods on each metric (1 = best)
# and keep only the row of the neural model being inspected.
# The list is deliberately not called `metrics`: that name is bound to the
# `sklearn.metrics` module imported at the top of the notebook.
metric_names = ['smape', 'mae', 'mape', 'mse', 'max']

selected_rows = []
for model in ['NHITS','NBEATS']:
    for H in [12,26,52,78]:
        # The table was written with DataFrame.to_pickle, so read it back with
        # the matching pandas reader. drop=True avoids creating an 'index'
        # column that would only have to be deleted again.
        x = (
            pd.read_pickle(fr'Results COBEQ/Stats (W)/pm10/{H}W_Df.pkl')
            .sort_values(['smape','mae'])
            .reset_index(drop=True)
        )
        # Integer ranks; ties share the lowest rank (method='min').
        for metric in metric_names:
            x[f'{metric}_rank'] = x[metric].rank(method='min').astype(int)

        selected_rows.append(x[x['method'] == model])

df = pd.concat(selected_rows)
df.to_excel(r'x.xlsx')

Unnamed: 0,pollutant,method,freq,h,mape,smape,max,mae,mse,smape_rank,mae_rank,mape_rank,mse_rank,max_rank
11,pm10,NHITS,W,12,21.61003,15.47986,12.52267,3.15901,20.76394,12,12,12,12,12
10,pm10,NHITS,W,26,23.89276,21.12368,25.26393,5.52537,58.60768,11,4,3,11,14
6,pm10,NHITS,W,52,20.25154,18.99953,15.63252,4.86126,39.63625,7,2,4,1,7
11,pm10,NHITS,W,78,29.27972,25.55316,30.36795,6.92467,91.43602,12,12,12,12,1
12,pm10,NBEATS,W,12,23.12074,16.27287,12.99588,3.31098,23.21854,13,13,13,13,13
9,pm10,NBEATS,W,26,24.88597,21.07951,15.33145,5.5996,54.09961,10,7,7,6,8
9,pm10,NBEATS,W,52,21.21281,19.80284,16.39186,5.01307,43.11202,10,7,8,6,9
12,pm10,NBEATS,W,78,31.24361,28.22053,36.15615,7.37499,104.68935,13,13,14,13,18


In [None]:
# Collect, for every model and horizon, the trial with the lowest sMAPE
# (ties broken by MAE) from the per-trial tables saved by the Optuna cells.
# The path is relative, like every other results path in this notebook,
# instead of pointing into one user's home directory.
# NOTE(review): assumes the working directory is the folder that contains
# 'Results COBEQ' — confirm.
best_rows = []
for model in ['NHITS','NBEATS']:
    for H in [12,26,52,78]:
        x = (
            pd.read_pickle(fr'Results COBEQ/{model} (W)/pm10/{H}W_Df.pkl')
            .sort_values(['smape','mae'])
            .reset_index()
        )
        # After sorting, the first row is the best trial; the 'index' column
        # added by reset_index keeps its position in the saved table.
        best_rows.append(x.iloc[[0]])

df = pd.concat(best_rows)

# NOTE: this writes to the same 'x.xlsx' as the ranking cell above it.
df.to_excel(r'x.xlsx')

In [None]:
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, AutoCES, AutoETS, AutoTheta

# Fit the statistical models on daily data and plot each forecast.
results_stats = []
for pollutant in ['pm10']:
    for h in [12, 26, 52, 78]:

        obj = TimeObject(data, pollutant, agg_freq='D')
        obj.NIXTLA_train_test(split=h)

        season_length = 365  # daily data: 365 periods per seasonal cycle
        horizon = len(obj.Y_train)  # length of the training frame; not used below

        # Same model line-up and order as the weekly benchmark:
        # four CES variants, two Theta variants, then twelve ETS specifications.
        # AutoARIMA(season_length=season_length, alias='AutoARIMA') is left out.
        models = [
            AutoCES(season_length=season_length, model=kind, alias=f'AutoCES-{kind}')
            for kind in ['Z', 'S', 'P', 'N']
        ]
        models += [
            AutoTheta(season_length=season_length, decomposition_type="multiplicative", alias='AutoTheta-Multi'),
            AutoTheta(season_length=season_length, decomposition_type="additive", alias='AutoTheta-Add'),
        ]
        models += [
            AutoETS(season_length=season_length, model=f"Z{t}{s}", alias=f'AutoETS-Z{t}{s}')
            for t in ['Z', 'A', 'N']
            for s in ['Z', 'A', 'M', 'N']
        ]

        frct = StatsForecast(models=models, freq='D')
        frct.fit(df=obj.Y_train)
        predicted = frct.predict(h=h)

        # Every column except the time stamp and the series id is a model.
        columns = predicted.columns[~predicted.columns.isin(['ds', 'unique_id'])]

        results_stats = pd.DataFrame()
        for method in columns:
            obj.plot_forecast(predicted, show_metrics=True, method=method)

In [None]:
# Refit N-BEATS and N-HiTS on daily data and plot their forecasts.
# NOTE(review): the hyperparameters come from the studies saved under the
# weekly ('(W)') result folders while the data here is aggregated daily —
# confirm that reusing them is intended.
for pollutant in ['pm10']:
    for h in [12, 26, 52, 78]:

        obj = TimeObject(data, pollutant, agg_freq='D')
        obj.NIXTLA_train_test(split=h)
        horizon = len(obj.Y_train)  # length of the training frame; not used below

        # ======================================================================================================

        # The studies are multi-objective, so `best_trials` is the Pareto
        # front; its first member is taken as the configuration to refit.
        nbeats = joblib.load(fr"Results COBEQ\NBEATS (W)\{pollutant}\{h}W_Study.pkl")
        nbeats_params = nbeats.best_trials[0].params
        model = NBEATS(
            h=h,
            input_size=nbeats_params['input_size'],
            stack_types=nbeats_params['interpretability']+(nbeats_params['n_stacks']-len(nbeats_params['interpretability']))*['identity'],
            n_blocks=nbeats_params['n_stacks'] * [nbeats_params['n_blocks']],
            max_steps=nbeats_params['max_steps'],
            learning_rate=1e-3,
            val_check_steps=10,
        )
        fcst = NeuralForecast(
            models=[model],
            freq='D',
            local_scaler_type=nbeats_params['local_scalar_type']
        )
        fcst.fit(df=obj.Y_train, verbose=False)
        predicted = fcst.predict(df=obj.Y_train, verbose=False)
        obj.plot_forecast(predicted, method='NBEATS', show_metrics=True)

        # ======================================================================================================

        nhits = joblib.load(fr"Results COBEQ\NHITS (W)\{pollutant}\{h}W_Study.pkl")
        nhits_params = nhits.best_trials[0].params
        model = NHITS(
            h=h,
            input_size=nhits_params['input_size'],
            stack_types=nhits_params['n_stacks']*['identity'],
            n_freq_downsample=nhits_params['n_freq_downsample']+(nhits_params['n_stacks']-len(nhits_params['n_freq_downsample']))*[1],
            n_blocks=nhits_params['n_stacks']*[nhits_params['n_blocks']],
            n_pool_kernel_size=(nhits_params['n_stacks']-len(nhits_params['n_pool_kernel_size']))*[1]+nhits_params['n_pool_kernel_size'],
            pooling_mode="MaxPool1d",
            activation="ReLU",
            interpolation_mode='linear',
            max_steps=nhits_params['max_steps'],
            val_check_steps=10,
        )
        fcst = NeuralForecast(
            models=[model],
            freq='D',
            local_scaler_type=nhits_params['local_scalar_type']
        )
        fcst.fit(df=obj.Y_train, verbose=False)
        predicted = fcst.predict(df=obj.Y_train, verbose=False)
        # Name the forecast column explicitly, as the N-BEATS call does,
        # rather than relying on plot_forecast's default method.
        obj.plot_forecast(predicted, method='NHITS', show_metrics=True)