In [2]:
import pandas as pd
import numpy as np
import optuna
from rich.jupyter import display
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor

#### Xgboost (Ensemble) -> WRMSE = 0.0655
- Leitura do dataset
- Criação dos lags
- métrica de controle geral

In [3]:
df = pd.read_excel('../Input/DadosCompeticao.xlsx')

In [4]:
def create_lag_features(series, lags):
    df = pd.DataFrame({'y': series})
    for lag in range(1, lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag)
    return df.dropna()

In [3]:
wrmse = lambda rmse: (rmse[:11] * (1 / 11)).sum()

#### Otimização
$$ min\ score = \sqrt{ \sum \dfrac{(y_{pred}-y)^2}{n}}$$
- Função **objective** -> set(rmse)
- Função **optimize** -> set(best_params to fit)

In [10]:
forecast_window = 12
past_window = df.shape[1] - 1

In [16]:
def objective(trial, series, forecast_window, past_window, n_splits=3):
    data = create_lag_features(series, past_window)
    X = data.drop('y', axis=1).values
    y = data['y'].values

    tscv = TimeSeriesSplit(n_splits=n_splits)
    rmse_scores = []

    for train_idx, test_idx in tscv.split(X):
        X_train, X_val = X[train_idx], X[test_idx]
        y_train, y_val = y[train_idx], y[test_idx]

        model = XGBRegressor(
            n_estimators=trial.suggest_int('n_estimators', 100, 500),
            max_depth=trial.suggest_int('max_depth', 3, 10),
            learning_rate=trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
            random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse_scores.append(root_mean_squared_error(y_val, y_pred))

    return np.mean(rmse_scores)

In [21]:
def optimize(series, forecast_horizon=forecast_window, past_window=past_window):
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, series, forecast_horizon, past_window), n_trials=50) ## -> +combinações = -performance de treinamento
    return study.best_params

#### Predict and Forecast

In [26]:
def forecasting(series, best_params, forecast_horizon=12, past_window=12):
    data = create_lag_features(series, past_window) #features: lags
    X = data.drop('y', axis=1).values
    y = data['y'].values

    model = XGBRegressor(**best_params) ## -> Spread de parâmetros
    model.fit(X, y)

    forecast = []
    last_window = list(series[-past_window:])
    actuals = series[-forecast_horizon:]

    for _ in range(forecast_horizon):
        input_array = np.array(last_window[-past_window:]).reshape(1, -1)
        next_value = model.predict(input_array)[0]
        forecast.append(next_value)
        last_window.append(next_value)

    rmse = root_mean_squared_error(actuals, forecast)
    return forecast, rmse

In [29]:
def run(df, forecast_horizon=forecast_window, past_window=past_window):
    forecasts = {}
    scores = {}

    for col in df.columns:
        print(f"===== Série: {col} ====")
        series = df[col].dropna().values
        best_params = optimize(series, forecast_horizon, past_window)
        forecast, rmse = forecasting(series, best_params, forecast_horizon, past_window)
        forecasts[col] = forecast
        scores[col] = rmse

    return forecasts, scores

In [None]:
forecasts, scores = run(df)

In [31]:
df_forecast = pd.DataFrame(forecasts)
df_scores = pd.DataFrame.from_dict(scores, orient='index', columns=['RMSE'])

In [33]:
df_forecast.to_excel('../Output/v5/previsoes_xgboost.xlsx', index=False)
df_scores.to_excel('../Output/v5/scores_xgboost.xlsx', index=False)

In [39]:
print(f'WRMSE: {wrmse(df_scores["RMSE"]): .4f}')

WRMSE:  0.0655
