O objetivo deste arquivo é experimentar diferentes parâmetros para rodar os modelos,
tentando obter resultados melhores

In [1]:
import pandas as pd
import numpy as np
import utils
from sktime.split import ExpandingWindowSplitter, temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
import epftools as epf
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError, MeanAbsoluteError, MeanSquaredError
from sktime.forecasting.compose import make_reduction
from sklearn.preprocessing import StandardScaler
# Modelos
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [2]:
# Global settings
FORECASTING_HORIZON = 1  # Should be the only value that changes across the h{i}.ipynb notebooks
TEST_SIZE = 79
WINDOW_LENGTH = 12
LINEAR_MODELS = ['ridge', 'lasso']  # Models whose input data must be normalized
N_JOBS = 4
# Seed shared by the randomized estimators, so that re-running the notebook
# yields the same numbers and parameter comparisons are not confounded by seed noise.
RANDOM_STATE = 42
# Estimator currently being tuned
CURRENT_ESTIMATOR = 'lgbm'
# Model-specific parameters
# LASSO
LASSO_CONFIG = {
    'max_iter': 2000,
    'n_jobs': N_JOBS
}
# Random Forest
RF_CONFIG = {
    'n_estimators': 1000,
    'criterion': 'absolute_error',
    'max_features': 1.0,
    # Same parallelism as the other models; only affects speed, not the fitted forest.
    'n_jobs': N_JOBS,
    # The forest is a randomized estimator: pin the seed for reproducible runs.
    'random_state': RANDOM_STATE
}
# LGBM
LGBM_CONFIG = {
    'n_jobs': N_JOBS,
    'verbose': -1,
    'force_col_wise': True,
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'objective': 'regression_l1'
}

In [3]:
# Load the original (baseline) predictions for this forecasting horizon
og_preds = utils.read_and_change_index(f"previsões/h{FORECASTING_HORIZON}.csv")
# Load the inflation data
y_true = utils.read_and_change_index("dados/dados_arima.csv")
# Keep only the tail of the series as ground truth. The size is derived from
# TEST_SIZE (instead of a repeated literal 79) so it stays in sync with the
# config cell if the test window is ever changed.
_, y_true = temporal_train_test_split(y_true, test_size=TEST_SIZE - FORECASTING_HORIZON + 1)
og_preds['y_true'] = y_true
og_preds.head()

Unnamed: 0_level_0,ridge,lasso,random_forest,lgbm,arima,y_true
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-03,0.658977,0.404575,0.34446,0.455176,0.28251,0.25
2017-04,0.660869,0.408645,0.3988,0.406985,0.401124,0.14
2017-05,0.14534,0.108103,0.21552,0.140452,0.309435,0.31
2017-06,-0.06434,0.207424,0.19631,0.164237,0.281415,-0.23
2017-07,-0.154144,0.044817,0.22824,0.169523,0.056204,0.24


In [4]:
# Importa dados para previsões
df = utils.read_and_change_index("dados/dados_transformados.csv")
df.head()

Unnamed: 0_level_0,ipca,ipca_15,selic,m1,m2,m3,m4,ibovespa,cambio,cambio_real,...,result_primario,el_nino,epu_br,epu_usa,ind_conf_cons,desemprego,rendimento,crb_cmdt,crb_food,crb_metal
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01,0.76,0.68,-0.548387,-11559013.0,-22319905.0,-18028683.0,-17053471.0,-384.95,-0.0735,-0.363,...,2481.49,0.4,49.609174,-2.34359,8.9,14.696341,348.7366,7.7,18.7,15.1
2004-02,0.61,0.9,0.0,1650795.0,11117714.0,1275340.0,-2131936.0,-96.42,0.0785,1.699,...,29.0,0.3,121.394093,1.0,-0.8,15.073171,0.0,10.11,14.74,24.27
2004-03,0.47,0.4,-0.112903,-1575433.0,-1282702.0,300227.0,-4199993.0,387.24,-0.0248,-2.777,...,2375.73,0.2,-166.416169,-5.0,-10.1,16.078049,-10382.5684,4.04,2.71,10.95
2004-04,0.37,0.21,-0.27043,539382.0,-378329.0,-4736024.0,-2778970.0,-2535.03,0.0005,-1.562,...,-1911.68,0.2,21.276658,-4.0,-5.3,16.454878,-339.4192,-0.45,6.35,-11.13
2004-05,0.51,0.54,-0.116667,1252881.0,13051576.0,26890443.0,11143701.0,-62.56,0.1944,4.213,...,1488.9,0.2,14.892162,17.0,16.2,15.32439,238.259,-5.31,-15.02,8.03


In [5]:
# Run LASSO (only when it is the estimator being tuned)
if CURRENT_ESTIMATOR == 'lasso':
    lasso = LassoCV(**LASSO_CONFIG)
    forecaster = make_reduction(lasso, window_length=WINDOW_LENGTH)
    # Normalize the data
    # NOTE(review): the scaler is fitted on the whole frame, which presumably
    # includes the evaluation period — confirm whether this lookahead is acceptable.
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(df)
    normalized_df = pd.DataFrame(normalized_data, columns=df.columns, index=df.index)
    y_train, y_test, y_pred = utils.evaluate_pipeline(forecaster, normalized_df, steps_ahead=FORECASTING_HORIZON)
    # De-normalize the predictions. The target column is located by name
    # rather than by a hard-coded position, and `scale_` is the exact factor
    # the scaler divided by during the transform.
    # Assumes evaluate_pipeline forecasts the 'ipca' column — TODO confirm.
    target_pos = df.columns.get_loc('ipca')
    y_pred = y_pred * scaler.scale_[target_pos] + scaler.mean_[target_pos]

In [6]:
# Random Forest run (skipped unless it is the estimator being tuned)
if CURRENT_ESTIMATOR == 'random_forest':
    rf = RandomForestRegressor(**RF_CONFIG)
    # Wrap the regressor as a reduction forecaster with the shared window length.
    forecaster = make_reduction(estimator=rf, window_length=WINDOW_LENGTH)
    # The raw (non-normalized) frame is passed to the evaluation helper here.
    y_train, y_test, y_pred = utils.evaluate_pipeline(
        forecaster,
        df,
        steps_ahead=FORECASTING_HORIZON,
    )

In [7]:
# LGBM run (skipped unless it is the estimator being tuned)
if CURRENT_ESTIMATOR == 'lgbm':
    lgbm = LGBMRegressor(**LGBM_CONFIG)
    # Wrap the regressor as a reduction forecaster with the shared window length.
    forecaster = make_reduction(estimator=lgbm, window_length=WINDOW_LENGTH)
    # The evaluation call is currently disabled, so this cell only builds the
    # forecaster and does not define y_train / y_test / y_pred:
    #y_train, y_test, y_pred = utils.evaluate_pipeline(forecaster, df, steps_ahead=FORECASTING_HORIZON)

In [8]:
# Disabled metrics cell: the whole body below is wrapped in a triple-quoted
# string, so none of it executes and the cell merely echoes that string.
# The enclosed code would compare the new predictions (y_pred) with the stored
# ones in og_preds[CURRENT_ESTIMATOR] using MAE, MSE, MAPE and epf.DM (the
# latter called with the 'arima' column), and collect the results in res_df.
"""
# Calcula métricas de erro
# Evita avisos chatos
og_preds['y_true'].name = y_pred.name
# Resultados
mae = MeanAbsoluteError()
mse = MeanSquaredError()
mape = MeanAbsolutePercentageError()
labels = ['MAE', 'MSE', 'MAPE', 'DM']
res = {
    f'new_{CURRENT_ESTIMATOR}': [
        mae(og_preds['y_true'], y_pred),
        mse(og_preds['y_true'], y_pred),
        mape(og_preds['y_true'], y_pred),
        epf.DM(og_preds['y_true'], og_preds['arima'], y_pred)
    ],
    f'old_{CURRENT_ESTIMATOR}': [
        mae(og_preds['y_true'], og_preds[CURRENT_ESTIMATOR]),
        mse(og_preds['y_true'], og_preds[CURRENT_ESTIMATOR]),
        mape(og_preds['y_true'], og_preds[CURRENT_ESTIMATOR]),
        epf.DM(og_preds['y_true'], og_preds['arima'], og_preds[CURRENT_ESTIMATOR])
    ]
}
res_df = pd.DataFrame(res, index=labels)
res_df
"""

"\n# Calcula métricas de erro\n# Evita avisos chatos\nog_preds['y_true'].name = y_pred.name\n# Resultados\nmae = MeanAbsoluteError()\nmse = MeanSquaredError()\nmape = MeanAbsolutePercentageError()\nlabels = ['MAE', 'MSE', 'MAPE', 'DM']\nres = {\n    f'new_{CURRENT_ESTIMATOR}': [\n        mae(og_preds['y_true'], y_pred),\n        mse(og_preds['y_true'], y_pred),\n        mape(og_preds['y_true'], y_pred),\n        epf.DM(og_preds['y_true'], og_preds['arima'], y_pred)\n    ],\n    f'old_{CURRENT_ESTIMATOR}': [\n        mae(og_preds['y_true'], og_preds[CURRENT_ESTIMATOR]),\n        mse(og_preds['y_true'], og_preds[CURRENT_ESTIMATOR]),\n        mape(og_preds['y_true'], og_preds[CURRENT_ESTIMATOR]),\n        epf.DM(og_preds['y_true'], og_preds['arima'], og_preds[CURRENT_ESTIMATOR])\n    ]\n}\nres_df = pd.DataFrame(res, index=labels)\nres_df\n"

In [16]:
# Manual fit/predict at a single forecast origin, done directly with sktime
# instead of going through utils.evaluate_pipeline.
# Target series and the remaining columns as exogenous regressors.
y = df['ipca']
X = df.drop(columns='ipca')
# Hold out the final TEST_SIZE observations.
y_train, y_test, X_train, X_test = temporal_train_test_split(y=y, X=X, test_size=TEST_SIZE)
fh = ForecastingHorizon(values=FORECASTING_HORIZON, is_relative=True)
# NOTE(review): `cv` is built but not used by any cell visible here.
cv = ExpandingWindowSplitter(fh=fh, initial_window=len(y_train))
forecaster.fit(y=y_train, X=X_train, fh=fh)
# NOTE(review): the full X (training and test rows) is passed, not X_test —
# confirm this is intended.
y_pred = forecaster.predict(fh=fh, X=X)

In [17]:
# Display the forecast produced by the manual fit/predict cell
y_pred

2017-03    0.410934
Freq: M, Name: ipca, dtype: float64