O objetivo deste arquivo é conseguir as estatísticas de importância de variáveis para cada modelo

In [1]:
# Bibliotecas
import pandas as pd
import numpy as np
import utils
import warnings
import sys
import forecasting_vars as fvars
# Imports específicos
from sklearn.exceptions import ConvergenceWarning
from sktime.forecasting.compose import make_reduction
# Modelos
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [2]:
# Global settings for the variable-importance run
VAR_IMPORTANCE_DIR = "var_importance"
FORECASTING_HORIZONS = [1, 2, 3, 6, 12]
# Estimator currently being fitted
CURRENT_ESTIMATOR = "lasso"

# Use the shared settings from `forecasting_vars` whenever that module is loaded.
# NOTE(review): the imports cell imports `forecasting_vars` unconditionally, so
# once that import succeeds this condition holds and the local fallback below is
# not reached. Confirm whether the fallback is still wanted.
if "forecasting_vars" in sys.modules:
    TEST_SIZE = fvars.TEST_SIZE
    WINDOW_LENGTH = fvars.WINDOW_LENGTH
    LINEAR_MODELS = fvars.LINEAR_MODELS
    N_JOBS = fvars.N_JOBS
    LASSO_CONFIG = fvars.lasso
    RF_CONFIG = fvars.rf
    LGBM_CONFIG = fvars.lgbm
else:
    # Local fallback values.
    # NOTE(review): `max_iter=1` / `n_estimators=1` look like minimal placeholder
    # settings rather than tuned values - confirm before relying on them.
    TEST_SIZE = 79
    WINDOW_LENGTH = 12
    # Models whose data needs to be normalized
    LINEAR_MODELS = ["ridge", "lasso"]
    N_JOBS = 4
    # Model-specific parameters
    LASSO_CONFIG = dict(max_iter=1, n_jobs=N_JOBS)
    RF_CONFIG = dict(n_estimators=1, max_features="sqrt")
    LGBM_CONFIG = dict(
        n_jobs=N_JOBS,
        verbose=-1,
        force_col_wise=True,
        n_estimators=1,
    )

In [3]:
# Load the data used for the forecasts.
# The CSV is read through the project helper `utils.read_and_change_index`
# (presumably it also sets the frame's index - confirm in utils).
df = utils.read_and_change_index("dados/dados_transformados.csv")
# Show the first rows as a quick sanity check of the loaded frame.
df.head()

Unnamed: 0_level_0,ipca,ipca_15,selic,m1,m2,m3,m4,ibovespa,cambio,cambio_real,...,result_primario,el_nino,epu_br,epu_usa,ind_conf_cons,desemprego,rendimento,crb_cmdt,crb_food,crb_metal
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01,0.76,0.68,-0.548387,-11559013.0,-22319905.0,-18028683.0,-17053471.0,-384.95,-0.0735,-0.363,...,2481.49,0.4,49.609174,-2.34359,8.9,14.696341,348.7366,7.7,18.7,15.1
2004-02,0.61,0.9,0.0,1650795.0,11117714.0,1275340.0,-2131936.0,-96.42,0.0785,1.699,...,29.0,0.3,121.394093,1.0,-0.8,15.073171,0.0,10.11,14.74,24.27
2004-03,0.47,0.4,-0.112903,-1575433.0,-1282702.0,300227.0,-4199993.0,387.24,-0.0248,-2.777,...,2375.73,0.2,-166.416169,-5.0,-10.1,16.078049,-10382.5684,4.04,2.71,10.95
2004-04,0.37,0.21,-0.27043,539382.0,-378329.0,-4736024.0,-2778970.0,-2535.03,0.0005,-1.562,...,-1911.68,0.2,21.276658,-4.0,-5.3,16.454878,-339.4192,-0.45,6.35,-11.13
2004-05,0.51,0.54,-0.116667,1252881.0,13051576.0,26890443.0,11143701.0,-62.56,0.1944,4.213,...,1488.9,0.2,14.892162,17.0,16.2,15.32439,238.259,-5.31,-15.02,8.03


In [4]:
# Run every model for every forecasting horizon.
# For each horizon `h` this writes two files under VAR_IMPORTANCE_DIR:
#   - h{h}.xlsx       : one sheet per model with the table returned by utils.get_feat_df
#   - check_h{h}.csv  : the predictions of all models side by side, for checking
for h in FORECASTING_HORIZONS:
    # Excel workbook holding the selected metrics for this horizon
    with pd.ExcelWriter(f"{VAR_IMPORTANCE_DIR}/h{h}.xlsx") as writer:
        print(f"h = {h}")
        # Predictions of each model at this horizon, kept for checking
        preds = {}
        # Fresh model instances per horizon, so no fitted state is reused across horizons
        regressors = {
            "ridge": RidgeCV(),
            "lasso": LassoCV(**LASSO_CONFIG),
            "random_forest": RandomForestRegressor(**RF_CONFIG),
            "lgbm": LGBMRegressor(**LGBM_CONFIG),
        }
        # Wrap each regressor as a forecaster over a window of WINDOW_LENGTH observations
        forecasters = {k: make_reduction(v, window_length=WINDOW_LENGTH) for k, v in regressors.items()}
        # Run all models
        for name, forecaster in forecasters.items():
            print(f"Rodando {name}")
            # Only ConvergenceWarning is silenced; any other warning still surfaces
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=ConvergenceWarning)
                preds[name] = utils.run_model(forecaster, name, df, h, TEST_SIZE)
            feat_importances = utils.get_feat_df(forecaster, df.columns, name)
            # `sheet_name` must be passed by keyword: positional use is deprecated
            # in pandas 2.x and rejected once the argument becomes keyword-only.
            feat_importances.to_excel(writer, sheet_name=name)
        # Save the predictions as CSV.
        # `name` still refers to the last model of the loop above, so its
        # predictions provide the index for the combined frame.
        preds_df = pd.DataFrame(preds, index=preds[name].index)
        preds_df.to_csv(f"{VAR_IMPORTANCE_DIR}/check_h{h}.csv", sep=";", decimal=",")

h = 1
Rodando ridge
Rodando lasso
Rodando random_forest
Rodando lgbm
h = 2
Rodando ridge
Rodando lasso
Rodando random_forest
Rodando lgbm
h = 3
Rodando ridge
Rodando lasso
Rodando random_forest
Rodando lgbm
h = 6
Rodando ridge
Rodando lasso
Rodando random_forest
Rodando lgbm
h = 12
Rodando ridge
Rodando lasso
Rodando random_forest
Rodando lgbm
