O objetivo deste notebook é obter as estatísticas de importância das variáveis para cada modelo.

In [1]:
# Bibliotecas
import pandas as pd
import numpy as np
import utils
import epftools as epf
import warnings
# Imports específicos
from sktime.split import ExpandingWindowSplitter, temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError, MeanAbsoluteError, MeanSquaredError
from sktime.forecasting.compose import make_reduction
from sklearn.preprocessing import StandardScaler
from sktime.forecasting.compose import RecursiveTabularRegressionForecaster
# Modelos
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [2]:
# Global settings
# Forecast horizon: should be the only value that changes between the h{i}.ipynb notebooks
FORECASTING_HORIZON = 1
# Estimator being fitted
CURRENT_ESTIMATOR = 'lgbm'
TEST_SIZE = 79
WINDOW_LENGTH = 12
# Models whose data must be normalized
LINEAR_MODELS = ['ridge', 'lasso']
N_JOBS = 4

# Model-specific parameters
# LASSO
LASSO_CONFIG = dict(max_iter=2000, n_jobs=N_JOBS)
# Random Forest
RF_CONFIG = dict(
    n_estimators=1000,
    criterion='absolute_error',
    max_features=1.0,
)
# LGBM
LGBM_CONFIG = dict(
    n_jobs=N_JOBS,
    verbose=-1,
    force_col_wise=True,
    n_estimators=1000,
    learning_rate=0.01,
    objective='regression_l1',
)

In [3]:
# Load the original forecasts and the inflation data
# Original forecasts for the current horizon
og_preds = utils.read_and_change_index(f"previsões/h{FORECASTING_HORIZON}.csv")
# Inflation data
y_true = utils.read_and_change_index("dados/dados_arima.csv")
# Keep only the evaluation window. Its length is derived from TEST_SIZE (config cell)
# so it cannot drift from the split used when fitting; it shrinks by one period for
# every extra step of the horizon.
_, y_true = temporal_train_test_split(y_true, test_size=TEST_SIZE - FORECASTING_HORIZON + 1)
og_preds['y_true'] = y_true
og_preds.head()

Unnamed: 0_level_0,ridge,lasso,random_forest,lgbm,arima,y_true
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-03,0.658977,0.404575,0.35649,0.410934,0.28251,0.25
2017-04,0.660869,0.408645,0.38057,0.385672,0.401124,0.14
2017-05,0.14534,0.108103,0.239,0.229672,0.309435,0.31
2017-06,-0.06434,0.207424,0.22261,0.281583,0.281415,-0.23
2017-07,-0.154144,0.044817,0.2402,0.248951,0.056204,0.24


In [4]:
# Load the dataset used to produce the forecasts
df = utils.read_and_change_index("dados/dados_transformados.csv")
df.head()

Unnamed: 0_level_0,ipca,ipca_15,selic,m1,m2,m3,m4,ibovespa,cambio,cambio_real,...,result_primario,el_nino,epu_br,epu_usa,ind_conf_cons,desemprego,rendimento,crb_cmdt,crb_food,crb_metal
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01,0.76,0.68,-0.548387,-11559013.0,-22319905.0,-18028683.0,-17053471.0,-384.95,-0.0735,-0.363,...,2481.49,0.4,49.609174,-2.34359,8.9,14.696341,348.7366,7.7,18.7,15.1
2004-02,0.61,0.9,0.0,1650795.0,11117714.0,1275340.0,-2131936.0,-96.42,0.0785,1.699,...,29.0,0.3,121.394093,1.0,-0.8,15.073171,0.0,10.11,14.74,24.27
2004-03,0.47,0.4,-0.112903,-1575433.0,-1282702.0,300227.0,-4199993.0,387.24,-0.0248,-2.777,...,2375.73,0.2,-166.416169,-5.0,-10.1,16.078049,-10382.5684,4.04,2.71,10.95
2004-04,0.37,0.21,-0.27043,539382.0,-378329.0,-4736024.0,-2778970.0,-2535.03,0.0005,-1.562,...,-1911.68,0.2,21.276658,-4.0,-5.3,16.454878,-339.4192,-0.45,6.35,-11.13
2004-05,0.51,0.54,-0.116667,1252881.0,13051576.0,26890443.0,11143701.0,-62.56,0.1944,4.213,...,1488.9,0.2,14.892162,17.0,16.2,15.32439,238.259,-5.31,-15.02,8.03


In [5]:
# Build the forecaster for the estimator selected in the config cell
if CURRENT_ESTIMATOR == 'lgbm':
    lgbm = LGBMRegressor(**LGBM_CONFIG)
    forecaster = make_reduction(lgbm, window_length=WINDOW_LENGTH)
else:
    # Fail here with a clear message instead of a NameError on `forecaster` further down
    raise NotImplementedError(
        f"No forecaster is configured for CURRENT_ESTIMATOR={CURRENT_ESTIMATOR!r}; only 'lgbm' is handled."
    )

In [6]:
# Fit the selected model
# Target series, and every other column of df as exogenous regressors
y = df['ipca']
X = df.drop(columns=['ipca'])
# Temporal split of target and regressors, holding out TEST_SIZE observations
y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, test_size=TEST_SIZE)
# Relative forecasting horizon of FORECASTING_HORIZON steps
fh = ForecastingHorizon(FORECASTING_HORIZON, is_relative=True)
forecaster.fit(y=y_train, X=X_train, fh=fh)

In [7]:
def update_predict_loop(
        forecaster: RecursiveTabularRegressionForecaster,
        y: pd.Series,
        X: pd.DataFrame,
        fh: ForecastingHorizon,
        starting_period=pd.Period('2017-02', freq='M')
    ) -> pd.Series:
    """
    Produce rolling forecasts by alternating `update` and `predict` calls.

    A first forecast is made from the forecaster as given. Then, one period at a
    time, the forecaster is updated with the observed row of `y` and `X` for the
    next period and asked to predict again. The loop stops as soon as the most
    recent forecast reaches the last period of `y`, so no forecast is produced for
    a period without an observed value, whatever the horizon. For a one-step
    horizon this is the same as updating with every period after
    `starting_period` except the last one.

    Parameters
    ----------

    forecaster: RecursiveTabularRegressionForecaster
        Already fitted forecaster. Its `update` method is called on every
        iteration, so the object passed in is modified.

    y: pd.Series
        Full target series; its last index value is the last period forecast.

    X: pd.DataFrame
        Exogenous data. Rows are selected with the same period labels as `y`.

    fh: ForecastingHorizon
        Horizon forwarded unchanged to every `predict` call.

    starting_period: pd.Period
        Period immediately before the first row fed to `update`.
        NOTE(review): presumably this must equal the last period the forecaster
        was fitted on — confirm against the caller.

    Returns
    -------

    pd.Series:
        All forecasts concatenated in the order they were produced.
    """
    last_period = y.index[-1]
    current_period = starting_period
    # Collect the pieces and concatenate once; concatenating inside the loop
    # copies the whole series on every iteration.
    predictions = [forecaster.predict(X=X, fh=fh)]
    while predictions[-1].index[-1] < last_period:
        current_period += 1
        with warnings.catch_warnings():
            # Silence the warning that the reduction forecaster has no "update" method;
            # it is not relevant for this use.
            warnings.simplefilter("ignore", category=UserWarning)
            forecaster.update(y.loc[[current_period]], X.loc[[current_period]])
        predictions.append(forecaster.predict(fh, X))
    return pd.concat(predictions)

In [8]:
# Single forecast from the fitted forecaster (when cells run in order, no update has happened yet)
y_pred = forecaster.predict(X=X, fh=fh)
y_pred

2017-03    0.410934
Freq: M, Name: ipca, dtype: float64

In [9]:
# Observed target value for 2017-03, selected as a one-row Series
# (the same selection form that update_predict_loop passes to forecaster.update)
current_period = pd.Period('2017-03', freq='M')
y.loc[[current_period]]

month
2017-03    0.25
Freq: M, Name: ipca, dtype: float64

In [10]:
# Rolling forecasts: update with each observed period, then predict again
# NOTE(review): update_predict_loop calls forecaster.update, so re-running this cell without
# re-fitting presumably starts from an already-advanced forecaster — confirm before re-running.
y_pred = update_predict_loop(forecaster, y, X, fh)
y_pred

2017-03    0.410934
2017-04    0.385672
2017-05    0.229672
2017-06    0.281583
2017-07    0.248951
             ...   
2023-05    0.580824
2023-06    0.222725
2023-07    0.236006
2023-08    0.279189
2023-09    0.308506
Freq: M, Name: ipca, Length: 79, dtype: float64

In [12]:
# Define a função que retorna DF com importâncias de variáveis
def get_feat_df(
    cryptic_var_names: list[str],
    df_var_names: list[str],
    cryptic_feat_importances: list[float],
) -> pd.DataFrame:
    """
    Aggregate per-feature importances into one share per original variable.

    Every model feature is one lag of one variable. Its name ends in an integer
    position (e.g. 'Column_17'); the position modulo the number of variables
    selects the variable, and the importances of all lags of a variable are
    summed. Features are attributed by position, never by name matching, so a
    variable whose name is a prefix of another one (e.g. 'ipca' / 'ipca_15')
    does not absorb the other's importance.

    Only tested with LGBM; written to be used with LGBM and Random Forest.

    NOTE(review): the mapping assumes the flattened features are laid out lag by
    lag, with one slot per variable inside each lag. Confirm this against the
    feature order produced by the sktime reduction before relying on the result.

    Parameters
    ----------

    cryptic_var_names: list[str]
        Feature names as reported by the fitted model.
        For LGBM: `forecaster.estimator_.booster_.feature_name()`.

    df_var_names: list[str]
        Variable names of the 'DataFrame' used for the forecasts, in column
        order (`df.columns`). Its length is the number of variables per lag.

    cryptic_feat_importances: list[float]
        Numeric importances aligned with `cryptic_var_names`.
        For LGBM: `forecaster.estimator_.booster_.feature_importance(importance_type='gain')`.

    Returns
    -------

    pd.DataFrame:
        One row per variable and a single column 'values' holding the fraction
        of total importance attributed to that variable, sorted in descending
        order. Variables with no matching feature get 0.

    Raises
    ------

    ValueError
        If names and importances differ in length, if features are given but
        `df_var_names` is empty, or if a feature name has no '_<int>' suffix.
    """
    var_names = list(df_var_names)
    n_vars = len(var_names)
    n_features = len(cryptic_var_names)

    if n_features != len(cryptic_feat_importances):
        raise ValueError(
            "cryptic_var_names and cryptic_feat_importances must have the same length"
        )
    if n_features > 0 and n_vars == 0:
        raise ValueError("df_var_names must not be empty when features are given")

    # Absolute importance per variable, regardless of lag
    totals = {name: 0 for name in var_names}
    for cryptic_name, importance in zip(cryptic_var_names, cryptic_feat_importances):
        # The feature position is the integer after the last underscore
        _, position = cryptic_name.rsplit('_', 1)
        totals[var_names[int(position) % n_vars]] += importance

    # Convert to shares of the overall importance
    grand_total = sum(totals.values())
    shares = {name: value / grand_total for name, value in totals.items()}

    result = pd.Series(shares, name='values', dtype=float).to_frame()
    return result.sort_values(by='values', ascending=False)

In [13]:
# Last observation of the target series; .iloc makes the positional lookup explicit
# (integer keys through [] on a non-integer index are deprecated in pandas)
y.iloc[[-1]]

  y[[-1]]


month
2023-09    0.26
Freq: M, Name: ipca, dtype: float64

In [12]:
# First rows of the 'lgbm' column of the original forecasts loaded into og_preds
og_preds['lgbm'].head()

month
2017-03    0.410934
2017-04    0.385672
2017-05    0.229672
2017-06    0.281583
2017-07    0.248951
Freq: M, Name: lgbm, dtype: float64