In [15]:
# Dependencies
import pandas as pd
import numpy as np
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error

In [21]:
# Read the transformed dataset (first column parsed as the date index), put it on a
# month-start ('MS') frequency grid, and discard any row that contains a missing value.
data = (
    pd.read_csv('transformed_data.csv', index_col=0, parse_dates=True)
    .asfreq('MS')
    .dropna()
)

In [22]:
# Hold out the final PERIODS rows for testing; every earlier row is training data.
PERIODS = 6
test_data = data.iloc[-PERIODS:].copy()
train_data = data.iloc[:-PERIODS].copy()

In [24]:
# Feature-selection sweep.
#
# Rank every column by its correlation with the target column 'CPIAUCSL', then, for an
# increasing number of top-ranked columns, fit a VAR on all but the last PERIODS rows,
# forecast PERIODS steps ahead, and record the target's RMSE in feature_selection.csv.
#
# NOTE(review): the ranking below is computed on the full dataset, i.e. it also sees the
# hold-out rows that are used for scoring. Consider ranking on data[:-PERIODS] only.
corr = data.corr()
cpi_corr = corr['CPIAUCSL'].sort_values(ascending=False)

# Single source of truth for the hold-out length: the same value drives both the split
# and the forecast horizon. `periods` is kept as an alias in case later cells read it.
periods = PERIODS

# Try 2..14 features, but never more than there are columns: slicing past the end would
# silently refit the same model under a different "Features" label.
max_features = min(14, len(cpi_corr))

# The context manager guarantees the CSV is flushed and closed even if a fit fails.
with open('feature_selection.csv', 'w') as f:
    # Write header
    f.write("Features,Lags,RMSE,Relative_RMSE\n")

    for num_features in range(2, max_features + 1):
        # Keep the num_features columns most correlated with the target
        corr_data = data[cpi_corr.index[0: num_features]]

        # Split into train/test: the last PERIODS rows are the test window
        train_data = corr_data[:-PERIODS].copy()
        test_data = corr_data[-PERIODS:].copy()

        # Train the VAR model with the lag order selected by AIC
        var_model = VAR(train_data)
        selected_lags = var_model.select_order()
        best_lag = selected_lags.aic
        var_results = var_model.fit(best_lag)

        # Forecast the hold-out window from the training history
        forecast = var_results.forecast(train_data.values, steps=PERIODS)

        # Locate the target by name rather than assuming it is column 0; the sort order
        # only puts it first when no other column ties with it at correlation 1.0.
        target_col = train_data.columns.get_loc('CPIAUCSL')

        # RMSE of the target forecast, plus the RMSE as a percentage of the mean of the
        # hold-out target values
        rmse = np.sqrt(mean_squared_error(test_data['CPIAUCSL'], forecast[:, target_col]))
        print(f"Number of features: {num_features}, RMSE: {rmse}, Lags: {best_lag}")
        f.write(f"{num_features},{best_lag},{rmse},{rmse / test_data['CPIAUCSL'].mean() * 100:.2f}\n")


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Number of features: 2, RMSE: 0.0017294340254425942, Lags: 9
Number of features: 3, RMSE: 0.0017640465752622122, Lags: 10
Number of features: 4, RMSE: 0.0026669710114220883, Lags: 17


  self._init_dates(dates, freq)


Number of features: 5, RMSE: 0.0014529727833683282, Lags: 13


  self._init_dates(dates, freq)


Number of features: 6, RMSE: 0.0013726753926005, Lags: 13


  self._init_dates(dates, freq)


Number of features: 7, RMSE: 0.0014102194374305478, Lags: 13


  self._init_dates(dates, freq)


Number of features: 8, RMSE: 0.0020456586290927693, Lags: 12


  self._init_dates(dates, freq)


Number of features: 9, RMSE: 0.002065248858324948, Lags: 11


  self._init_dates(dates, freq)


Number of features: 10, RMSE: 0.0032837672882796206, Lags: 17


  self._init_dates(dates, freq)


Number of features: 11, RMSE: 0.002950723842698275, Lags: 17


  self._init_dates(dates, freq)


Number of features: 12, RMSE: 0.0027223366220578147, Lags: 17


  self._init_dates(dates, freq)


Number of features: 13, RMSE: 0.0023978996317057704, Lags: 17


  self._init_dates(dates, freq)


Number of features: 14, RMSE: 0.0019752732737785177, Lags: 17
