In [12]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from statsmodels.tsa.api import VAR

In [13]:
# Load data (FRED-MD)
df = pd.read_csv('current.csv')

# Remove the first row (transformation codes)
transformation_codes = df.iloc[0]  # Transformation codes can be applied if needed
df = df.iloc[1:]

# Set the first column as the index and datetime
df.set_index(df.columns[0], inplace=True)
df.index = pd.to_datetime(df.index)

# Transform and standardize the data
df_logged = np.sign(df) * np.log1p(np.abs(df))

# Split into train and test sets
FORECAST_HORIZON = 6
train = df_logged[:-FORECAST_HORIZON]
test = df_logged[-FORECAST_HORIZON:]

# Standardize based on training data only to avoid data leakage
df_logged_train = (train - train.mean()) / train.std()

In [14]:
# Apply PCA on the training data
pca = PCA(n_components=0.95)
pca_data = pca.fit_transform(df_logged_train.dropna())
train_pca = pd.DataFrame(pca_data, index=df_logged_train.dropna().index)

# Also transform the test data using the same PCA transformation
test_pca = pca.transform(test.dropna())

# Add CPIAUCSL (dependent variable) to PCA transformed data
train_pca["CPIAUCSL"] = train["CPIAUCSL"].dropna()
test_pca = pd.DataFrame(test_pca, index=test.dropna().index)
test_pca["CPIAUCSL"] = test["CPIAUCSL"].dropna()

In [16]:
# Create the VAR Model using the transformed training data
var_model = VAR(train_pca)

# Select optimal lag using BIC (can also use AIC)
best_lag = var_model.select_order().bic
print(f"Optimal Lag: {best_lag}")

# Fit the model with the best lag order
var_results = var_model.fit(best_lag)

# Generate forecast using the most recent values
forecast = var_results.forecast(train_pca.values[-best_lag:], steps=FORECAST_HORIZON)

  self._init_dates(dates, freq)


Optimal Lag: 1


In [18]:
# Create DataFrame for the forecast
forecast_df = pd.DataFrame(forecast, columns=train_pca.columns, index=pd.date_range(start=test_pca.index[0], periods=FORECAST_HORIZON, freq='MS'))

# Transform CPIAUCSL back to original scale
inflation = np.sign(forecast_df["CPIAUCSL"]) * (np.exp(np.abs(forecast_df["CPIAUCSL"])) - 1)
inflation


2024-07-01    313.455011
2024-08-01    313.891820
2024-09-01    314.355226
2024-10-01    314.841480
2024-11-01    315.347252
2024-12-01    315.869599
Freq: MS, Name: CPIAUCSL, dtype: float64

In [21]:
# Calculate RMSE
rmse = np.sqrt(np.mean((inflation - test["CPIAUCSL"]) ** 2))
print(f"RMSE: {rmse:.2f}")
# Normalize RMSE
print(f"Normalized RMSE: {rmse / np.mean(test['CPIAUCSL']):.2f}%")

RMSE: 308.87
Normalized RMSE: 53.65%


With PCA:  
RMSE: 308.87  
Normalized RMSE: 53.65%  