In [5]:
# Dependencies
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load data (FRED-MD)
df = pd.read_csv('../current.csv')

# Remove the first row (transformation codes)
transformation_codes = df.iloc[0]  # Transformation codes can be applied if needed
df = df.iloc[1:]

# Set the first column as the index and datetime
df.set_index(df.columns[0], inplace=True)
df.index = pd.to_datetime(df.index)

# Dropna
data = df.dropna()

# Create train data and target
target = (data['CPIAUCSL'].diff(12) / data['CPIAUCSL'].shift(12)) * 100
target = target.shift(-12).dropna()


In [8]:
for forecast_horizon in range(4, 12 * 5 + 4, 12):
    variance = 0.95
    # Create train/test split
    cutoff = len(data) - forecast_horizon
    train_data = data.iloc[:cutoff]  # assume series is a DataFrame
    # Fit PCA
    pca = PCA(n_components=variance)  # Keep 95% of variance
    pca.fit(train_data)
    train = pd.DataFrame(pca.transform(train_data), index=train_data.index)
    train["CPIAUCSL"] = target.iloc[:cutoff].values
    train = train.dropna()
    # Fit VAR model
    var_model = VAR(train)
    # Generate AIC and BIC suggestions
    optimal_lags = var_model.select_order()
    # Go through the lags
    for context_window in [optimal_lags.aic, optimal_lags.bic]:
        # Split data into train and test sets
        results = var_model.fit(context_window)

        # Generate forecast
        forecast = results.forecast(train.values[-context_window:], steps=forecast_horizon)

        # Inverse PCA transformation
        forecast = pca.inverse_transform(forecast)
        forecast_df = pd.DataFrame(forecast, columns=train.columns)
        forecast_df.index = target.index[-forecast_horizon:]
        forecast_df = forecast_df.dropna()

        # Get the true values
        y_true = target.iloc[cutoff:].values
        y_pred = forecast_df['CPIAUCSL'].values

        # Calculate errors
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)

        # Log
        fp = "var.csv"
        var_log = pd.read_csv(fp) # forecast_horizon,context_window,dropout_rate,rmse,mae,forecast,true_vals
        new_row = {
            'forecast_horizon': forecast_horizon,
            'context_window': context_window,
            'variance': variance,
            'rmse': rmse,
            'mae': mae,
            'forecast': list(y_pred),
            'actual': list(y_true)
        }
        var_log = pd.concat([var_log, pd.DataFrame([new_row])], ignore_index=True)
        var_log.to_csv(fp, index=False)

ValueError: Length of values (380) does not match length of index (388)