# Import

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
!pip install pmdarima
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA

# Mount Google Drive at /content/drive; every data and output path used
# in this notebook lives under that mount point
from google.colab import drive
drive.mount('/content/drive')

# Data

In [None]:
# Read the merged price/feature table from Drive and convert its 'Date'
# column to datetime in the same step
path = "/content/drive/MyDrive/Thesis/Data/Merging CleanPrice & Features.csv"
price = pd.read_csv(path, sep=',').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Dataframes and global settings

In [None]:
# Keep only the date, the state name and the target column Log_Return_h1,
# drop rows with a missing value, and index the result by a datetime 'Date'
log_return = (
    price[['Date', 'GEO_Name', 'Log_Return_h1']]
    .dropna()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

In [None]:
# initial_train_size: number of observations in every training window
# max_forecast: largest number of steps forecast from a single origin
initial_train_size, max_forecast = 330, 12

# Function

In [None]:
# Function to fit the ARIMA
def fit_arima_model(data, arima_order):
    """Build an ARIMA model on a month-start ('MS') series and fit it.

    Args:
        data: Endogenous series handed to ``ARIMA``.
        arima_order: ``(p, d, q)`` tuple handed to ``ARIMA`` as ``order``.

    Returns:
        Whatever ``ARIMA(...).fit()`` returns for the given data and order.
    """
    return ARIMA(data, order=arima_order, freq='MS').fit()

# Predictions

In [None]:
# Per-state forecast / actual tables and the ARIMA order log are collected in
# plain lists and turned into DataFrames once, after the loop, instead of
# growing a DataFrame with pd.concat on every iteration.
forecast_frames = []
actual_frames = []
order_rows = []

# Order used when the very first order search of a state fails. It is the
# starting point of the search below (start_p=1, d=0, start_q=0).
fallback_order = (1, 0, 0)

# Loop through each unique state
for state in log_return['GEO_Name'].unique():
    print(f"Processing state: {state}")
    print("\n")
    state_data = log_return[log_return['GEO_Name'] == state]
    state_data = state_data.asfreq('MS')

    total_obs = len(state_data)
    forecast_matrix = []
    actual_matrix = []

    # Reset per state so an order selected for a previous state is never
    # silently reused when this state's first search fails
    order = None

    # Rolling window: step i trains on rows [i, initial_train_size + i) and
    # forecasts from position initial_train_size + i onwards
    for i in range(total_obs - initial_train_size):
        origin = initial_train_size + i
        training_data = state_data.iloc[i:origin]

        # Re-select the AR order every 24 origins
        if i % 24 == 0:
            try:
                auto_model = auto_arima(training_data['Log_Return_h1'], start_p=1, start_q=0,
                                        max_p=12, max_q=0, d=0, seasonal=False,
                                        trace=False, error_action='ignore', suppress_warnings=True,
                                        stepwise=True, information_criterion='aic',
                                        n_jobs=1)
                aic_score = auto_model.aic()
            except Exception as e:
                print(f"Error updating ARIMA order at index {origin}: {e}")
                if order is None:
                    # No earlier order exists for this state: fall back
                    # explicitly and record it (AIC left as NaN) so the
                    # substitution is visible in the saved order log
                    order = fallback_order
                    p, d, q = order
                    print(f"Falling back to ARIMA order (p={p}, d={d}, q={q}) at index {origin}")
                    order_rows.append({'State': state, 'Roll': origin,
                                       'p': p, 'd': d, 'q': q,
                                       'AIC': float('nan')})
                # Otherwise keep the order selected earlier for this state
            else:
                order = auto_model.order
                p, d, q = order
                print(f"Updated ARIMA order to (p={p}, d={d}, q={q}) with AIC {aic_score} at index {origin}")
                # Log the order and AIC score
                order_rows.append({'State': state, 'Roll': origin,
                                   'p': p, 'd': d, 'q': q,
                                   'AIC': aic_score})

        # Fit ARIMA model with the current order
        arima_model = fit_arima_model(training_data['Log_Return_h1'], order)

        # Near the end of the sample fewer than max_forecast steps remain
        forecast_steps = min(max_forecast, total_obs - origin)

        # Forecast for the range
        forecast_result = arima_model.forecast(steps=forecast_steps)
        forecast_matrix.append(forecast_result.values)

        # Collect the realised values over the same positions
        actual_values = state_data['Log_Return_h1'].iloc[origin:origin + forecast_steps]
        actual_matrix.append(actual_values.values)

    # One row per origin, labelled by the positional origin; column k holds
    # the (k+1)-step value
    origin_index = pd.RangeIndex(start=initial_train_size,
                                 stop=initial_train_size + len(forecast_matrix))
    forecast_df = pd.DataFrame(forecast_matrix, index=origin_index)
    actual_df = pd.DataFrame(actual_matrix, index=origin_index)

    forecast_df['state'] = state
    actual_df['state'] = state

    forecast_frames.append(forecast_df)
    actual_frames.append(actual_df)

# Assemble the final tables (pd.concat raises on an empty list, so guard it)
all_forecasts = pd.concat(forecast_frames, ignore_index=False) if forecast_frames else pd.DataFrame()
all_actuals = pd.concat(actual_frames, ignore_index=False) if actual_frames else pd.DataFrame()

# DataFrame logging the selected ARIMA orders
order_log = pd.DataFrame(order_rows, columns=['State', 'Roll', 'p', 'd', 'q', 'AIC'])


In [None]:
# Persist the ARIMA order log in both CSV and Excel form
order_csv_path = "/content/drive/MyDrive/Thesis/Models/Predictions/AR(Optimal)_ORDER.csv"
order_xlsx_path = "/content/drive/MyDrive/Thesis/Models/Predictions/AR(Optimal)_ORDER.xlsx"
order_log.to_csv(order_csv_path)
order_log.to_excel(order_xlsx_path)

# Formatting dataframes and saving predictions and actual values

In [None]:
# Tidy up the index and column labels

# The index holds the forecast origins, which start at the initial train
# size; label it 'Origin'. Only all_forecasts gets the name here.
all_forecasts.index = all_forecasts.index.rename('Origin')

# Define a function to rename columns
def rename_columns(df, max_horizon=12):
    """Return the column labels of ``df`` with horizon offsets renamed.

    Integer labels ``0 .. max_horizon - 1`` become ``'h1' .. 'h<max_horizon>'``.
    Every other label (``'state'``, out-of-range integers, strings, ...) is
    returned unchanged.

    Args:
        df: Object with an iterable ``columns`` attribute (a DataFrame).
        max_horizon: Number of horizon columns to rename. Defaults to 12,
            i.e. labels 0..11.

    Returns:
        list: The new labels, in the same order as ``df.columns``.
    """
    def _is_horizon(label):
        # bool is a subclass of int in Python but is never a horizon offset
        if isinstance(label, (bool, np.bool_)):
            return False
        # Accept NumPy integer labels as well as plain Python ints
        return isinstance(label, (int, np.integer)) and 0 <= label < max_horizon

    return [f'h{int(col) + 1}' if _is_horizon(col) else col for col in df.columns]

# Give both wide tables the 'h1'..'h12' horizon column names
for table in (all_forecasts, all_actuals):
    table.columns = rename_columns(table)

In [None]:
# Locations of the wide-format (one column per horizon) actuals and forecasts
path2 = "/content/drive/MyDrive/Thesis/Models/Predictions/AR(Optimal)ActualsFormat.csv"
path3 = "/content/drive/MyDrive/Thesis/Models/Predictions/AR(Optimal)PredictionsFormat.csv"

# Write both tables; the index is written as the first column of each file
all_forecasts.to_csv(path3)
all_actuals.to_csv(path2)

In [None]:
# Loading again
Predictions_load = pd.read_csv(path3, sep=',')
# The index column of the actuals file has no header, so read_csv labels it
# 'Unnamed: 0'; rename it to 'Origin'
Actuals_load = pd.read_csv(path2, sep=',').rename(columns={'Unnamed: 0': 'Origin'})

# Full copy of the price table; later cells read its 'Date' column
columns = price.copy()

## Summing the per-step forecasts in separate dataframes to create one cumulative prediction column per horizon

In [None]:
# One frame per horizon, keeping only the step columns that horizon needs
# plus 'state' and 'Origin'; rows with any missing value are dropped
h1_cols = ['state', 'h1', 'Origin']
h3_cols = ['state', 'h1', 'h2', 'h3', 'Origin']
h6_cols = ['state', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'Origin']

forecast_h1 = Predictions_load.loc[:, h1_cols].dropna()
forecast_h3 = Predictions_load.loc[:, h3_cols].dropna()
forecast_h6 = Predictions_load.loc[:, h6_cols].dropna()
# The 12-step frame uses every column of the loaded table
forecast_h12 = Predictions_load.dropna()

In [None]:
# Each horizon's prediction is the row-wise sum of its step columns; the step
# columns are removed once the sum has been stored.

# h3
h3_sum = ['h1', 'h2', 'h3']
forecast_h3 = (
    forecast_h3
    .assign(Prediction_h3=forecast_h3[h3_sum].sum(axis=1))
    .drop(columns=h3_sum)
)

# h6
h6_sum = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
forecast_h6 = (
    forecast_h6
    .assign(Prediction_h6=forecast_h6[h6_sum].sum(axis=1))
    .drop(columns=h6_sum)
)

# h12
h12_sum = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'h9', 'h10', 'h11', 'h12']
forecast_h12 = (
    forecast_h12
    .assign(Prediction_h12=forecast_h12[h12_sum].sum(axis=1))
    .drop(columns=h12_sum)
)

## Same for actuals

In [None]:
# One frame per horizon, keeping only the step columns that horizon needs
# plus 'state' and 'Origin'; rows with any missing value are dropped
actual_h1_cols = ['state', 'h1', 'Origin']
actual_h3_cols = ['state', 'h1', 'h2', 'h3', 'Origin']
actual_h6_cols = ['state', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'Origin']

actuals_h1 = Actuals_load.loc[:, actual_h1_cols].dropna()
actuals_h3 = Actuals_load.loc[:, actual_h3_cols].dropna()
actuals_h6 = Actuals_load.loc[:, actual_h6_cols].dropna()
# The 12-step frame uses every column of the loaded table
actuals_h12 = Actuals_load.dropna()

In [None]:
# Each horizon's realised value is the row-wise sum of its step columns; the
# step columns are removed once the sum has been stored.

# h3
h3_sum = ['h1', 'h2', 'h3']
actuals_h3 = (
    actuals_h3
    .assign(actuals_h3=actuals_h3[h3_sum].sum(axis=1))
    .drop(columns=h3_sum)
)

# h6
h6_sum = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
actuals_h6 = (
    actuals_h6
    .assign(actuals_h6=actuals_h6[h6_sum].sum(axis=1))
    .drop(columns=h6_sum)
)

# h12
h12_sum = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'h9', 'h10', 'h11', 'h12']
actuals_h12 = (
    actuals_h12
    .assign(actuals_h12=actuals_h12[h12_sum].sum(axis=1))
    .drop(columns=h12_sum)
)

## Creating Dataframes with Actuals and Predictions

In [None]:
# Build, for every horizon, a lookup table from 'Origin' to 'PredictionDate'.
# The dates of a single state ('AK') serve as the calendar.
origins2 = price[['Date', 'GEO_Name']].dropna()
# .copy() makes `origins` an independent frame, so adding a column to it
# below does not raise pandas' SettingWithCopyWarning
origins = origins2[origins2['GEO_Name'] == 'AK'].copy()
origins['PredictionDate'] = origins['Date']

# NOTE(review): 'Origin' below is the row label these rows carry in `price`,
# while the forecast tables are indexed by positional origins within each
# state's series. The two only line up if the 'AK' rows sit at positions
# 0..n-1 of `price` and none of them was removed when rows with a missing
# Log_Return_h1 were dropped -- confirm against the data.


def _origin_dates(calendar, horizon):
    """Pair each row label with the date found `horizon` rows further down.

    The last `horizon` rows have no such date and are dropped.
    """
    return (
        calendar[['PredictionDate']]
        .shift(-horizon)
        .dropna()
        .reset_index()
        .rename(columns={'index': 'Origin'})
    )


# h1
origins_h1 = _origin_dates(origins, 1)

# h3
origins_h3 = _origin_dates(origins, 3)

# h6
origins_h6 = _origin_dates(origins, 6)

# h12
origins_h12 = _origin_dates(origins, 12)

In [None]:
## Forecasts ##
# Attach the prediction date to every horizon's forecasts: inner-merge on
# 'Origin', order by state and origin, renumber the rows, and drop 'Origin'
prediction_h1, prediction_h3, prediction_h6, prediction_h12 = (
    pd.merge(values, dates, on='Origin', how='inner')
    .sort_values(by=['state', 'Origin'])
    .reset_index(drop=True)
    .drop(columns='Origin')
    for values, dates in (
        (forecast_h1, origins_h1),
        (forecast_h3, origins_h3),
        (forecast_h6, origins_h6),
        (forecast_h12, origins_h12),
    )
)

In [None]:
## Actuals ##
# Attach the prediction date to every horizon's realised values: inner-merge
# on 'Origin', order by state and origin, renumber the rows, and drop 'Origin'
Returns_h1, Returns_h3, Returns_h6, Returns_h12 = (
    pd.merge(values, dates, on='Origin', how='inner')
    .sort_values(by=['state', 'Origin'])
    .reset_index(drop=True)
    .drop(columns='Origin')
    for values, dates in (
        (actuals_h1, origins_h1),
        (actuals_h3, origins_h3),
        (actuals_h6, origins_h6),
        (actuals_h12, origins_h12),
    )
)

In [None]:
# Index every horizon table by (state, PredictionDate), in place
horizon_tables = (
    Returns_h1, Returns_h3, Returns_h6, Returns_h12,
    prediction_h1, prediction_h3, prediction_h6, prediction_h12,
)
for frame in horizon_tables:
    frame.set_index(['state', 'PredictionDate'], inplace=True)

In [None]:
# Outer-join the longer horizons onto the h1 table, so every
# (state, PredictionDate) pair present in any horizon is kept
longer_actuals = [Returns_h3, Returns_h6, Returns_h12]
longer_forecasts = [prediction_h3, prediction_h6, prediction_h12]

Actuals = Returns_h1.join(longer_actuals, how='outer')
Forecasts = prediction_h1.join(longer_forecasts, how='outer')

In [None]:
# Use the same short horizon names ('h3', 'h6', 'h12') in both tables
forecast_names = {'Prediction_h3': 'h3',
                  'Prediction_h6': 'h6',
                  'Prediction_h12': 'h12'}
actual_names = {'actuals_h3': 'h3',
                'actuals_h6': 'h6',
                'actuals_h12': 'h12'}

Forecasts = Forecasts.rename(columns=forecast_names)
Actuals = Actuals.rename(columns=actual_names)

In [None]:
# Move 'state' out of the index into a regular column, leaving
# PredictionDate as the only index level
Forecasts.reset_index(level='state', inplace=True)
Actuals.reset_index(level='state', inplace=True)

In [None]:
# Save the final per-horizon forecasts and actuals to csv
forecasts_out = "/content/drive/MyDrive/Thesis/Models/Predictions/AR(Optimal)Predictions.csv"
actuals_out = "/content/drive/MyDrive/Thesis/Models/Predictions/AR(Optimal)Actuals.csv"
Forecasts.to_csv(forecasts_out)
Actuals.to_csv(actuals_out)

In [None]:
# END