# Import

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mount Google Drive at /content/drive; the data paths read and written by
# this notebook all start with that prefix.
from google.colab import drive
drive.mount('/content/drive')

# Actuals for mean benchmark

In [None]:
# Load the merged price/feature table and index it by its 'Date' column.
path1 = "/content/drive/MyDrive/Thesis/Data/Merging CleanPrice & Features.csv"
price = pd.read_csv(path1, sep=',').set_index('Date')

# Independent working copy, so later cells never modify `price` itself.
RawData = price.copy()

# Making Mean Benchmark

In [None]:
# Rolling-mean benchmark: for every state, the benchmark at horizon h is the
# rolling mean of the h-step log return, lagged by h rows.
ROLLING_WINDOW = 330                # observations in each rolling mean
BENCHMARK_HORIZONS = (1, 3, 6, 12)  # horizons present as Log_Return_h<h>

columns = ['GEO_Name'] + [f'Log_Return_h{h}' for h in BENCHMARK_HORIZONS]
actual_data = RawData[columns].copy()
mean_dfs = []

for _, group in actual_data.groupby('GEO_Name'):
    # NOTE(review): shifting by h rows assumes each state's rows are
    # consecutive periods in chronological order - confirm against the data.
    rolling_means = {
        f'Mean_h{h}': group[f'Log_Return_h{h}'].rolling(window=ROLLING_WINDOW).mean().shift(h)
        for h in BENCHMARK_HORIZONS
    }
    # assign() builds a new frame, so the groupby slice is never written to
    # (writing to it triggers SettingWithCopyWarning).
    mean_dfs.append(group.assign(**rolling_means))

# The index is 'Date'; reset_index() turns it back into a regular column.
actual_data = pd.concat(mean_dfs).reset_index()
MeanBenchmark = actual_data[['Date', 'GEO_Name', 'Mean_h1', 'Mean_h3', 'Mean_h6', 'Mean_h12']].rename(columns={'GEO_Name': 'state', 'Mean_h1': 'h1', 'Mean_h3': 'h3', 'Mean_h6': 'h6', 'Mean_h12': 'h12'})

# Load the data

In [None]:
# Read the actuals and each model's predictions; the file order below matches
# the order of the names on the left-hand side.
(actuals, AR_1, AR_optimal, ARIMA, RandomForest, XGBoost) = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        "/content/drive/MyDrive/Thesis/Models/Predictions/AR(1)Actuals.csv",
        "/content/drive/MyDrive/Thesis/Models/Predictions/AR(1)Predictions.csv",
        "/content/drive/MyDrive/Thesis/Models/Predictions/AR(Optimal)Predictions.csv",
        "/content/drive/MyDrive/Thesis/Models/Predictions/ARIMAPredictions.csv",
        "/content/drive/MyDrive/Thesis/Models/Predictions/RandomForestPredictions.csv",
        "/content/drive/MyDrive/Thesis/Models/Predictions/XGBoostPredictions.csv",
    )
)

In [None]:
# Give every frame the same key column names: 'Date' and 'state'.
prediction_date_columns = {'PredictionDate': 'Date'}
actuals = actuals.rename(columns=prediction_date_columns)
AR_1 = AR_1.rename(columns=prediction_date_columns)
AR_optimal = AR_optimal.rename(columns=prediction_date_columns)
ARIMA = ARIMA.rename(columns=prediction_date_columns)

# The tree-model files use 'OriginalIndex' and 'State' instead.
tree_model_columns = {'OriginalIndex': 'Date', 'State': 'state'}
RandomForest = RandomForest.rename(columns=tree_model_columns)
XGBoost = XGBoost.rename(columns=tree_model_columns)

In [None]:
# Combination forecast: the plain average of the four models' predictions for
# each (Date, state) pair.
# Dates are parsed per frame *before* stacking: the four files are still raw
# CSV text here, and grouping/sorting on text would keep equal dates apart if
# they are written differently and would order them alphabetically.
component_models = [AR_optimal, ARIMA, RandomForest, XGBoost]
combined_df1 = pd.concat(
    [frame.assign(Date=pd.to_datetime(frame['Date'])) for frame in component_models],
    ignore_index=True,
)
averaged_predictions1 = combined_df1.groupby(['Date', 'state']).mean().reset_index()
Combination_AR_1 = averaged_predictions1.sort_values(by=['state', 'Date']).reset_index(drop=True)

In [None]:
# Removing 'USA' from the predictions and actuals
def _without_usa(frame):
    """Return an independent copy of `frame` without the rows whose state is 'USA'.

    The explicit copy matters: later cells assign into these frames, and
    assigning into a boolean-mask slice raises SettingWithCopyWarning.
    """
    return frame[frame['state'] != 'USA'].copy()


actuals = _without_usa(actuals)
AR_1 = _without_usa(AR_1)
AR_optimal = _without_usa(AR_optimal)
ARIMA = _without_usa(ARIMA)
RandomForest = _without_usa(RandomForest)
XGBoost = _without_usa(XGBoost)
Combination_AR_1 = _without_usa(Combination_AR_1)
MeanBenchmark = _without_usa(MeanBenchmark)

In [None]:
# Parse the 'Date' column of every frame into pandas datetimes (in place).
for frame in (
    actuals,
    AR_1,
    AR_optimal,
    ARIMA,
    RandomForest,
    XGBoost,
    Combination_AR_1,
    MeanBenchmark,
):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [None]:
# Keep only the sample that excludes the crisis: 2011-01-01 through
# 2019-12-01, both ends included.
window_start = pd.Timestamp('2011-01-01')
window_end = pd.Timestamp('2019-12-01')


def _in_window(frame):
    """Return the rows of `frame` dated inside the window, renumbered from 0."""
    keep = (frame['Date'] >= window_start) & (frame['Date'] <= window_end)
    return frame[keep].reset_index(drop=True)


actuals = _in_window(actuals)
AR_1 = _in_window(AR_1)
AR_optimal = _in_window(AR_optimal)
ARIMA = _in_window(ARIMA)
XGBoost = _in_window(XGBoost)
RandomForest = _in_window(RandomForest)
Combination_AR_1 = _in_window(Combination_AR_1)
MeanBenchmark = _in_window(MeanBenchmark)

# MSE and MAE across states

## MSE

In [None]:
# Forecast frames to score, keyed by the label reported in the MSE table.
models = dict([
    ('Benchmark', MeanBenchmark),
    ('AR_1', AR_1),
    ('XGBoost', XGBoost),
    ('RandomForest', RandomForest),
    ('AR_optimal', AR_optimal),
    ('ARIMA', ARIMA),
    ('Combination_AR_1', Combination_AR_1),
])

In [None]:
# MSE of every model at every horizon, computed separately for each state.
# Result layout: {state: {model_name: {horizon: MSE}}}
mse_by_state = {}

states = actuals['state'].unique()
horizons = ['h1', 'h3', 'h6', 'h12']

for state in states:
    # The actuals of a state are the same for every model, so select them once.
    actual_rows = actuals.loc[actuals['state'] == state, ['Date'] + horizons]
    state_mse = {}
    for model_name, df_model in models.items():
        predicted_rows = df_model.loc[df_model['state'] == state, ['Date'] + horizons]

        # Pair actual and predicted values on Date rather than by row position,
        # so a frame stored in a different row order is still scored correctly.
        paired = actual_rows.merge(
            predicted_rows, on='Date', suffixes=('_true', '_pred'), validate='one_to_one'
        )
        if len(paired) != len(actual_rows):
            raise ValueError(
                f"Model '{model_name}' has no prediction for some actual dates of state '{state}'"
            )

        # Calculating MSE
        state_mse[model_name] = {
            horizon: mean_squared_error(paired[f'{horizon}_true'], paired[f'{horizon}_pred'])
            for horizon in horizons
        }
    mse_by_state[state] = state_mse

In [None]:
# Flatten {state: {model: {horizon: MSE}}} into one table row per combination.
# The comprehension's names stay local to it, so the notebook-level `models`
# and `horizons` (read by the metric cells) are not overwritten.
rows = [
    {'State': state, 'Model': model, 'Horizon': horizon, 'MSE': MSE}
    for state, state_results in mse_by_state.items()
    for model, horizon_results in state_results.items()
    for horizon, MSE in horizon_results.items()
]

mse_state = pd.DataFrame(rows)
mse_state.reset_index(drop=True)

Unnamed: 0,State,Model,Horizon,MSE
0,AK,Benchmark,h1,0.000009
1,AK,Benchmark,h3,0.000056
2,AK,Benchmark,h6,0.000170
3,AK,Benchmark,h12,0.000485
4,AK,AR_1,h1,0.000005
...,...,...,...,...
1395,WY,ARIMA,h12,0.000431
1396,WY,Combination_AR_1,h1,0.000006
1397,WY,Combination_AR_1,h3,0.000055
1398,WY,Combination_AR_1,h6,0.000114


In [None]:
# Save the per-state MSE table as CSV and as an Excel workbook.
mse_csv_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MSE_States.csv"
mse_xlsx_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MSE_States.xlsx"
mse_state.to_csv(mse_csv_path)
mse_state.to_excel(mse_xlsx_path)

## MAE

In [None]:
# Forecast frames to score, keyed by the label reported in the MAE table.
models = dict([
    ('Benchmark', MeanBenchmark),
    ('AR_1', AR_1),
    ('XGBoost', XGBoost),
    ('RandomForest', RandomForest),
    ('AR_optimal', AR_optimal),
    ('ARIMA', ARIMA),
    ('Combination_AR_1', Combination_AR_1),
])

In [None]:
# MAE of every model at every horizon, computed separately for each state.
# Result layout: {state: {model_name: {horizon: MAE}}}
mae_by_state = {}
states = actuals['state'].unique()
horizons = ['h1', 'h3', 'h6', 'h12']

for state in states:
    # The actuals of a state are the same for every model, so select them once.
    actual_rows = actuals.loc[actuals['state'] == state, ['Date'] + horizons]
    state_mae = {}
    for model_name, df_model in models.items():
        predicted_rows = df_model.loc[df_model['state'] == state, ['Date'] + horizons]

        # Pair actual and predicted values on Date rather than by row position,
        # so a frame stored in a different row order is still scored correctly.
        paired = actual_rows.merge(
            predicted_rows, on='Date', suffixes=('_true', '_pred'), validate='one_to_one'
        )
        if len(paired) != len(actual_rows):
            raise ValueError(
                f"Model '{model_name}' has no prediction for some actual dates of state '{state}'"
            )

        # Calculating MAE
        state_mae[model_name] = {
            horizon: mean_absolute_error(paired[f'{horizon}_true'], paired[f'{horizon}_pred'])
            for horizon in horizons
        }
    mae_by_state[state] = state_mae

In [None]:
# Flatten {state: {model: {horizon: MAE}}} into one table row per combination.
# The comprehension's names stay local to it, so the notebook-level `models`
# and `horizons` (read by the metric cells) are not overwritten.
rows = [
    {'State': state, 'Model': model, 'Horizon': horizon, 'MAE': MAE}
    for state, state_results in mae_by_state.items()
    for model, horizon_results in state_results.items()
    for horizon, MAE in horizon_results.items()
]

mae_state = pd.DataFrame(rows)
mae_state.reset_index(drop=True)

In [None]:
# Save the per-state MAE table as CSV and as an Excel workbook.
mae_csv_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MAE_States.csv"
mae_xlsx_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MAE_States.xlsx"
mae_state.to_csv(mae_csv_path)
mae_state.to_excel(mae_xlsx_path)

# Ratio model comparisons (MEAN)

## MSE Ratios

In [None]:
# Frames to compare; the 'Benchmark' entry (rolling mean) is the denominator
# of the MSE ratios.
models = dict([
    ('Benchmark', MeanBenchmark),
    ('AR_1', AR_1),
    ('XGBoost', XGBoost),
    ('RandomForest', RandomForest),
    ('AR_optimal', AR_optimal),
    ('ARIMA', ARIMA),
    ('Combination', Combination_AR_1),
])

In [None]:
# MSE of every model at every horizon, computed separately for each state.
# Result layout: {state: {model_name: {horizon: MSE}}}
mse_by_state = {}

states = actuals['state'].unique()
horizons = ['h1', 'h3', 'h6', 'h12']

for state in states:
    # The actuals of a state are the same for every model, so select them once.
    actual_rows = actuals.loc[actuals['state'] == state, ['Date'] + horizons]
    state_mse = {}
    for model_name, df_model in models.items():
        predicted_rows = df_model.loc[df_model['state'] == state, ['Date'] + horizons]

        # Pair actual and predicted values on Date rather than by row position,
        # so a frame stored in a different row order is still scored correctly.
        paired = actual_rows.merge(
            predicted_rows, on='Date', suffixes=('_true', '_pred'), validate='one_to_one'
        )
        if len(paired) != len(actual_rows):
            raise ValueError(
                f"Model '{model_name}' has no prediction for some actual dates of state '{state}'"
            )

        # Calculating MSE
        state_mse[model_name] = {
            horizon: mean_squared_error(paired[f'{horizon}_true'], paired[f'{horizon}_pred'])
            for horizon in horizons
        }
    mse_by_state[state] = state_mse

In [None]:
# Divide each model's MSE by the benchmark's MSE for the same state and horizon.
mse_ratios_by_state = {}

for state, mse_data in mse_by_state.items():
    benchmark_mse = mse_data['Benchmark']
    state_ratios = {}
    for model, mses in mse_data.items():
        ratios = {}
        for horizon, mse in mses.items():
            denominator = benchmark_mse[horizon]
            # A benchmark error of exactly zero leaves the ratio undefined.
            ratios[horizon] = None if denominator == 0 else mse / denominator
        state_ratios[model] = ratios
    mse_ratios_by_state[state] = state_ratios

In [None]:
# One row per (state, model, horizon); the benchmark's own ratio is skipped.
# The comprehension's names stay local to it, so the notebook-level `models`
# dict is not overwritten.
data = [
    (state, model, horizon, ratio)
    for state, state_ratios in mse_ratios_by_state.items()
    for model, ratios in state_ratios.items()
    if model != 'Benchmark'
    for horizon, ratio in ratios.items()
]

# reset_index() is part of the assignment so the saved files get a clean
# 0..n-1 index that follows the sorted row order.
# Note: 'Horizon' is text, so it sorts as h1, h12, h3, h6.
df_ratios = (
    pd.DataFrame(data, columns=['State', 'Model', 'Horizon', 'MSE Ratio'])
    .sort_values(by=['State', 'Horizon'])
    .reset_index(drop=True)
)
df_ratios

In [None]:
# Save the MSE ratios (rolling-mean benchmark) as CSV and as an Excel workbook.
ratios_csv_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MSERatiosAllHorizonsMEANBenchmark.csv"
ratios_xlsx_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MSERatiosAllHorizonsMEANBenchmark.xlsx"
df_ratios.to_csv(ratios_csv_path)
df_ratios.to_excel(ratios_xlsx_path)

## MAE Ratios

In [None]:
# Frames to compare; the 'Benchmark' entry (rolling mean) is the denominator
# of the MAE ratios.
models = dict([
    ('Benchmark', MeanBenchmark),
    ('AR_1', AR_1),
    ('XGBoost', XGBoost),
    ('RandomForest', RandomForest),
    ('AR_optimal', AR_optimal),
    ('ARIMA', ARIMA),
    ('Combination', Combination_AR_1),
])

In [None]:
# MAE of every model at every horizon, computed separately for each state.
# Result layout: {state: {model_name: {horizon: MAE}}}
mae_by_state = {}
states = actuals['state'].unique()
horizons = ['h1', 'h3', 'h6', 'h12']

for state in states:
    # The actuals of a state are the same for every model, so select them once.
    actual_rows = actuals.loc[actuals['state'] == state, ['Date'] + horizons]
    state_mae = {}
    for model_name, df_model in models.items():
        predicted_rows = df_model.loc[df_model['state'] == state, ['Date'] + horizons]

        # Pair actual and predicted values on Date rather than by row position,
        # so a frame stored in a different row order is still scored correctly.
        paired = actual_rows.merge(
            predicted_rows, on='Date', suffixes=('_true', '_pred'), validate='one_to_one'
        )
        if len(paired) != len(actual_rows):
            raise ValueError(
                f"Model '{model_name}' has no prediction for some actual dates of state '{state}'"
            )

        # Calculating MAE
        state_mae[model_name] = {
            horizon: mean_absolute_error(paired[f'{horizon}_true'], paired[f'{horizon}_pred'])
            for horizon in horizons
        }
    mae_by_state[state] = state_mae

In [None]:
# Divide each model's MAE by the benchmark's MAE for the same state and horizon.
mae_ratios_by_state = {}

for state, mae_data in mae_by_state.items():
    benchmark_mae = mae_data['Benchmark']
    state_ratios = {}
    for model, maes in mae_data.items():
        ratios = {}
        for horizon, mae in maes.items():
            denominator = benchmark_mae[horizon]
            # A benchmark error of exactly zero leaves the ratio undefined.
            ratios[horizon] = None if denominator == 0 else mae / denominator
        state_ratios[model] = ratios
    mae_ratios_by_state[state] = state_ratios

In [None]:
# One row per (state, model, horizon); the benchmark's own ratio is skipped.
# The comprehension's names stay local to it, so the notebook-level `models`
# dict is not overwritten.
data = [
    (state, model, horizon, ratio)
    for state, state_ratios in mae_ratios_by_state.items()
    for model, ratios in state_ratios.items()
    if model != 'Benchmark'
    for horizon, ratio in ratios.items()
]

# reset_index() is part of the assignment so the saved files get a clean
# 0..n-1 index that follows the sorted row order.
# Note: 'Horizon' is text, so it sorts as h1, h12, h3, h6.
df_ratios = (
    pd.DataFrame(data, columns=['State', 'Model', 'Horizon', 'mae Ratio'])
    .sort_values(by=['State', 'Horizon'])
    .reset_index(drop=True)
)
df_ratios

In [None]:
# Save the MAE ratios (rolling-mean benchmark) as CSV and as an Excel workbook.
ratios_csv_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MAERatiosAllHorizonsMEANBenchmark.csv"
ratios_xlsx_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MAERatiosAllHorizonsMEANBenchmark.xlsx"
df_ratios.to_csv(ratios_csv_path)
df_ratios.to_excel(ratios_xlsx_path)

# Ratio model comparisons (AR_1)

## MSE Ratios

In [None]:
# Frames to compare; here the AR(1) predictions take the 'Benchmark' slot and
# become the denominator of the MSE ratios.
models = dict([
    ('Benchmark', AR_1),
    ('XGBoost', XGBoost),
    ('RandomForest', RandomForest),
    ('AR_optimal', AR_optimal),
    ('ARIMA', ARIMA),
    ('Combination', Combination_AR_1),
])

In [None]:
# MSE of every model at every horizon, computed separately for each state.
# Result layout: {state: {model_name: {horizon: MSE}}}
mse_by_state = {}

states = actuals['state'].unique()
horizons = ['h1', 'h3', 'h6', 'h12']

for state in states:
    # The actuals of a state are the same for every model, so select them once.
    actual_rows = actuals.loc[actuals['state'] == state, ['Date'] + horizons]
    state_mse = {}
    for model_name, df_model in models.items():
        predicted_rows = df_model.loc[df_model['state'] == state, ['Date'] + horizons]

        # Pair actual and predicted values on Date rather than by row position,
        # so a frame stored in a different row order is still scored correctly.
        paired = actual_rows.merge(
            predicted_rows, on='Date', suffixes=('_true', '_pred'), validate='one_to_one'
        )
        if len(paired) != len(actual_rows):
            raise ValueError(
                f"Model '{model_name}' has no prediction for some actual dates of state '{state}'"
            )

        # Calculating MSE
        state_mse[model_name] = {
            horizon: mean_squared_error(paired[f'{horizon}_true'], paired[f'{horizon}_pred'])
            for horizon in horizons
        }
    mse_by_state[state] = state_mse

In [None]:
# Divide each model's MSE by the benchmark's MSE for the same state and horizon.
mse_ratios_by_state = {}

for state, mse_data in mse_by_state.items():
    benchmark_mse = mse_data['Benchmark']
    state_ratios = {}
    for model, mses in mse_data.items():
        ratios = {}
        for horizon, mse in mses.items():
            denominator = benchmark_mse[horizon]
            # A benchmark error of exactly zero leaves the ratio undefined.
            ratios[horizon] = None if denominator == 0 else mse / denominator
        state_ratios[model] = ratios
    mse_ratios_by_state[state] = state_ratios

In [None]:
# One row per (state, model, horizon); the benchmark's own ratio is skipped.
# The comprehension's names stay local to it, so the notebook-level `models`
# dict is not overwritten.
data = [
    (state, model, horizon, ratio)
    for state, state_ratios in mse_ratios_by_state.items()
    for model, ratios in state_ratios.items()
    if model != 'Benchmark'
    for horizon, ratio in ratios.items()
]

# reset_index() is part of the assignment so the saved files get a clean
# 0..n-1 index that follows the sorted row order.
# Note: 'Horizon' is text, so it sorts as h1, h12, h3, h6.
df_ratios = (
    pd.DataFrame(data, columns=['State', 'Model', 'Horizon', 'MSE Ratio'])
    .sort_values(by=['State', 'Horizon'])
    .reset_index(drop=True)
)
df_ratios

In [None]:
# Save the MSE ratios (AR(1) benchmark) as CSV and as an Excel workbook.
ratios_csv_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MSERatiosAllHorizonsAR(1)Benchmark.csv"
ratios_xlsx_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MSERatiosAllHorizonsAR(1)Benchmark.xlsx"
df_ratios.to_csv(ratios_csv_path)
df_ratios.to_excel(ratios_xlsx_path)

## MAE Ratios

In [None]:
# Frames to compare; here the AR(1) predictions take the 'Benchmark' slot and
# become the denominator of the MAE ratios.
models = dict([
    ('Benchmark', AR_1),
    ('XGBoost', XGBoost),
    ('RandomForest', RandomForest),
    ('AR_optimal', AR_optimal),
    ('ARIMA', ARIMA),
    ('Combination', Combination_AR_1),
])

In [None]:
# MAE of every model at every horizon, computed separately for each state.
# Result layout: {state: {model_name: {horizon: MAE}}}
mae_by_state = {}
states = actuals['state'].unique()
horizons = ['h1', 'h3', 'h6', 'h12']

for state in states:
    # The actuals of a state are the same for every model, so select them once.
    actual_rows = actuals.loc[actuals['state'] == state, ['Date'] + horizons]
    state_mae = {}
    for model_name, df_model in models.items():
        predicted_rows = df_model.loc[df_model['state'] == state, ['Date'] + horizons]

        # Pair actual and predicted values on Date rather than by row position,
        # so a frame stored in a different row order is still scored correctly.
        paired = actual_rows.merge(
            predicted_rows, on='Date', suffixes=('_true', '_pred'), validate='one_to_one'
        )
        if len(paired) != len(actual_rows):
            raise ValueError(
                f"Model '{model_name}' has no prediction for some actual dates of state '{state}'"
            )

        # Calculating MAE
        state_mae[model_name] = {
            horizon: mean_absolute_error(paired[f'{horizon}_true'], paired[f'{horizon}_pred'])
            for horizon in horizons
        }
    mae_by_state[state] = state_mae

In [None]:
# Divide each model's MAE by the benchmark's MAE for the same state and horizon.
mae_ratios_by_state = {}

for state, mae_data in mae_by_state.items():
    benchmark_mae = mae_data['Benchmark']
    state_ratios = {}
    for model, maes in mae_data.items():
        ratios = {}
        for horizon, mae in maes.items():
            denominator = benchmark_mae[horizon]
            # A benchmark error of exactly zero leaves the ratio undefined.
            ratios[horizon] = None if denominator == 0 else mae / denominator
        state_ratios[model] = ratios
    mae_ratios_by_state[state] = state_ratios

In [None]:
# One row per (state, model, horizon); the benchmark's own ratio is skipped.
# The comprehension's names stay local to it, so the notebook-level `models`
# dict is not overwritten.
data = [
    (state, model, horizon, ratio)
    for state, state_ratios in mae_ratios_by_state.items()
    for model, ratios in state_ratios.items()
    if model != 'Benchmark'
    for horizon, ratio in ratios.items()
]

# reset_index() is part of the assignment so the saved files get a clean
# 0..n-1 index that follows the sorted row order.
# Note: 'Horizon' is text, so it sorts as h1, h12, h3, h6.
df_ratios = (
    pd.DataFrame(data, columns=['State', 'Model', 'Horizon', 'mae Ratio'])
    .sort_values(by=['State', 'Horizon'])
    .reset_index(drop=True)
)
df_ratios

In [None]:
# Save the MAE ratios (AR(1) benchmark) as CSV and as an Excel workbook.
ratios_csv_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MAERatiosAllHorizonsAR(1)Benchmark.csv"
ratios_xlsx_path = "/content/drive/MyDrive/Thesis/Performance Calculations/MAERatiosAllHorizonsAR(1)Benchmark.xlsx"
df_ratios.to_csv(ratios_csv_path)
df_ratios.to_excel(ratios_xlsx_path)

In [None]:
# END