In [20]:
#This script performs several setps of time series analyis
# First section: Order selection for ARIMAX model based on lowest AIC and BIC for both the base case (without ex. variables)
# and the complete model using sentiment scores
# Second section: Testing for model performance improvement when including sentiment scores (either msci and own sentiment score)

# The aim of our research is not to identify multicollinearity, endogeneity, the only purpose is to test in all the following scripts whether the predictive power of two different models can be improved.

import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("./tsdata/modeldata.csv", index_col='Date', parse_dates=True)
df.index.freq = "MS"


scaler = MinMaxScaler()
df[['returns', 'msci']] = scaler.fit_transform(df[['returns', 'msci']])

df['returns_lag1'] = df['returns'].shift(1)
df['msci_lag1'] = df['msci'].shift(1)
df['sentiment_lag1'] = df['sentiment'].shift(1)

df.dropna(inplace=True)

train = df[:'2020-01-01']
test = df['2020-01-02':]

In [7]:
#First section

def evaluateModel(data, exog, order):
    model = SARIMAX(data, exog=exog, order=(order[0], 0, order[1]), enforce_stationarity=True, enforce_invertibility=True)
    model_fit = model.fit(disp=False, maxiter = 300)
    return order, model_fit.aic, model_fit.bic

orders = [(p, q) for p in range(5) for q in range(5) if (p, q) != (0, 0)]

results_base = []
results_exog = []

#Base model
for order in orders:
    try:
        order, aic, bic = evaluateModel(returntrain, None, order)
        results_base.append([order, aic, bic])
    except Exception as e:
        print(f"Order {order} failed: {e}")

#Including sentiment scores
for order in orders:
    try:
        order, aic_exo, bic_exo = evaluateModel(returntrain, mscitrain, order)
        results_exog.append([order, aic_exo, bic_exo])
    except Exception as e:
        print(f"Order {order} with exogenous variables failed: {e}")

df_base = pd.DataFrame(results_base, columns=['Order', 'AIC', 'BIC'])
df_exog = pd.DataFrame(results_exog, columns=['Order', 'AIC', 'BIC'])

In [5]:
min_aic_base_index = df_base['AIC'].idxmin()
min_bic_base_index = df_base['BIC'].idxmin()

min_aic_base_order = df_base.loc[min_aic_base_index]
min_bic_base_order = df_base.loc[min_bic_base_index]

min_aic_exog_index = df_exog['AIC'].idxmin()
min_bic_exog_index = df_exog['BIC'].idxmin()

min_aic_exog_order = df_exog.loc[min_aic_exog_index]
min_bic_exog_order = df_exog.loc[min_bic_exog_index]

# Printing the results
print("Base model with lowest AIC:", min_aic_base_order)
print("Base model with lowest BIC:", min_bic_base_order)
print("Exog model with lowest AIC:", min_aic_exog_order)
print("Exog model with lowest BIC:", min_bic_exog_order)

Base model with lowest AIC: Order        (0, 1)
AIC      -265.93149
BIC     -260.802794
Name: 0, dtype: object
Base model with lowest BIC: Order        (0, 1)
AIC      -265.93149
BIC     -260.802794
Name: 0, dtype: object
Exog model with lowest AIC: Order        (0, 1)
AIC     -264.016992
BIC     -256.323947
Name: 0, dtype: object
Exog model with lowest BIC: Order        (0, 1)
AIC     -264.016992
BIC     -256.323947
Name: 0, dtype: object


In [6]:
min_aic_base = df_base['AIC'].min()
min_bic_base = df_base['BIC'].min()
best_base_model = df_base[(df_base['AIC'] == min_aic_base) & (df_base['BIC'] == min_bic_base)]


min_aic_exog = df_exog['AIC'].min()
min_bic_exog = df_exog['BIC'].min()
best_exog_model = df_exog[(df_exog['AIC'] == min_aic_exog) & (df_exog['BIC'] == min_bic_exog)]

print("Best base model (lowest AIC and BIC):")
print(best_base_model)
print("\nBest exogenous model (lowest AIC and BIC):")
print(best_exog_model)

Best base model (lowest AIC and BIC):
    Order        AIC         BIC
0  (0, 1) -265.93149 -260.802794

Best exogenous model (lowest AIC and BIC):
    Order         AIC         BIC
0  (0, 1) -264.016992 -256.323947
