In [28]:
#This script performs several setps of time series analyis
# First section: Order selection for ARIMAX model based on lowest AIC and BIC for both the base case (without ex. variables)
# and the complete model using sentiment scores
# Second section: Testing for model performance improvement when including sentiment scores (either msci and own sentiment score)

# The aim of our research is not to identify multicollinearity, endogeneity, the only purpose is to test in all the following scripts whether the predictive power of two different models can be improved.

import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("./tsdata/modeldata.csv", index_col='Date', parse_dates=True)
df.index.freq = "MS"


scaler = MinMaxScaler()
df[['returns', 'msci']] = scaler.fit_transform(df[['returns', 'msci']])

df['msci_lag1'] = df['msci'].shift(1)
df['sentiment_lag1'] = df['sentiment'].shift(1)
df['msci_lag2'] = df['msci'].shift(2)
df['sentiment_lag2'] = df['sentiment'].shift(2)
df['msci_lag3'] = df['msci'].shift(3)
df['sentiment_lag3'] = df['sentiment'].shift(3)

df.dropna(inplace=True)

train = df[:'2020-01-01']
test = df['2020-01-02':]
train

Unnamed: 0_level_0,returns,msci,sentiment,msci_lag1,sentiment_lag1,msci_lag2,sentiment_lag2,msci_lag3,sentiment_lag3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-04-01,0.409642,0.688976,-0.019119,0.716535,-0.140396,0.692913,0.148969,0.881890,-0.008786
2012-05-01,0.270214,0.795276,-0.070178,0.688976,-0.019119,0.716535,-0.140396,0.692913,0.148969
2012-06-01,0.390045,0.440945,0.241201,0.795276,-0.070178,0.688976,-0.019119,0.716535,-0.140396
2012-07-01,0.582161,0.645669,0.283145,0.440945,0.241201,0.795276,-0.070178,0.688976,-0.019119
2012-08-01,0.537967,0.759843,0.232915,0.645669,0.283145,0.440945,0.241201,0.795276,-0.070178
...,...,...,...,...,...,...,...,...,...
2019-09-01,0.575030,0.814961,0.350711,0.342520,0.632366,0.688976,0.227988,0.610236,-0.774562
2019-10-01,0.632607,0.771654,0.611849,0.814961,0.350711,0.342520,0.632366,0.688976,0.227988
2019-11-01,0.566221,0.732283,-0.324535,0.771654,0.611849,0.814961,0.350711,0.342520,0.632366
2019-12-01,0.640214,0.779528,-0.452753,0.732283,-0.324535,0.771654,0.611849,0.814961,0.350711


In [40]:
y = train['returns']
#X = train.drop("returns", axis=1)
#X = train["returns", "msci"]
X = train[["returns", "sentiment", "sentiment_lag1", "sentiment_lag2", "sentiment_lag3"]]


def evaluateModel(data, exog, order):
    model = SARIMAX(data, exog=exog, order=(order[0], 0, order[1]), enforce_stationarity=True, enforce_invertibility=True)
    model_fit = model.fit(disp=False, maxiter = 300)
    return order, model_fit.aic, model_fit.bic

orders = [(p, q) for p in range(5) for q in range(5) if (p, q) != (0, 0)]

results_base = []
results_exog = []

#Base model
for order in orders:
    try:
        order, aic, bic = evaluateModel(y, None, order)
        results_base.append([order, aic, bic])
    except Exception as e:
        print(f"Order {order} failed: {e}")


#Including sentiment scores
for order in orders:
    try:
        order, aic_exo, bic_exo = evaluateModel(y, X, order)
        results_exog.append([order, aic_exo, bic_exo])
    except Exception as e:
        print(f"Order {order} with exogenous variables failed: {e}")

df_base = pd.DataFrame(results_base, columns=['Order', 'AIC', 'BIC'])
df_exog = pd.DataFrame(results_exog, columns=['Order', 'AIC', 'BIC'])

In [41]:
min_aic_base_index = df_base['AIC'].idxmin()
min_bic_base_index = df_base['BIC'].idxmin()

min_aic_base_order = df_base.loc[min_aic_base_index]
min_bic_base_order = df_base.loc[min_bic_base_index]

min_aic_exog_index = df_exog['AIC'].idxmin()
min_bic_exog_index = df_exog['BIC'].idxmin()

min_aic_exog_order = df_exog.loc[min_aic_exog_index]
min_bic_exog_order = df_exog.loc[min_bic_exog_index]

# Printing the results
print("Base model with lowest AIC:", min_aic_base_order)
print("Base model with lowest BIC:", min_bic_base_order)
print("Exog model with lowest AIC:", min_aic_exog_order)
print("Exog model with lowest BIC:", min_bic_exog_order)

Base model with lowest AIC: Order       (1, 1)
AIC     -65.592461
BIC     -57.962576
Name: 5, dtype: object
Base model with lowest BIC: Order       (1, 1)
AIC     -65.592461
BIC     -57.962576
Name: 5, dtype: object
Exog model with lowest AIC: Order         (2, 3)
AIC     -2253.857351
BIC     -2225.881108
Name: 12, dtype: object
Exog model with lowest BIC: Order         (2, 3)
AIC     -2253.857351
BIC     -2225.881108
Name: 12, dtype: object


In [34]:
model = SARIMAX(y, exog=X, order=(3, 0, 3), enforce_stationarity=True, enforce_invertibility=True)
model_fit = model.fit(disp=False, maxiter = 300)
model_fit.summary()

0,1,2,3
Dep. Variable:,returns,No. Observations:,94.0
Model:,"SARIMAX(3, 0, 3)",Log Likelihood,45.015
Date:,"Fri, 19 Apr 2024",AIC,-60.031
Time:,16:59:22,BIC,-21.881
Sample:,04-01-2012,HQIC,-44.621
,- 01-01-2020,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
msci,0.1363,0.152,0.894,0.371,-0.163,0.435
sentiment,0.0074,0.039,0.191,0.849,-0.068,0.083
msci_lag1,0.1149,0.109,1.052,0.293,-0.099,0.329
sentiment_lag1,-0.0123,0.042,-0.292,0.770,-0.095,0.070
msci_lag2,0.1486,0.144,1.028,0.304,-0.135,0.432
sentiment_lag2,0.0101,0.044,0.228,0.820,-0.077,0.097
msci_lag3,0.2314,0.134,1.727,0.084,-0.031,0.494
sentiment_lag3,-0.0857,0.039,-2.219,0.026,-0.161,-0.010
ar.L1,0.4736,0.091,5.220,0.000,0.296,0.651

0,1,2,3
Ljung-Box (L1) (Q):,0.34,Jarque-Bera (JB):,3.86
Prob(Q):,0.56,Prob(JB):,0.15
Heteroskedasticity (H):,1.17,Skew:,-0.49
Prob(H) (two-sided):,0.66,Kurtosis:,3.18
