In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

# Load the modelling data, using the parsed 'Date' column as the index.
df = pd.read_csv("./tsdata/modeldata.csv", index_col='Date', parse_dates=True)
# Declare the index frequency as month-start.
df.index.freq = "MS"

# Add one-period lags of both candidate regressors, then discard every row
# that contains a missing value (the first row has no lagged values).
df = df.assign(
    msci_lag1=df['msci'].shift(1),
    sentiment_lag1=df['sentiment'].shift(1),
).dropna()
print(df.head())

             returns      msci  sentiment  msci_lag1  sentiment_lag1
Date                                                                
2012-02-01  0.146806  0.005837   0.148969   0.099222       -0.008786
2012-03-01  0.083715  0.017510  -0.140396   0.005837        0.148969
2012-04-01  0.001341  0.003891  -0.019119   0.017510       -0.140396
2012-05-01 -0.047849  0.056420  -0.070178   0.003891       -0.019119
2012-06-01 -0.005572 -0.118677   0.241201   0.056420       -0.070178


In [15]:
def mape(actual, pred):
    """Return the mean absolute percentage error of ``pred`` against ``actual``, in percent.

    Observations whose actual value is exactly zero are left out of the
    average: the percentage error is undefined there, and dividing by zero
    would otherwise turn the whole score into ``inf``/``nan``.

    Parameters
    ----------
    actual : array-like
        Observed values.
    pred : array-like
        Predicted values; must be broadcastable against ``actual``.

    Returns
    -------
    float
        ``mean(|actual - pred| / |actual|) * 100`` over the observations with
        a non-zero actual value, or ``nan`` when there is no such observation
        (including empty input).
    """
    # Broadcast first so a scalar on either side still lines up with the mask.
    actual, pred = np.broadcast_arrays(
        np.asarray(actual, dtype=float), np.asarray(pred, dtype=float)
    )
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')
    return np.mean(np.abs((actual[nonzero] - pred[nonzero]) / actual[nonzero])) * 100

def rmse(actual, pred):
    """Return the root mean squared error between ``actual`` and ``pred``."""
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse)

# Rolling one-step-ahead evaluation: for every window of `window_size`
# consecutive rows, fit four SARIMAX(1, 0, 1) specifications that differ only
# in their exogenous regressors, forecast the next row, and score the forecast.
window_size = 36

# Exogenous columns used by each specification (None = no regressors).
exog_columns = {
    'Base': None,
    'MSCI': ["msci_lag1"],
    'Sentiment': ["sentiment_lag1"],
    'Both': ["msci_lag1", "sentiment_lag1"],
}

# One record per (window, model). The frame is built once after the loop:
# growing it with pd.concat on every iteration re-copies all earlier rows and
# starts from an empty all-object frame, which distorts the column dtypes.
records = []

# NOTE(review): the upper bound `len(df) - 1` means the final row of `df` is
# never used as a test observation -- confirm this is intended.
for i in range(window_size, len(df) - 1):
    train = df.iloc[i - window_size:i]
    test = df.iloc[i:i+1]

    y_train = train['returns']
    y_test = test['returns']

    for model_name, columns in exog_columns.items():
        X_train = None if columns is None else train[columns]
        X_test = None if columns is None else test[columns]

        model = SARIMAX(y_train, X_train, order=(1, 0, 1),
                        enforce_stationarity=True, enforce_invertibility=True)
        fitted_model = model.fit(disp=False, maxiter=300)
        prediction = fitted_model.get_forecast(steps=1, exog=X_test).predicted_mean

        # 1 when the forecast has the same sign as the realised value, else 0.
        correct_sign = (np.sign(y_test.values) == np.sign(prediction.values)).astype(int)

        records.append({
            'Date': test.index[0],
            'Model': model_name,
            'MAPE': mape(y_test.values, prediction.values),
            'RMSE': rmse(y_test.values, prediction.values),
            'Correct Sign': correct_sign[0],
            'AIC': fitted_model.aic,
            'BIC': fitted_model.bic,
        })

# Passing `columns` keeps the column order (and the empty-frame schema when
# the loop does not run at all).
results = pd.DataFrame(
    records,
    columns=['Date', 'Model', 'MAPE', 'RMSE', 'Correct Sign', 'AIC', 'BIC'],
)

# Per-model averages over all windows; the mean of the 0/1 'Correct Sign'
# flag is the share of forecasts with the right sign.
average_metrics = (
    results.groupby('Model')
    .agg({
        'MAPE': 'mean',
        'RMSE': 'mean',
        'AIC': 'mean',
        'BIC': 'mean',
        'Correct Sign': 'mean',
    })
    .reset_index()
    .rename(columns={
        'MAPE': 'Average MAPE',
        'RMSE': 'Average RMSE',
        'Correct Sign': 'Correct Sign Percentage',
    })
)


average_metrics

Unnamed: 0,Model,Average MAPE,Average RMSE,AIC,BIC,Correct Sign Percentage
0,Base,138.083023,0.059705,-92.833047,-88.08249,0.56383
1,Both,139.991878,0.059828,-90.446027,-82.528432,0.595745
2,MSCI,132.343733,0.059784,-91.494353,-85.160277,0.553191
3,Sentiment,142.347203,0.059677,-91.66231,-85.328234,0.62766
