In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.graphics.tsaplots as stats_graph_tsa
import statsmodels.tsa.stattools as stat_tsa_stats
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from pandas_datareader import data as wb
from scipy.stats.distributions import chi2
from datetime import datetime

# Apply seaborn's default theme so every matplotlib figure below shares one style.
sns.set()

**Treating Data**

In [2]:
# Load the raw index data and work on a copy so the original frame stays untouched.
raw_csv_data = pd.read_csv('../01 - Intro/IndexE8.csv')
df_complete = raw_csv_data.copy()

# The `spx` column is the series analysed in this notebook.
df_complete['market_value'] = df_complete.spx

# Parse the date strings (day-first format) into proper datetimes.
df_complete.date = pd.to_datetime(df_complete.date, dayfirst = True)

# Use the dates as the index. `set_index` returns the re-indexed frame; the
# in-place variant returns None, so its result must never be assigned back.
df_complete = df_complete.set_index('date')

# Conform the index to business-day frequency (missing days become NaN rows).
df_complete = df_complete.asfreq('b')

# Fill the gaps by carrying the last known value forward.
# `.ffill()` replaces the deprecated `fillna(method = 'ffill')`.
df_complete = df_complete.ffill()

# Keep only `market_value`: drop the raw index columns.
df_complete = df_complete.drop(columns = ['dax', 'ftse', 'nikkei', 'spx'])

# The first 80% of the rows (chronological order) form the training set.
size = int(len(df_complete) * 0.8)

# Explicit copies, so later column assignments on `df` / `df_test` neither
# raise SettingWithCopyWarning nor depend on view-vs-copy semantics.
df = df_complete.iloc[:size].copy()
df_test = df_complete.iloc[size:].copy()

In [3]:
def LLR_Test (mod_1, mod_2, DF = 1):
    """Return the p-value of a log-likelihood ratio test between two models.

    Both models are fitted here (``mod_1`` first, then ``mod_2``) and the
    statistic ``2 * (llf_2 - llf_1)`` is evaluated against a chi-squared
    distribution with ``DF`` degrees of freedom.

    Parameters
    ----------
    mod_1, mod_2 : unfitted models whose ``fit()`` result exposes ``llf``.
        Presumably ``mod_1`` is the simpler, nested model (confirm at call site).
    DF : degrees of freedom of the chi-squared distribution (default 1).

    Returns
    -------
    The chi-squared survival function at the statistic, rounded to 4 decimals.
    """
    llf_first = mod_1.fit().llf
    llf_second = mod_2.fit().llf

    statistic = 2 * (llf_second - llf_first)

    return chi2.sf(statistic, DF).round(4)

**Normalized Returns**

In [19]:
# Normalized returns express every return as a percentage of a benchmark
# return (here: the first return of the training period). This makes the
# relative profitability comparable across series, which raw returns do not.
#
# NOTE: this cell drops the first row of `df` each time it runs; restart the
# kernel and run all cells instead of re-running this cell on its own.

# Percentage returns. `assign` builds a new frame instead of writing into a
# possible slice of `df_complete`. The first row has no previous price, so its
# return is NaN and the row is dropped.
df = df.assign(returns = df.market_value.pct_change(1).mul(100)).iloc[1:]

# Benchmark: the first available return.
benchmark_ret = df.returns.iloc[0]
df = df.assign(norm_ret = df.returns.div(benchmark_ret).mul(100))

# Augmented Dickey-Fuller test. A test statistic well below the critical
# values, together with a p-value close to zero, rejects the unit-root null
# hypothesis, i.e. the normalized returns are stationary.
stat_tsa_stats.adfuller(df.norm_ret)

(-17.031714557655963,
 8.316113404053451e-30,
 17,
 5001,
 {'1%': -3.431658270286891,
  '5%': -2.8621181140235223,
  '10%': -2.5670777307913744},
 76192.03270274171)

**Fitting Normalized Return**

In [20]:
# AR(1) on the normalized returns: one autoregressive lag, no differencing, no MA terms.
mod_norm_ret_ar_1 = ARIMA(df['norm_ret'], order=(1, 0, 0))
res_norm_ret_ar_1 = mod_norm_ret_ar_1.fit()

# Display the fitted model's summary table.
res_norm_ret_ar_1.summary()

0,1,2,3
Dep. Variable:,norm_ret,No. Observations:,5019.0
Model:,"ARIMA(1, 0, 0)",Log Likelihood,-38344.922
Date:,"Thu, 08 Dec 2022",AIC,76695.845
Time:,11:18:04,BIC,76715.408
Sample:,01-11-1994,HQIC,76702.7
,- 04-05-2013,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-12.8934,6.781,-1.901,0.057,-26.185,0.398
ar.L1,-0.0719,0.008,-8.507,0.000,-0.088,-0.055
sigma2,2.533e+05,2222.112,113.978,0.000,2.49e+05,2.58e+05

0,1,2,3
Ljung-Box (L1) (Q):,0.05,Jarque-Bera (JB):,14953.29
Prob(Q):,0.83,Prob(JB):,0.0
Heteroskedasticity (H):,2.22,Skew:,0.12
Prob(H) (two-sided):,0.0,Kurtosis:,11.45


In [21]:
# AR(2) on the normalized returns: two autoregressive lags, no differencing, no MA terms.
mod_norm_ret_ar_2 = ARIMA(df['norm_ret'], order=(2, 0, 0))
res_norm_ret_ar_2 = mod_norm_ret_ar_2.fit()

# Display the fitted model's summary table.
res_norm_ret_ar_2.summary()

0,1,2,3
Dep. Variable:,norm_ret,No. Observations:,5019.0
Model:,"ARIMA(2, 0, 0)",Log Likelihood,-38340.56
Date:,"Thu, 08 Dec 2022",AIC,76689.121
Time:,11:18:37,BIC,76715.205
Sample:,01-11-1994,HQIC,76698.261
,- 04-05-2013,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-12.8934,6.615,-1.949,0.051,-25.858,0.071
ar.L1,-0.0748,0.009,-8.779,0.000,-0.092,-0.058
ar.L2,-0.0417,0.007,-6.031,0.000,-0.055,-0.028
sigma2,2.529e+05,2340.609,108.041,0.000,2.48e+05,2.57e+05

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,13916.02
Prob(Q):,0.97,Prob(JB):,0.0
Heteroskedasticity (H):,2.21,Skew:,0.17
Prob(H) (two-sided):,0.0,Kurtosis:,11.15


In [22]:
# AR(7) on the normalized returns: seven autoregressive lags, no differencing, no MA terms.
mod_norm_ret_ar_7 = ARIMA(df['norm_ret'], order=(7, 0, 0))
res_norm_ret_ar_7 = mod_norm_ret_ar_7.fit()

# Display the fitted model's summary table.
res_norm_ret_ar_7.summary()

0,1,2,3
Dep. Variable:,norm_ret,No. Observations:,5019.0
Model:,"ARIMA(7, 0, 0)",Log Likelihood,-38332.011
Date:,"Thu, 08 Dec 2022",AIC,76682.022
Time:,11:19:43,BIC,76740.711
Sample:,01-11-1994,HQIC,76702.588
,- 04-05-2013,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-12.8934,6.087,-2.118,0.034,-24.823,-0.964
ar.L1,-0.0773,0.009,-8.999,0.000,-0.094,-0.060
ar.L2,-0.0450,0.007,-6.506,0.000,-0.059,-0.031
ar.L3,-0.0140,0.009,-1.619,0.105,-0.031,0.003
ar.L4,-0.0147,0.008,-1.861,0.063,-0.030,0.001
ar.L5,-0.0385,0.007,-5.246,0.000,-0.053,-0.024
ar.L6,-0.0232,0.008,-3.048,0.002,-0.038,-0.008
ar.L7,-0.0389,0.007,-5.208,0.000,-0.054,-0.024
sigma2,2.523e+05,2385.180,105.759,0.000,2.48e+05,2.57e+05

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,13335.29
Prob(Q):,0.95,Prob(JB):,0.0
Heteroskedasticity (H):,2.21,Skew:,0.24
Prob(H) (two-sided):,0.0,Kurtosis:,10.97


**OBS:**
    Notice that the non-normalized returns model has the same p-values as the normalized returns model.
    Normalizing only rescales the series by a constant, so the differences show up in the log-likelihood
    and the const value. In general, if the p-values don't change, there isn't much difference between
    the two types of returns.