In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.graphics.tsaplots as stats_graph_tsa
import statsmodels.tsa.stattools as stat_tsa_stats
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from pandas_datareader import data as wb
from scipy.stats.distributions import chi2
from datetime import datetime

sns.set()

**Treating Data**

In [3]:
raw_csv_data = pd.read_csv('../01 - Intro/IndexE8.csv')
df_complete = raw_csv_data.copy()
df_complete['market_value'] = df_complete.ftse
df_complete.date = pd.to_datetime(df_complete.date, dayfirst = True)
df_complete.date = df_complete.set_index('date', inplace = True)
df_complete = df_complete.asfreq('b')
df_complete = df_complete.fillna(method = 'ffill')
del df_complete['dax']
del df_complete['ftse']
del df_complete['nikkei']
del df_complete['spx']
size = int(len(df_complete) * 0.8)
df, df_test = df_complete.iloc[:size], df_complete.iloc[size:]

''' Defining the return column '''
df['returns'] = df.market_value.pct_change(1).mul(100)
df = df.iloc[1:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['returns'] = df.market_value.pct_change(1).mul(100)


In [4]:
def LLR_Test (mod_1, mod_2, DF = 1):
    L1 = mod_1.fit().llf
    L2 = mod_2.fit().llf

    LR = (2 * (L2 - L1))
    pvalue = chi2.sf(LR, DF).round(4)

    return pvalue

**AR(1) Simple for Return**

In [5]:
'''
    The P values of both parameters are over .05, meaning that neither is significantly different from 0
    In conclusion, in this case the model holds no real predictive power.
    The more easily yesterday price is affected by higher lags, the more inaccurate its coefficient becomes.
'''
mod_ret_ar_1 = ARIMA(df.returns, order = (1, 0, 0))
res_ret_ar_1 = mod_ret_ar_1.fit()
res_ret_ar_1.summary()

0,1,2,3
Dep. Variable:,returns,No. Observations:,5020.0
Model:,"ARIMA(1, 0, 0)",Log Likelihood,-7929.749
Date:,"Thu, 08 Dec 2022",AIC,15865.497
Time:,11:13:05,BIC,15885.061
Sample:,01-10-1994,HQIC,15872.353
,- 04-05-2013,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0188,0.016,1.143,0.253,-0.013,0.051
ar.L1,-0.0230,0.009,-2.697,0.007,-0.040,-0.006
sigma2,1.3790,0.014,100.092,0.000,1.352,1.406

0,1,2,3
Ljung-Box (L1) (Q):,0.01,Jarque-Bera (JB):,7872.72
Prob(Q):,0.93,Prob(JB):,0.0
Heteroskedasticity (H):,2.04,Skew:,-0.03
Prob(H) (two-sided):,0.0,Kurtosis:,9.13


**Higher-Lag AR Models for Return**

In [6]:
'''
    The P Value from the second value is zero, meaning that Φ² is significantly different from , and should be
    included in the model. Also, the P Value from the first lag decreased.
'''
mod_ret_ar_2 = ARIMA(df.returns, order = (2, 0, 0))
res_ret_ar_2 = mod_ret_ar_2.fit()
res_ret_ar_2.summary()

0,1,2,3
Dep. Variable:,returns,No. Observations:,5020.0
Model:,"ARIMA(2, 0, 0)",Log Likelihood,-7923.456
Date:,"Thu, 08 Dec 2022",AIC,15854.911
Time:,11:13:06,BIC,15880.996
Sample:,01-10-1994,HQIC,15864.052
,- 04-05-2013,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0188,0.016,1.194,0.233,-0.012,0.050
ar.L1,-0.0242,0.009,-2.799,0.005,-0.041,-0.007
ar.L2,-0.0500,0.008,-6.371,0.000,-0.065,-0.035
sigma2,1.3756,0.014,99.592,0.000,1.348,1.403

0,1,2,3
Ljung-Box (L1) (Q):,0.08,Jarque-Bera (JB):,7730.29
Prob(Q):,0.77,Prob(JB):,0.0
Heteroskedasticity (H):,2.03,Skew:,-0.06
Prob(H) (two-sided):,0.0,Kurtosis:,9.08


In [7]:
'''
    There is a possibility that the change in the log likelihood is also significant.
    To be sure about that, we'll run the LLR test. OBS: Make sure to put the simple model first.
'''
LLR_Test(mod_ret_ar_1, mod_ret_ar_2)

0.0004

In [8]:
mod_ret_ar_3 = ARIMA(df.returns, order = (3, 0 ,0))
res_ret_ar_3 = mod_ret_ar_3.fit()
res_ret_ar_3.summary()

0,1,2,3
Dep. Variable:,returns,No. Observations:,5020.0
Model:,"ARIMA(3, 0, 0)",Log Likelihood,-7906.738
Date:,"Thu, 08 Dec 2022",AIC,15823.477
Time:,11:13:08,BIC,15856.083
Sample:,01-10-1994,HQIC,15834.902
,- 04-05-2013,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0189,0.015,1.272,0.203,-0.010,0.048
ar.L1,-0.0283,0.009,-3.199,0.001,-0.046,-0.011
ar.L2,-0.0521,0.008,-6.684,0.000,-0.067,-0.037
ar.L3,-0.0815,0.008,-10.192,0.000,-0.097,-0.066
sigma2,1.3664,0.014,94.518,0.000,1.338,1.395

0,1,2,3
Ljung-Box (L1) (Q):,0.07,Jarque-Bera (JB):,6859.16
Prob(Q):,0.79,Prob(JB):,0.0
Heteroskedasticity (H):,2.02,Skew:,-0.15
Prob(H) (two-sided):,0.0,Kurtosis:,8.72


In [9]:
LLR_Test(mod_ret_ar_2, mod_ret_ar_3)

0.0

**Normalizing Values**

In [10]:
'''
    In general, by normalizing 2 Time Series we can compare how well they perform relative to one another.

    1.) Set some BenchMark -> Used for the central piece we use to compare all subsequence values of the TimeSeries
'''
benchmark = df.market_value.iloc[0]

# Divide all the values of the set with this number and multiply by 100
df['norm'] = df.market_value.div(benchmark).mul(100)

# AR work best with Stationary Data, so we need to run Dick-Fuller Test, before we are certain
# that the values are useful.

## OBS: The P Value suggest that this that is Non-Stationarity, and Normalized price does not
##      result in stationary data, therefore we wont be able to use an AR Model with normalized
##      prices.
stat_tsa_stats.adfuller(df.norm)

(-1.8928710132672344,
 0.33542220743300755,
 6,
 5013,
 {'1%': -3.431655136974821,
  '5%': -2.8621167297421373,
  '10%': -2.567076993884522},
 19553.90073102193)

**Normalized Returns**

In [11]:
'''
    Normalized Returns -> They account for the absolute profitability of the investment in contrast
                          to prices, allowing to compare the relative profitability as opposed to
                          non-normalized returns.
'''
benchmark_ret = df.returns.iloc[0]
df['norm_ret'] = df.returns.div(benchmark_ret).mul(100)

# Significally less statistical value and more P Value in this case, meaning that this set
# is probably stationary
stat_tsa_stats.adfuller(df.norm_ret)

(-12.770265719497258,
 7.798058336037547e-24,
 32,
 4987,
 {'1%': -3.431661944885779,
  '5%': -2.8621197374408225,
  '10%': -2.5670785949998973},
 80114.49116124898)