In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Load the S&P 500 history, keep only the closing price, and index it by date.
# Dates in the file are day-first (dd/mm/YYYY), so the format is given explicitly.
df = (
    pd.read_csv('SPX.csv')
    .drop(columns=["Open", "High", "Low", "Volume"])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%d/%m/%Y'))
    .set_index('Date')
)

# Chronological split: the first 90% of rows are used for fitting and the
# remaining 10% are held out for evaluation (no shuffling).
train_size = int(len(df) * 0.9)
close = df['Close']
train = close.iloc[:train_size]
test = close.iloc[train_size:]

# The Auto-Regressive Model

In [35]:
from statsmodels.tsa.ar_model import AutoReg  # the author's note lists `AR` as the alternative class name

# Autoregression fitted on the training closes only: each value is regressed
# on its previous 1500 observations. The lag order is the knob to experiment with.
model = AutoReg(endog=train, lags=1500)
model_fit = model.fit()


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


The parameter names will change after 0.12 is released. Set old_names to False to use the new names now. Set old_names to True to use the old names. 



In [36]:
# First line of output: the lag orders included in the fitted model
# (exposed as .ar_lags here; .k_ar is the alternative noted for other versions).
# Second part: the fitted parameters (intercept followed by one coefficient per lag).
print(model_fit.ar_lags, model_fit.params, sep="\n")

# Uncomment for the full regression report:
#model_fit.summary()

[   1    2    3 ... 1498 1499 1500]
intercept      0.163366
Close.L1       0.950204
Close.L2       0.003312
Close.L3       0.047894
Close.L4      -0.012698
                 ...   
Close.L1496   -0.006847
Close.L1497    0.020957
Close.L1498   -0.005271
Close.L1499    0.005677
Close.L1500   -0.004370
Length: 1501, dtype: float64


In [37]:
# Forecast every hold-out row with the model fitted on the training split.
# start/end are row positions, so this requests one value per test row.
forecast = model_fit.predict(start=train_size, end=len(df)-1, dynamic=False)

# Write the forecast with a single positional .iloc assignment.
# The chained form `df['predictions'][train_size:] = ...` assigns into a
# temporary Series: it triggers SettingWithCopyWarning and, once pandas
# copy-on-write is active, leaves `df` unchanged. np.asarray strips the
# forecast's integer index so values are placed by position, not aligned by label.
df['predictions'] = np.nan
df.iloc[train_size:, df.columns.get_loc('predictions')] = np.asarray(forecast)
display(df)


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


Only PeriodIndexes, DatetimeIndexes with a frequency set, RangesIndexes, and Int64Indexes with a unit increment support extending. The index is set will contain the position relative to the data length.



Unnamed: 0_level_0,Close,predictions,walkForward_predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980-05-01,105.46,,
1980-05-02,105.58,,
1980-05-05,106.38,,
1980-05-06,106.25,,
1980-05-07,107.18,,
...,...,...,...
2022-07-13,3801.78,2993.676454,3824.670937
2022-07-14,3790.38,2980.880708,3796.666589
2022-07-15,3863.16,2981.450109,3792.989210
2022-07-18,3830.85,2984.828062,3863.298567


In [38]:
from sklearn.metrics import mean_squared_error

# Score the forecast on the hold-out rows only (positions train_size onward).
holdout = df.iloc[train_size:]
mse = mean_squared_error(holdout['Close'], holdout['predictions'])

# The MSE of a naive forecast would be a useful baseline to compare against.
fig = px.line(
    df,
    y=["Close", "predictions"],
    title=f"Auto-Regressive Forecast with MSE = {mse}",
)
fig.show()

In [43]:
# NOTE: every statement in this cell is commented out, so running it does nothing.
# When enabled, the code below measures directional accuracy of `predictions`:
#   hit1 - fraction of rows where the row-to-row change in Close and the
#          row-to-row change in the prediction have the same sign;
#   hit2 - fraction of rows where the change in Close has the same sign as
#          (prediction - previous row's Close).
# Comparisons are strict, so a row with a zero change never counts as a hit.
# Rows containing NaN (e.g. where no prediction exists) are dropped first.
# diff = pd.DataFrame()
# diff['Close'] = df['Close'].diff()
# diff['Pred'] = df['predictions'].diff()
# diff['Pred-Close'] = df['predictions'] - df['Close'].shift(1)
# diff = diff.dropna()

# hit1 = 0
# hit2 = 0
# for i in range(len(diff)):
#     d = diff.iloc[i]
#     if (d['Close'] > 0 and d['Pred'] > 0) or (d['Close'] < 0 and d['Pred'] < 0):
#         hit1 += 1
#     if (d['Close'] > 0 and d['Pred-Close'] > 0) or (d['Close'] < 0 and d['Pred-Close'] < 0):
#         hit2 += 1
# hit1 /= len(diff)
# hit2 /= len(diff)
# print(hit1, hit2)

0.5028195488721805 0.45770676691729323


# Walk Forward Validation

In [10]:
# Walk-forward validation: for every hold-out row, refit an AR(10) model on all
# closes observed before that row and keep only its one-step-ahead forecast.
# The true value is then revealed to the next iteration's training window.
close_values = np.concatenate([np.asarray(train), np.asarray(test)])

pred = []
for n_obs in range(len(train), len(close_values)):
    # Dedicated names so the `model` / `model_fit` fitted in the earlier
    # auto-regressive cell are not silently overwritten by this loop.
    wf_model = AutoReg(close_values[:n_obs], lags=10)
    wf_fit = wf_model.fit()

    # Only the next step is used, so request exactly one forecast instead of
    # forecasting to the end of the data on every iteration.
    one_step = wf_fit.predict(start=n_obs, end=n_obs)
    pred.append(np.asarray(one_step)[0])

# Single positional .iloc write: the chained form
# `df['walkForward_predictions'][train_size:] = pred` assigns into a temporary
# Series (SettingWithCopyWarning; no effect under pandas copy-on-write).
df['walkForward_predictions'] = np.nan
df.iloc[train_size:, df.columns.get_loc('walkForward_predictions')] = pred


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


The parameter names will change after 0.12 is released. Set old_names to False to use the new names now. Set old_names to True to use the old names. 


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


Only PeriodIndexes, DatetimeIndexes with a frequency set, RangesIndexes, and Int64Indexes with a unit increment support extending. The index is set will contain the position relative to the data length.



In [11]:
# Error of the walk-forward forecasts, measured on the hold-out rows only.
actual_close = df['Close'].iloc[train_size:]
walk_forward_forecast = df['walkForward_predictions'].iloc[train_size:]
mse = mean_squared_error(actual_close, walk_forward_forecast)

# Overlay the actual closes and the walk-forward forecasts on one chart.
plot_title = f"Auto-Regressive Forecast + Walk Forward Validation with MSE = {mse}"
fig = px.line(df, y=["Close", "walkForward_predictions"], title=plot_title)
fig.show()

In [42]:
# NOTE: every statement in this cell is commented out, so running it does nothing.
# When enabled, the code below measures directional accuracy of
# `walkForward_predictions`:
#   hit1 - fraction of rows where the row-to-row change in Close and the
#          row-to-row change in the prediction have the same sign;
#   hit2 - fraction of rows where the change in Close has the same sign as
#          (prediction - previous row's Close).
# Comparisons are strict, so a row with a zero change never counts as a hit.
# Rows containing NaN (e.g. where no prediction exists) are dropped first.
# diff = pd.DataFrame()
# diff['Close'] = df['Close'].diff()
# diff['Pred'] = df['walkForward_predictions'].diff()
# diff['Pred-Close'] = df['walkForward_predictions'] - df['Close'].shift(1)
# diff = diff.dropna()

# hit1 = 0
# hit2 = 0
# for i in range(len(diff)):
#     d = diff.iloc[i]
#     if (d['Close'] > 0 and d['Pred'] > 0) or (d['Close'] < 0 and d['Pred'] < 0):
#         hit1 += 1
#     if (d['Close'] > 0 and d['Pred-Close'] > 0) or (d['Close'] < 0 and d['Pred-Close'] < 0):
#         hit2 += 1
# hit1 /= len(diff)
# hit2 /= len(diff)
# print(hit1, hit2)

0.4934210526315789 0.5178571428571429
