## Setup

In [3]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import plotly.express as px

## Workflow introduction

**1.Visualize the Data**  

Plot the time series to visually inspect the trend and seasonality.
Use tools like the autocorrelation function (ACF) and partial autocorrelation function (PACF) plots to identify patterns.


**2.Assess Stationarity**

Perform statistical tests like the Augmented Dickey-Fuller (ADF) test to check for stationarity.


**3.First Differencing (Remove Trend)**

If a strong trend is present, apply first differencing to remove the trend component.
Inspect the differenced series to see if the trend has been sufficiently removed.

**4.Seasonal Differencing (Remove Seasonality)**

If minor seasonality is present, apply seasonal differencing after first differencing to remove seasonal effects.
The seasonal period should be chosen based on the frequency of the data (e.g., 365 for monthly data with yearly seasonality). If there's visible weekly seasonality, apply seasonal differencing with a lag of 5 (assuming 5 trading days in a week).

**5.Evaluate Residuals**

After differencing, evaluate the residuals to ensure they are stationary.
Use ACF and PACF plots to examine residuals for any remaining patterns.

## Workflow introduction

In [2]:
apple = pd.read_csv(r'C:\Users\12436\Desktop\BrainStation\Capstone project\capstone-Leoyuyuyu\data\OLS_df.csv', index_col=0)
apple

Unnamed: 0,Adj Close,Volume,Return,Return_tomo
1980-12-13,0.099058,469033600.0,0.000000,0.000000
1980-12-14,0.099058,469033600.0,0.000000,-0.052171
1980-12-15,0.093890,175884800.0,-0.052171,-0.073398
1980-12-16,0.086998,105728000.0,-0.073398,0.024751
1980-12-17,0.089152,86441600.0,0.024751,0.028992
...,...,...,...,...
2024-05-18,189.869995,41282900.0,0.000000,0.000000
2024-05-19,189.869995,41282900.0,0.000000,0.006162
2024-05-20,191.039993,44361300.0,0.006162,0.006857
2024-05-21,192.350006,42309400.0,0.006857,-0.007538


In [13]:
# plot the interactive plot 

figure = px.line(apple, x = apple.index, y= "Return")

figure.update_layout(yaxis_title = "Return",  title = "Daily return of Apple stock from 1980 to 2024")

figure.update_xaxes(rangeslider_visible = True)

figure.show()

# fig = px.line(apple, x=apple.index, y='Return')

# # axis labels and title
# fig.update_layout(
#     yaxis_title="Return", 
#     legend_title="", 
#     title="Daily return of Apple stock from 1980 to 2024"
# )

# # activate slider
# fig.update_xaxes(rangeslider_visible=True)

# fig.show()

In [10]:
# Use 'return today' as the time series data
return_df = apple['Return']

# Split the data into training and testing sets, cannot use the cross_validation here as we cannot use the future data to train the past model
train_size = int(len(return_df) * 0.8)      # 80% of the data will be used as the trainning data 
train, test = return_df[:train_size], return_df[train_size:]

In [None]:
# Decide the Arima model parameters:

# p auto regressive  (looking back at 5 days)


# d differencing set at 1 here 


# q moving average (0)

In [11]:
# Fit the ARIMA model
ARIMA_model = ARIMA(train, order=(5, 1, 0))  # You may need to adjust the order (p,d,q); p is the order of auto regression, 1 is the difference order to make the data stationary and q is the moving average order. 
Arima_fm1 = ARIMA_model.fit()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [5]:
# Make predictions
predictions = Arima_fm1.forecast(steps=len(test))

# Evaluate the model
mse = mean_squared_error(test, predictions)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 0.00025208083587092555


In [6]:
# Combine the true and predicted values into a dataframe for comparison
results = pd.DataFrame({'True Values': test, 'Predicted Values': predictions})
print(results)

                     True Values  Predicted Values
2015-09-14              0.009631               NaN
2015-09-15              0.008412               NaN
2015-09-16              0.001118               NaN
2015-09-17             -0.021390               NaN
2015-09-18             -0.004126               NaN
...                          ...               ...
2024-05-18 00:00:00          NaN          0.005602
2024-05-19 00:00:00          NaN          0.005602
2024-05-20 00:00:00          NaN          0.005602
2024-05-21 00:00:00          NaN          0.005602
2024-05-22 00:00:00          NaN          0.005602

[6348 rows x 2 columns]


In [7]:
# Predict the return for tomorrow (the next time step)
next_prediction = Arima_fm1.forecast(steps=100)
print(f'Return prediction for tomorrow: {next_prediction[4]}')

Return prediction for tomorrow: 0.0043222010205069186
