# Abstract

There has been a lot of research on stock prediction using statistical methods. More recently, machine learning and deep learning techniques have also been implemented and tested. Here I am going to test all three approaches and talk about their pros and cons.


<br><br>
First, forecast using only the stock closing price --> show that it does well before COVID and badly once external factors such as COVID arise. Then collect data (news, etc.) that can mitigate those external factors, to show the importance of data collection.

# Data Extraction

## NYSE Stock data

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import yfinance as yf

In [None]:
def GetStockData(ticker_name, period, start_date, end_date):
    """Download price history for one ticker through yfinance.

    Args:
        ticker_name: Symbol handed to ``yf.Ticker`` (e.g. ``"NVDA"``).
        period: Forwarded unchanged as ``history(period=...)``.
        start_date: Forwarded unchanged as ``history(start=...)``.
        end_date: Forwarded unchanged as ``history(end=...)``.

    Returns:
        Whatever ``Ticker.history`` returns for those arguments.
    """
    # NOTE(review): period is passed together with explicit start/end dates;
    # confirm how yfinance resolves that combination.
    history_kwargs = {"period": period, "start": start_date, "end": end_date}
    return yf.Ticker(ticker_name).history(**history_kwargs)

# NVDA price history requested for 2016-01-01 through 2020-10-10; the frames
# used in the analysis below are derived from it.
full_nvda_df = GetStockData(
    ticker_name="NVDA",
    period="1d",
    start_date="2016-01-01",
    end_date="2020-10-10",
)

In [None]:
# Work on an independent single-column frame so the downloaded data stays untouched.
nvda_df = full_nvda_df.loc[:, ["Close"]].copy()

## Related news

# Data Cleansing

# EDA

In [None]:
# Line chart of the closing price plotted against the frame's index.
closing_trace = go.Scatter(x=nvda_df.index, y=nvda_df["Close"])

fig = go.Figure()
fig.add_trace(closing_trace)
fig.update_layout(
    title="<b>Nvidia closing prices</b>",
    xaxis_title="date",
    yaxis_title="Dollars",
)
fig.show()

# Predictive models

## ARIMA

When you have time series data, you must check whether the series is stationary before using an ARIMA model to forecast.

In [None]:
# Start the ARIMA section from a fresh copy that holds only the closing prices.
nvda_df = full_nvda_df.loc[:, ["Close"]].copy()

### ADF test

In [None]:
from statsmodels.tsa.stattools import adfuller, acf, pacf

# Augmented Dickey-Fuller test on the raw closing prices (autolag="AIC").
dftest = adfuller(nvda_df["Close"], autolag="AIC")

# The first four entries of the result get the labels below; entry 4 is a
# mapping whose items are reported as "Critical Value (<key>)".
adf_report = dict(zip(["Test Stats", "p-value", "# Lags", "# of obs"], dftest[:4]))
adf_report.update(
    (f"Critical Value ({level})", threshold)
    for level, threshold in dftest[4].items()
)
dfoutput = pd.Series(adf_report)

print(dfoutput)

In [None]:
# First and second differences of the closing price, stored as new columns.
first_difference = nvda_df["Close"].diff()
nvda_df["1st_diff"] = first_difference
nvda_df["2nd_diff"] = first_difference.diff()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

In [None]:
from plotly.subplots import make_subplots

In [None]:
# One stacked panel per differenced column.
diff_columns = ["1st_diff", "2nd_diff"]
fig = make_subplots(rows=len(diff_columns), cols=1)

for row_number, column in enumerate(diff_columns, start=1):
    series_trace = go.Scatter(name=column, x=nvda_df.index, y=nvda_df[column])
    fig.add_trace(series_trace, row=row_number, col=1)

fig.update_layout(title="Differnced plots")
fig.show()

In [None]:
# Autocorrelation plot of the first difference, with missing values removed first.
# The trailing semicolon keeps the notebook from echoing the call's return value.
first_diff_values = nvda_df["1st_diff"].dropna()
plot_acf(first_diff_values);

Even with first-order differencing, the autocorrelation turns negative at the very first lag, which suggests that the series may be over-differenced.

What can be done when the series is over-differenced even with first-order differencing?
- Add MA terms

In [None]:
# Open question from the author: should the test run on the 1st_diff values?

# Augmented Dickey-Fuller test on the first difference, with missing values removed.
dftest = adfuller(nvda_df["1st_diff"].dropna(), autolag="AIC")

# Same report layout as for the raw series: four labelled statistics followed
# by one "Critical Value (<key>)" row per item of entry 4.
adf_report = dict(zip(["Test Stats", "p-value", "# Lags", "# of obs"], dftest[:4]))
adf_report.update(
    (f"Critical Value ({level})", threshold)
    for level, threshold in dftest[4].items()
)
dfoutput = pd.Series(adf_report)

print(dfoutput)

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# NOTE(review): ARIMA is imported from the legacy statsmodels.tsa.arima_model
# module; confirm the installed statsmodels version still ships it.
arima_order = (1, 1, 0)  # (p, d, q)
model = ARIMA(nvda_df["Close"], order=arima_order)
model_fit = model.fit(disp=0)
summary_table = model_fit.summary()
print(summary_table)

Auto ARIMA forecast

In [None]:
import pmdarima as pm

In [None]:
# Stepwise auto-ARIMA search on the closing prices with seasonality disabled.
model = pm.auto_arima(
    nvda_df["Close"],
    # starting values and upper bounds for p and q
    start_p=1,
    max_p=3,
    start_q=1,
    max_q=3,
    # d is not fixed; the ADF test is used to pick it
    d=None,
    test="adf",
    # no seasonal component
    seasonal=False,
    m=1,
    start_P=0,
    D=0,
    # search behaviour and logging
    stepwise=True,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
)

print(model.summary())

-----

In [None]:
# Out-of-sample check on the full series: fit on the first 1000 observations
# and forecast the remainder.
split_at = 1000
train = nvda_df.Close[:split_at]
test = nvda_df.Close[split_at:]

model = ARIMA(train, order=(1, 1, 0))
fit_model = model.fit(disp=-1)

# Forecast exactly as many steps as the test window holds. A hard-coded
# horizon breaks the pd.Series(..., index=test.index) calls below as soon as
# the download returns a different number of rows.
horizon = len(test)
fc, se, conf = fit_model.forecast(horizon, alpha=0.05)  # 95% conf

# Re-index the forecast and both columns of the confidence band onto the
# test dates so they can be plotted against the actual prices.
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)

plt.figure(figsize=(12, 5), dpi=100)
plt.plot(train, label='training')
plt.plot(test, label='actual')
plt.plot(fc_series, label='forecast')
# Shaded area between the two confidence-band series.
plt.fill_between(lower_series.index, lower_series, upper_series,
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()

In [None]:
# Same experiment on the data up to 2018-10-01: fit on the first 600
# observations and forecast the remainder of the truncated frame.
trunc_nvda_df = nvda_df[:"2018-10-01"].copy()
split_at = 600
train = trunc_nvda_df.Close[:split_at]
test = trunc_nvda_df.Close[split_at:]

model = ARIMA(train, order=(1, 1, 0))
fit_model = model.fit(disp=-1)

# Forecast exactly as many steps as the test window holds. A hard-coded
# horizon breaks the pd.Series(..., index=test.index) calls below as soon as
# the truncated frame has a different number of rows.
horizon = len(test)
fc, se, conf = fit_model.forecast(horizon, alpha=0.05)  # 95% conf

# Re-index the forecast and both columns of the confidence band onto the
# test dates so they can be plotted against the actual prices.
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)

plt.figure(figsize=(12, 5), dpi=100)
plt.plot(train, label='training')
plt.plot(test, label='actual')
plt.plot(fc_series, label='forecast')
# Shaded area between the two confidence-band series.
plt.fill_between(lower_series.index, lower_series, upper_series,
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()

## RNN

In [8]:
import tensorflow as tf
import numpy as np

In [2]:
n_inputs = 3
n_neurons = 5

# NOTE(review): tf.placeholder / tf.random_normal look like TensorFlow 1.x
# graph-mode APIs — confirm the TensorFlow version this notebook targets.

# One placeholder per time step, each of shape [None, n_inputs].
X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

# Wx multiplies the inputs at both steps; Wy multiplies the step-0 output.
Wx = tf.Variable(tf.random_normal(shape=[n_inputs, n_neurons], dtype=tf.float32))
Wy = tf.Variable(tf.random_normal(shape=[n_neurons, n_neurons], dtype=tf.float32))

# Bias of shape [1, n_neurons], added at both steps.
b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32))

# Step 0: tanh (hyperbolic tangent) of the projected input plus bias.
input_term_0 = tf.matmul(X0, Wx)
Y0 = tf.tanh(input_term_0 + b)

# Step 1: the step-0 output passed through Wy is added to the projected
# step-1 input and the bias before the same activation.
recurrent_term = tf.matmul(Y0, Wy)
input_term_1 = tf.matmul(X1, Wx)
Y1 = tf.tanh(recurrent_term + input_term_1 + b)

In [3]:
# Initializer op for the graph's variables; it is executed via init.run()
# once a session is open.
init = tf.global_variables_initializer()

In [4]:
# Inputs fed to the X0 placeholder (first time step): four rows of three values.
X0_batch = np.array([
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8],
    [9, 0, 1],
])

# Inputs fed to the X1 placeholder (second time step), same layout.
X1_batch = np.array([
    [9, 8, 7],
    [0, 0, 0],
    [6, 5, 4],
    [3, 2, 1],
])

In [6]:
# Run the graph once: initialize the variables, then evaluate the outputs of
# both time steps for the two input batches.
with tf.Session() as sess:
    sess.run(init)
    step_inputs = {X0: X0_batch, X1: X1_batch}
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict=step_inputs)

In [7]:
# Display the evaluated Y0 output returned by sess.run.
Y0_val

array([[-0.93618363,  0.99394095, -0.99989265,  0.9582004 ,  0.6393137 ],
       [-0.9999979 ,  0.99922585, -1.        ,  0.9981809 , -0.99708176],
       [-1.        ,  0.99990124, -1.        ,  0.9999223 , -0.99999917],
       [-0.9884732 , -0.9999985 , -0.9999978 , -0.99989814, -1.        ]],
      dtype=float32)

## LSTM

# Sources

- https://www.youtube.com/watch?v=hOLSGMEEwlI&t=7s&ab_channel=ComputerScience
- https://www.kaggle.com/dgawlik/nyse/notebooks
- https://www.kaggle.com/c/two-sigma-financial-news
- https://towardsdatascience.com/time-series-forecasting-predicting-stock-prices-using-an-arima-model-2e3b3080bd70


<br>

ARIMA
- https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/#:~:text=ARIMA%2C%20short%20for%20'Auto%20Regressive,used%20to%20forecast%20future%20values.