# Amazon Stock Prediction Using Yahoo Finance + LSTM

## Dependencies

In [1]:
#Imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.metrics import RootMeanSquaredError
from keras.layers import Dropout
from keras.callbacks import LambdaCallback
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from keras.optimizers import Adam
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler

g:\programdata\anaconda3\envs\cs-349-final-project\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
g:\programdata\anaconda3\envs\cs-349-final-project\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


## Setup

Download data using the Yahoo Finance API

In [2]:
# Read Data using yahoo finance
data = yf.download(
    tickers=['AMZN'],
    # use "period" instead of start/end
    # valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
    # (optional, default is '1mo')
    period="max",
    # fetch data by interval (including intraday if period < 60 days)
    # valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    # (optional, default is '1d')
    interval="1d")

[*********************100%***********************]  1 of 1 completed


In [4]:
print(data.head)

<bound method NDFrame.head of                    Open         High          Low        Close    Adj Close  \
Date                                                                          
1997-05-15     2.437500     2.500000     1.927083     1.958333     1.958333   
1997-05-16     1.968750     1.979167     1.708333     1.729167     1.729167   
1997-05-19     1.760417     1.770833     1.625000     1.708333     1.708333   
1997-05-20     1.729167     1.750000     1.635417     1.635417     1.635417   
1997-05-21     1.635417     1.645833     1.375000     1.427083     1.427083   
...                 ...          ...          ...          ...          ...   
2021-05-28  3242.000000  3247.989990  3219.699951  3223.070068  3223.070068   
2021-06-01  3243.500000  3250.979980  3209.060059  3218.649902  3218.649902   
2021-06-02  3223.100098  3235.000000  3208.000000  3233.989990  3233.989990   
2021-06-03  3204.229980  3214.439941  3184.030029  3187.010010  3187.010010   
2021-06-04  3212.00000

## Preprocessing

1. Apply MinMaxScalar, or other normalizing methods of your choice. Since we only use the close prices, we can directly 
apply it before splitting into training and testing

In [5]:
closes = data['Close'].values
scaler = MinMaxScaler()
closes_scaled = scaler.fit_transform(closes.reshape(-1, 1))
closes_scaled = closes_scaled.reshape(-1)

In [6]:
print(closes[:5])
print(closes_scaled[:5])

[1.95833302 1.72916698 1.70833302 1.63541698 1.42708302]
[1.59345999e-04 9.44274388e-05 8.85255550e-05 6.78697723e-05
 8.85255550e-06]


2. Create rolling prices. Continuously create arrays of a set window size and append to a new dataframe

In [7]:
num_rolling = 30

#Create the 31 day rolling prices
header = ["day " + str(i) for i in range(1, (num_rolling + 1 + 1))]
df = []
for i in range(num_rolling, len(closes)):
    arr = []
    for j in range(i-num_rolling, i+1):
        arr.append(closes_scaled[j])
    df.append(np.array(arr))
df = pd.DataFrame(df, columns=header)

In [8]:
print(df.head)

<bound method NDFrame.head of          day 1     day 2     day 3     day 4     day 5     day 6     day 7  \
0     0.000159  0.000094  0.000089  0.000068  0.000009  0.000000  0.000030   
1     0.000094  0.000089  0.000068  0.000009  0.000000  0.000030  0.000053   
2     0.000089  0.000068  0.000009  0.000000  0.000030  0.000053  0.000038   
3     0.000068  0.000009  0.000000  0.000030  0.000053  0.000038  0.000031   
4     0.000009  0.000000  0.000030  0.000053  0.000038  0.000031  0.000030   
...        ...       ...       ...       ...       ...       ...       ...   
6019  0.962604  0.954834  0.944261  0.952004  0.936995  0.946015  0.965312   
6020  0.954834  0.944261  0.952004  0.936995  0.946015  0.965312  0.967700   
6021  0.944261  0.952004  0.936995  0.946015  0.965312  0.967700  0.979335   
6022  0.952004  0.936995  0.946015  0.965312  0.967700  0.979335  0.982963   
6023  0.936995  0.946015  0.965312  0.967700  0.979335  0.982963  0.981861   

         day 8     day 9    day 1

3. Split into Cells in LSTM format. Specfically, for x you should have data of shape (samples, window_size, feature_size)
Create both x and y.

In [9]:
data_x = []
data_y = []
def saveToLSTMData(x):
    datapoint = []
    for i in range(1, num_rolling + 1):
        timestamp = []
        timestamp.append(x['day ' + str(i)])
        datapoint.append(timestamp)
    data_x.append(datapoint)
    data_y.append(x['day ' + str(num_rolling + 1)])

df.apply(lambda x : saveToLSTMData(x), axis=1)
data_x = np.array(data_x)
data_y = np.array(data_y)

In [10]:
print(data_x.shape)
print(data_y.shape)

(6024, 30, 1)
(6024,)


4. Split the data into training and testing.

In [11]:
testing_split = 0.2
testing_split = int(len(data_x)*testing_split)
train_x = data_x [:-testing_split]
test_x = data_x[-testing_split:]
train_y = data_y [:-testing_split]
test_y = data_y[-testing_split:]

## Train LSTM Model

1. Build the model. Select favorable set of parameters. A good rule of thumb for the number of hidden nodes is:
    Hidden_Nodes = (Number of Samples) / (alpha * (input_size + output_size)), where alpha can be some number from 2-10.

In [12]:
dim = data_x.shape

# Create Model
model = Sequential()
dense_nodes = 5
hidden_nodes = int(len(train_x) / (2 * (dense_nodes + num_rolling)))
print("Suggested Number of Hidden Node is:", hidden_nodes)
model.add(LSTM(hidden_nodes, return_sequences = False, input_shape = (dim[1], dim[2])))
model.add(Dropout(0.2))
model.add(Dense(dense_nodes))
model.add(Dense(1)) # 1 output: Price

Suggested Number of Hidden Node is: 68


In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 68)                19040     
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 68)                0         
_________________________________________________________________
dense (Dense)                (None, 5)                 345       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6         
Total params: 19,391
Trainable params: 19,391
Non-trainable params: 0
_________________________________________________________________


2. Train the model. Record the test loss along the training process.


In [None]:
# Train
epochs = 100
train_scores = []
test_scores = []
train_loss = LambdaCallback(on_epoch_end=lambda batch, logs: train_scores.append(logs['loss']))
earlystopper = EarlyStopping(monitor='loss', patience=epochs/10)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-8), loss='mae', metrics=[RootMeanSquaredError()])
test_loss = LambdaCallback(on_epoch_end=lambda batch, logs: test_scores.append(model.evaluate(test_x, test_y)[0]))
model.fit(train_x, train_y, batch_size=50, epochs=epochs, callbacks=[train_loss, test_loss, earlystopper])

3. Prediction. Need to inverse the scaling applied at the begining.

In [None]:
result = model.evaluate(test_x,test_y)[1]
predictions = model.predict(test_x)
predictions = scaler.inverse_transform(predictions)
test_y = scaler.inverse_transform(test_y.reshape(-1,1))
print(result)

 
Learning Curve

In [None]:
plt.figure()
plt.title("Testing RMSE: " + str(result))
plt.grid()
plt.suptitle("Learning Curve")
plt.ylabel("loss")
plt.xlabel("epochs")
plt.plot(np.linspace(0,len(train_scores),len(train_scores)), train_scores, linewidth=1, color="r",
         label="Training loss")
plt.plot(np.linspace(0,len(test_scores),len(test_scores)), test_scores, linewidth=1, color="b",
          label="Testing loss")
legend = plt.legend(loc='upper right', shadow=True, fontsize='medium')
legend.get_frame().set_facecolor('C0')

plt.show()

plt.clf()

 
Prediction Vs. Actual

In [None]:
plt.title("Predicted vs Actual")
plt.grid()
plt.ylabel("value")
plt.xlabel("samples")
plt.plot(np.linspace(0,len(predictions),len(predictions)), predictions, linewidth=1, color="r",
         label="Predictions")
plt.plot(np.linspace(0,len(test_y),len(test_y)), test_y, linewidth=1, color="b",
          label="Actuals")
legend = plt.legend(loc='upper right', shadow=True, fontsize='medium')
legend.get_frame().set_facecolor('C0')

plt.show()

## Conclusion

This seems like a good prediction scheme at first glance, however, it really did not learn anythin that useful. The model
learns to simply adhere to the previous day's price since it's probably gonna be super close. Therefore, this should not
be used as a tool to actually predict.