In [1]:
import yfinance as yf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor



2023-08-10 14:40:01.543494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


1. get data
2. prepare data
3. standardize data
4. train/test split
5. build training and predicting pipeline: that allows to quickly train and evaluate models

In [2]:
#set variables, we start with Disney stock
stock_symbol = 'DIS'
start_date = '2022-01-01'
end_date = '2023-07-30'

In [3]:
df=yf.download(stock_symbol, start_date, end_date)

[*********************100%***********************]  1 of 1 completed


In [4]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-03,155.830002,157.559998,155.360001,156.759995,156.759995,10222800
2022-01-04,158.589996,160.320007,155.550003,155.729996,155.729996,16582000
2022-01-05,156.520004,159.380005,155.100006,155.190002,155.190002,12272100
2022-01-06,156.240005,157.770004,153.679993,156.899994,156.899994,11095300
2022-01-07,156.899994,159.300003,156.289993,157.830002,157.830002,9554600


In [5]:
#delete nan if any
df = df[['Open', 'High', 'Low', 'Volume', 'Adj Close']].dropna()

In [6]:
# Separate features and target variables, 
#leave 'close' out, as it is the same as 'adj close' in most cases and could bias the model 
X = df[['Open', 'High', 'Low', 'Volume']]
y = df['Adj Close']

In [7]:
#set cutoff date for training and testing data
test_size = 60


In [8]:
# Split scaled data into training,  and testing sets
X_train = X[:-test_size]
X_test = X[-test_size:]
y_train = y[:-test_size]
y_test = y[-test_size:]

In [9]:
# Standardize features using MinMaxScaler - not necessary in this case as all features are in same scale
#but will keep this step as it transform the data into numpy array, which is the format needed for ML
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
#only apply transform test data so that data is not leaked
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)

In [29]:
#reshape data for lstm: LSTM models expect input data in a 3D shape (samples, time steps, features)
#time steps: is the number of times periods in the dataset(days, months...)that you want to take as input for prediction
# Reshape X_train_scaled

X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))

# Reshape X_test_scaled
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))


AttributeError: 'DataFrame' object has no attribute 'reshape'

In [22]:
#build and train the LSTM Model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(Dense(50)) #activation='relu'))
model.add(Dense(1))


In [12]:
# Define the optimizer with a learning rate
optimizer = Adam(learning_rate=0.001)  # Adjust the learning rate as needed

In [23]:
# Compile the model
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])

In [24]:
# Train the model with validation data and early stopping
model.fit(X_train_reshaped, y_train,
                    epochs=500,
                    batch_size=50,
                    verbose=2)  # Early stopping


Epoch 1/500
7/7 - 1s - loss: 12612.5020 - mean_absolute_error: 110.7262 - 958ms/epoch - 137ms/step
Epoch 2/500
7/7 - 0s - loss: 12142.7754 - mean_absolute_error: 108.6586 - 19ms/epoch - 3ms/step
Epoch 3/500
7/7 - 0s - loss: 9978.2549 - mean_absolute_error: 97.9585 - 19ms/epoch - 3ms/step
Epoch 4/500
7/7 - 0s - loss: 5944.3516 - mean_absolute_error: 69.1051 - 18ms/epoch - 3ms/step
Epoch 5/500
7/7 - 0s - loss: 3781.5950 - mean_absolute_error: 51.5285 - 18ms/epoch - 3ms/step
Epoch 6/500
7/7 - 0s - loss: 3603.2231 - mean_absolute_error: 53.5754 - 18ms/epoch - 3ms/step
Epoch 7/500
7/7 - 0s - loss: 3056.9631 - mean_absolute_error: 46.7836 - 17ms/epoch - 2ms/step
Epoch 8/500
7/7 - 0s - loss: 2795.5774 - mean_absolute_error: 42.2256 - 17ms/epoch - 2ms/step
Epoch 9/500
7/7 - 0s - loss: 2630.8071 - mean_absolute_error: 40.3053 - 17ms/epoch - 2ms/step
Epoch 10/500
7/7 - 0s - loss: 2453.2881 - mean_absolute_error: 39.0687 - 18ms/epoch - 3ms/step
Epoch 11/500
7/7 - 0s - loss: 2319.6519 - mean_absol

Epoch 89/500
7/7 - 0s - loss: 111.0162 - mean_absolute_error: 7.1942 - 17ms/epoch - 2ms/step
Epoch 90/500
7/7 - 0s - loss: 108.4906 - mean_absolute_error: 7.0261 - 17ms/epoch - 2ms/step
Epoch 91/500
7/7 - 0s - loss: 105.7346 - mean_absolute_error: 6.9299 - 17ms/epoch - 2ms/step
Epoch 92/500
7/7 - 0s - loss: 103.1644 - mean_absolute_error: 6.9373 - 17ms/epoch - 2ms/step
Epoch 93/500
7/7 - 0s - loss: 101.1023 - mean_absolute_error: 6.7268 - 17ms/epoch - 2ms/step
Epoch 94/500
7/7 - 0s - loss: 97.4409 - mean_absolute_error: 6.7044 - 17ms/epoch - 2ms/step
Epoch 95/500
7/7 - 0s - loss: 94.3032 - mean_absolute_error: 6.6270 - 17ms/epoch - 2ms/step
Epoch 96/500
7/7 - 0s - loss: 91.0931 - mean_absolute_error: 6.3084 - 17ms/epoch - 2ms/step
Epoch 97/500
7/7 - 0s - loss: 87.4494 - mean_absolute_error: 6.4618 - 17ms/epoch - 2ms/step
Epoch 98/500
7/7 - 0s - loss: 83.5892 - mean_absolute_error: 6.1688 - 16ms/epoch - 2ms/step
Epoch 99/500
7/7 - 0s - loss: 79.4222 - mean_absolute_error: 6.0528 - 17ms/

Epoch 178/500
7/7 - 0s - loss: 5.7161 - mean_absolute_error: 1.7783 - 16ms/epoch - 2ms/step
Epoch 179/500
7/7 - 0s - loss: 5.5522 - mean_absolute_error: 1.7484 - 17ms/epoch - 2ms/step
Epoch 180/500
7/7 - 0s - loss: 5.5161 - mean_absolute_error: 1.7491 - 16ms/epoch - 2ms/step
Epoch 181/500
7/7 - 0s - loss: 5.5125 - mean_absolute_error: 1.7506 - 17ms/epoch - 2ms/step
Epoch 182/500
7/7 - 0s - loss: 5.5640 - mean_absolute_error: 1.7539 - 17ms/epoch - 2ms/step
Epoch 183/500
7/7 - 0s - loss: 5.5313 - mean_absolute_error: 1.7500 - 18ms/epoch - 3ms/step
Epoch 184/500
7/7 - 0s - loss: 5.5472 - mean_absolute_error: 1.7511 - 17ms/epoch - 2ms/step
Epoch 185/500
7/7 - 0s - loss: 5.5738 - mean_absolute_error: 1.7713 - 17ms/epoch - 2ms/step
Epoch 186/500
7/7 - 0s - loss: 5.4510 - mean_absolute_error: 1.7491 - 17ms/epoch - 2ms/step
Epoch 187/500
7/7 - 0s - loss: 5.3406 - mean_absolute_error: 1.7206 - 17ms/epoch - 2ms/step
Epoch 188/500
7/7 - 0s - loss: 5.3039 - mean_absolute_error: 1.7241 - 17ms/epoch

7/7 - 0s - loss: 4.2904 - mean_absolute_error: 1.5697 - 16ms/epoch - 2ms/step
Epoch 268/500
7/7 - 0s - loss: 4.3244 - mean_absolute_error: 1.5742 - 16ms/epoch - 2ms/step
Epoch 269/500
7/7 - 0s - loss: 4.3236 - mean_absolute_error: 1.5918 - 16ms/epoch - 2ms/step
Epoch 270/500
7/7 - 0s - loss: 4.3414 - mean_absolute_error: 1.5783 - 17ms/epoch - 2ms/step
Epoch 271/500
7/7 - 0s - loss: 4.2805 - mean_absolute_error: 1.5716 - 17ms/epoch - 2ms/step
Epoch 272/500
7/7 - 0s - loss: 4.2967 - mean_absolute_error: 1.5707 - 16ms/epoch - 2ms/step
Epoch 273/500
7/7 - 0s - loss: 4.3954 - mean_absolute_error: 1.6010 - 17ms/epoch - 2ms/step
Epoch 274/500
7/7 - 0s - loss: 4.4230 - mean_absolute_error: 1.6030 - 16ms/epoch - 2ms/step
Epoch 275/500
7/7 - 0s - loss: 4.3837 - mean_absolute_error: 1.5780 - 16ms/epoch - 2ms/step
Epoch 276/500
7/7 - 0s - loss: 4.2820 - mean_absolute_error: 1.5729 - 17ms/epoch - 2ms/step
Epoch 277/500
7/7 - 0s - loss: 4.3076 - mean_absolute_error: 1.5927 - 17ms/epoch - 2ms/step
Ep

Epoch 357/500
7/7 - 0s - loss: 3.8048 - mean_absolute_error: 1.5035 - 17ms/epoch - 2ms/step
Epoch 358/500
7/7 - 0s - loss: 3.8485 - mean_absolute_error: 1.4935 - 17ms/epoch - 2ms/step
Epoch 359/500
7/7 - 0s - loss: 3.8251 - mean_absolute_error: 1.4962 - 17ms/epoch - 2ms/step
Epoch 360/500
7/7 - 0s - loss: 3.7534 - mean_absolute_error: 1.4821 - 16ms/epoch - 2ms/step
Epoch 361/500
7/7 - 0s - loss: 3.7723 - mean_absolute_error: 1.4878 - 17ms/epoch - 2ms/step
Epoch 362/500
7/7 - 0s - loss: 3.7568 - mean_absolute_error: 1.4838 - 17ms/epoch - 2ms/step
Epoch 363/500
7/7 - 0s - loss: 3.7270 - mean_absolute_error: 1.4760 - 17ms/epoch - 2ms/step
Epoch 364/500
7/7 - 0s - loss: 3.7758 - mean_absolute_error: 1.4875 - 16ms/epoch - 2ms/step
Epoch 365/500
7/7 - 0s - loss: 3.7674 - mean_absolute_error: 1.4864 - 17ms/epoch - 2ms/step
Epoch 366/500
7/7 - 0s - loss: 3.7568 - mean_absolute_error: 1.4758 - 17ms/epoch - 2ms/step
Epoch 367/500
7/7 - 0s - loss: 3.7399 - mean_absolute_error: 1.4831 - 17ms/epoch

7/7 - 0s - loss: 3.4671 - mean_absolute_error: 1.4287 - 17ms/epoch - 2ms/step
Epoch 447/500
7/7 - 0s - loss: 3.4827 - mean_absolute_error: 1.4230 - 17ms/epoch - 2ms/step
Epoch 448/500
7/7 - 0s - loss: 3.4686 - mean_absolute_error: 1.4273 - 17ms/epoch - 2ms/step
Epoch 449/500
7/7 - 0s - loss: 3.4930 - mean_absolute_error: 1.4324 - 17ms/epoch - 2ms/step
Epoch 450/500
7/7 - 0s - loss: 3.5823 - mean_absolute_error: 1.4724 - 17ms/epoch - 2ms/step
Epoch 451/500
7/7 - 0s - loss: 3.6021 - mean_absolute_error: 1.4697 - 16ms/epoch - 2ms/step
Epoch 452/500
7/7 - 0s - loss: 3.4443 - mean_absolute_error: 1.4243 - 17ms/epoch - 2ms/step
Epoch 453/500
7/7 - 0s - loss: 3.5181 - mean_absolute_error: 1.4413 - 17ms/epoch - 2ms/step
Epoch 454/500
7/7 - 0s - loss: 3.5137 - mean_absolute_error: 1.4437 - 17ms/epoch - 2ms/step
Epoch 455/500
7/7 - 0s - loss: 3.4566 - mean_absolute_error: 1.4308 - 16ms/epoch - 2ms/step
Epoch 456/500
7/7 - 0s - loss: 3.4599 - mean_absolute_error: 1.4321 - 17ms/epoch - 2ms/step
Ep

<keras.callbacks.History at 0x7fd48028bd00>

#plot training history
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [25]:
#make predictions and evaluate model
y_pred = model.predict(X_test_reshaped)



In [26]:
y_test.shape


(60,)

In [27]:
y_pred_flat = y_pred.flatten()

In [28]:
#evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

ValueError: Found array with dim 3. None expected <= 2.

In [18]:
#define a function to calculate MAPE, my objective is to achieve +/- 5% accuracy
def mean_absolute_percentage_error(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100


In [19]:
mape = mean_absolute_percentage_error(y_test, y_pred)
mape

4.55570928383129

In [44]:
y_test, y_pred

(Date
 2023-06-15    92.940002
 2023-06-16    91.320000
 2023-06-20    89.750000
 2023-06-21    88.639999
 2023-06-22    88.489998
 2023-06-23    88.099998
 2023-06-26    88.699997
 2023-06-27    89.059998
 2023-06-28    88.830002
 2023-06-29    88.949997
 2023-06-30    89.279999
 2023-07-03    90.500000
 2023-07-05    89.790001
 2023-07-06    88.739998
 2023-07-07    88.639999
 2023-07-10    88.099998
 2023-07-11    89.489998
 2023-07-12    90.150002
 2023-07-13    90.470001
 2023-07-14    88.620003
 2023-07-17    85.559998
 2023-07-18    85.949997
 2023-07-19    87.040001
 2023-07-20    86.209999
 2023-07-21    87.180000
 2023-07-24    86.599998
 2023-07-25    85.629997
 2023-07-26    85.860001
 2023-07-27    85.360001
 2023-07-28    86.129997
 Name: Adj Close, dtype: float64,
 array([[92.645164],
        [93.11324 ],
        [90.739044],
        [89.72122 ],
        [88.35033 ],
        [87.50226 ],
        [88.055435],
        [89.01842 ],
        [89.41308 ],
        [87.54196 ],
