In [1]:
# !pip install pandas_datareader
# !pip install yfinance

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas_datareader as pdr
import yfinance as yf

2023-10-19 23:19:18.747830: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-19 23:19:18.747979: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-19 23:19:18.750426: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-19 23:19:18.948343: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [43]:
data = yf.download("AAPL", start="2013-10-16", end="2023-10-16").reset_index()
print("Data Shape after load: ", data.shape)

[*********************100%%**********************]  1 of 1 completed
Data Shape after load:  (2516, 7)


In [44]:
import sklearn.preprocessing as preprocessing

# data cleaning
data = data.ffill()
#check na
if data.isna().sum().sum() != 0:
    print("There are still NA values")
    data = data.bfill()

data = data.drop_duplicates()
data = data.reset_index(drop=True)
print("Data Shape after cleaning: ", data.shape)

cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

# perform scaling
scaler = preprocessing.MinMaxScaler()
data[cols] = scaler.fit_transform(data[cols])

# remove outliers
for col in cols:
    interQuartileRange = data[col].quantile(0.75) - data[col].quantile(0.25)
    threshold = 3 * interQuartileRange
    data = data[data[col] <= threshold]
    
print("Data Shape after removing outliers: ", data.shape)

Data Shape after cleaning:  (2516, 7)
Data Shape after removing outliers:  (2381, 7)


In [64]:
numberOfInputDays = 30
numberOfDaysToPredict = 30

# create sequence
def create_sequence(data, numberOfInputDays, numberOfDaysToPredict):
    x = []
    y = []
    for i in range(numberOfInputDays, data.shape[0] - numberOfDaysToPredict):
        x.append(data.iloc[i-numberOfInputDays:i, 1:len(cols)+1])
        y.append(data.iloc[i:i+numberOfDaysToPredict, 1:len(cols)+1])
    return np.array(x), np.array(y)

X, Y = create_sequence(data, numberOfInputDays, numberOfDaysToPredict)

print("X Shape: ", X.shape)
print("Y Shape: ", Y.shape)

# print("First Sequence i.e 30 Days: X[0]: \n", X[0]) # first sequence i.e 30 days
# print("First day of first sequence: X[0][0]: \n", X[0][0]) # first day of first sequence
# print("First feature of first day of first sequence: X[0][0][0]: \n", X[0][0][0]) # first feature of first day of first sequence

# print("First sequence i.e 7 days: Y[0]: \n", Y[0]) # first sequence i.e 7 days
# print("First day of first sequence:Y[0][0]: \n", Y[0][0]) # first day of first sequence
# print("First feature of first day of first sequence: Y[0][0][0]: \n", Y[0][0][0]) # first feature of first day of first sequence


X Shape:  (2321, 30, 6)
Y Shape:  (2321, 30, 6)


In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=False, test_size=0.2)

print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)
print("Y_train Shape: ", Y_train.shape)
print("Y_test Shape: ", Y_test.shape)


X_train Shape:  (1856, 30, 6)
X_test Shape:  (465, 30, 6)
Y_train Shape:  (1856, 30, 6)
Y_test Shape:  (465, 30, 6)


In [70]:
# apply RNN
model = keras.Sequential()
model.add(keras.layers.LSTM(units=128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(units=X_train.shape[2], activation='relu')))
model.compile(loss='mean_squared_error', optimizer='adam')


In [71]:
def trainModel(model, X_train, Y_train, X_test, Y_test, epochs=50, batch_size=32):
    # fix input shape
    # X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2])
    # X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2])
    
    # Y_test = Y_test.reshape(Y_test.shape[0], Y_test.shape[1], Y_test.shape[2])
    # Y_train = Y_train.reshape(Y_train.shape[0], Y_train.shape[1], Y_train.shape[2])
    with tf.device('/device:GPU:0'):
        history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, shuffle=False)
    return history

In [72]:
trainModel(model, X_train, Y_train, X_test, Y_test, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f4afda66350>

In [36]:
# hyperparameters tuning
# param_grid = {
#     'batch_size': [32, 64, 128],
#     'epochs': [10, 50, 100],
#     'learning_rate': [0.1, 0.01, 0.001],
#     'optimizer': ['adam', 'rmsprop']
# }

# # kerasClassifier = keras.wrappers.scikit_learn.KerasClassifier(model, verbose=0)


# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(estimator=kerasClassifier, param_grid=param_grid, cv=3)
# grid_result = grid.fit(X_train, Y_train)

# # print best parameters
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


In [24]:
# feature engineering
# data['Day'] = data['Date'].dt.day
# data['Month'] = data['Date'].dt.month
# data['year'] = data['Date'].dt.year
# data = data.drop(['Date'], axis=1)
# data = data.reset_index(drop=True)

# print("Data Shape after feature engineering: ", data.shape)
