In [1]:
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import PCA
from tensorflow.keras.layers import LSTM, Dense, Dropout, Reshape
from matplotlib import pyplot as plt
import pandas_ta as ta
import plotly.tools as tls
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from datetime import date
from datetime import timedelta
from tensorflow.python.keras.models import load_model
from sklearn.metrics import mean_squared_error

In [2]:
# Enable on-demand GPU memory allocation instead of letting TensorFlow reserve all GPU memory up front.
# The list is empty on CPU-only machines, so iterate rather than indexing [0] (which raised IndexError).
# Every visible GPU gets the same setting, since TensorFlow expects memory growth to be consistent
# across GPUs.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [3]:
# Exchange ticker symbol used for the data download, and the company's display name used in
# plot titles and printed messages.
ticker = 'WMT'
stock_name = "Walmart"

In [4]:
# Download daily price data for `ticker` from Yahoo Finance via pandas-datareader.
# The window runs from 365 * 10 days ago (roughly ten years; leap days are ignored) up to yesterday.
df_stockData = pdr.DataReader(ticker, data_source='yahoo', start=str(date.today() - timedelta(days=365 * 10)),
                              end=str(date.today() - timedelta(days=1)))
# Add technical indicators through the pandas_ta DataFrame accessor (`.ta`).
# `append=True` asks pandas_ta to add each indicator's output column(s) to df_stockData itself.
# Exponential Moving Average (EMA) of the close over 3 periods
df_stockData.ta.ema(close='Close', length=3, append=True)
# Relative Strength Index (RSI) of the close over 7 periods
df_stockData.ta.rsi(close='Close', length=7, append=True)
# Average Directional Index (ADX) over 3 periods, computed from high/low/close
df_stockData.ta.adx(high='High', low='Low', close='Close', length=3, append=True)
# Moving Average Convergence Divergence (MACD) with the library's default periods
df_stockData.ta.macd(close='Close', append=True)
# On-Balance Volume (OBV) from close and volume
df_stockData.ta.obv(close='Close', volume='Volume', append=True)
# Daily percent return (1-period)
df_stockData.ta.percent_return(length=1, append=True)
# SMI indicator with the library's default periods.
# NOTE(review): the original comment called this "Stochastic Momentum Index"; pandas_ta documents
# `smi` as the SMI Ergodic indicator -- confirm which one is intended.
df_stockData.ta.smi(close='Close', append=True)
# Average of open, high, low and close price (OHLC4)
df_stockData.ta.ohlc4(open='Open', high='High', low='Low', close='Close', append=True)

# Drop every row that has a NaN in any column (presumably the leading rows where the indicators
# do not yet have enough history). This mutates df_stockData in place.
df_stockData.dropna(inplace=True)

In [5]:
def performRFE():
    """Select features for predicting ``Close`` with cross-validated recursive feature elimination.

    Reads the module-level ``df_stockData`` and ``ticker``. Every column except ``Close`` is
    standardised and passed to an ``RFECV`` wrapped around a ``RandomForestRegressor``
    (5-fold CV, R^2 scoring, two features removed per step). The importances of the surviving
    features are drawn as a horizontal bar chart.

    Returns:
        list: names of the selected features, ordered from most to least important.
    """
    # Candidate predictors: every column except the prediction target
    candidate_cols = list(df_stockData.columns)
    candidate_cols.remove('Close')

    selector = RFECV(
        estimator=RandomForestRegressor(),
        step=2,
        min_features_to_select=1,
        cv=5,
        scoring="r2",
        n_jobs=-1,
    )
    scaled_candidates = StandardScaler().fit_transform(df_stockData.loc[:, candidate_cols])
    close_target = np.array(df_stockData['Close'].values).reshape(-1, )
    selector.fit(scaled_candidates, close_target)

    # `support_` masks the surviving columns; the refitted estimator holds one importance per survivor
    ranking = pd.DataFrame({
        'Features': list(df_stockData.loc[:, candidate_cols].columns[selector.support_]),
        'Importance': selector.estimator_.feature_importances_,
    })
    ranking = ranking.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(8, 4))
    plt.barh(y=ranking['Features'], width=ranking['Importance'], color='green')
    plt.title("RFE Features importance score for {}".format(ticker))
    plt.xlabel("Importance score")
    plt.show()

    # Selected feature names, most important first
    return list(ranking['Features'].values)



In [None]:
# Keep at most the two most important features reported by RFE (the list is already sorted by
# importance), then append the target column so past closes are part of the model input.
selected_features = performRFE()[:2]
selected_features.append('Close')
print("Selected Features: {}".format(selected_features))

# Scale inputs and target to the range (0, 1). The target gets its own scaler so that
# predictions can be mapped back to prices later with scaler_y.inverse_transform.
# NOTE(review): both scalers are fit on every row, including the final days that are later
# held out for testing -- consider fitting on the training rows only.
scaler_x = MinMaxScaler(feature_range=(0, 1))
scaler_y = MinMaxScaler(feature_range=(0, 1))
df_stockData_x = scaler_x.fit_transform(df_stockData[selected_features])
df_stockData_y = scaler_y.fit_transform(df_stockData['Close'].to_numpy().reshape(-1, 1))

In [7]:
# Project the scaled features onto principal components, keeping as many components as are
# needed to explain 95% of the variance (a float n_components between 0 and 1 is a
# variance-ratio threshold, not a component count).
# NOTE: this cell overwrites df_stockData_x, so running it a second time applies PCA to the
# already-projected data; re-run the scaling cell first if this cell must be re-executed.
pca = PCA(n_components=0.95)
df_stockData_x = pca.fit_transform(df_stockData_x)

In [8]:
# Prepare data with a rolling window: `numPastDays` rows of input -> `numFutureDays` rows of output
def prepData(data_x, data_y, num_past_days=None, num_future_days=None):
    """Slice aligned feature/target arrays into supervised rolling-window samples.

    For every position ``i`` with a full history and a full horizon available, one sample is
    produced: the ``past`` feature rows before ``i`` as input and the ``future`` target rows
    starting at ``i`` as output.

    Args:
        data_x: 2-D array-like of shape (n_rows, n_features) holding the model inputs.
        data_y: array-like whose first axis has the same length as ``data_x`` (the targets).
        num_past_days: input window length. When ``None`` the module-level ``numPastDays``
            is used, which keeps existing two-argument calls working.
        num_future_days: output window length. When ``None`` the module-level
            ``numFutureDays`` is used.

    Returns:
        tuple: ``(x, y)`` numpy arrays. ``x`` has shape (n_samples, past, n_features) and ``y``
        has shape (n_samples, future, ...). When the data is too short for a single window,
        both arrays have zero samples but keep the remaining dimensions.

    Raises:
        ValueError: if a window length is smaller than 1 or the inputs differ in length.
    """
    # Fall back to the notebook-level settings only when no explicit window size is given
    past = numPastDays if num_past_days is None else num_past_days
    future = numFutureDays if num_future_days is None else num_future_days
    if past < 1 or future < 1:
        raise ValueError("window lengths must be >= 1, got past={} future={}".format(past, future))

    data_x = np.asarray(data_x)
    data_y = np.asarray(data_y)
    if len(data_x) != len(data_y):
        raise ValueError(
            "data_x and data_y must have the same number of rows, got {} and {}".format(len(data_x), len(data_y)))

    # i is the index of the first predicted row; it needs `past` rows before it and `future` rows from it
    window_starts = range(past, len(data_x) - future + 1)
    if len(window_starts) == 0:
        # Keep the trailing dimensions so callers can still inspect .shape[1:] on an empty result
        return (np.empty((0, past) + data_x.shape[1:], dtype=data_x.dtype),
                np.empty((0, future) + data_y.shape[1:], dtype=data_y.dtype))

    x = np.array([data_x[i - past:i, :] for i in window_starts])
    y = np.array([data_y[i:i + future] for i in window_starts])
    return x, y

In [9]:
# Use the prior 21 days of inputs to predict the close price of the next 3 days
numPastDays, numFutureDays = 21, 3
# Hold out the last `numFutureDays` days as the test target; every earlier row is training data
train_size = len(df_stockData) - numFutureDays
if train_size < numPastDays + numFutureDays:
    # A negative slice start below would silently wrap around, so fail loudly instead
    raise ValueError("Not enough rows ({}) to build a single training window".format(len(df_stockData)))
x_train = df_stockData_x[0:train_size]
y_train = df_stockData_y[0:train_size]
# The test slice is exactly the last input window plus the held-out days, which yields a single
# test sample. Starting the slice any earlier would add windows whose targets are days already
# used as training targets, contaminating the evaluation on the test set.
x_test = df_stockData_x[train_size - numPastDays:]
y_test = df_stockData_y[train_size - numPastDays:]
x_train, y_train = prepData(x_train, y_train)
x_test, y_test = prepData(x_test, y_test)

In [None]:
# Build a stacked LSTM that maps one input window to the predicted values for the future days.
# Output dimensions are taken from the prepared targets: (future steps, values per step).
n_future_steps, n_targets = y_train.shape[1], y_train.shape[2]
model = Sequential()
model.add(
    LSTM(200, activation='tanh', recurrent_activation='sigmoid', input_shape=(x_train.shape[1], x_train.shape[2]),
         return_sequences=True))
model.add(LSTM(100, activation='tanh', recurrent_activation='sigmoid', return_sequences=False))
model.add(Dense(50))
# One unit per (future step, target value) pair, so the Reshape below is valid for any number of
# target columns (the previous Dense(y_train.shape[1]) only matched when there was exactly one).
model.add(Dense(n_future_steps * n_targets))
model.add(Reshape((n_future_steps, n_targets)))
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=[tf.keras.metrics.MeanSquaredError()])
model.summary()

In [None]:
# Save the model with the lowest validation loss to "model.h5" so that a later, overfitted epoch
# is not the one that gets used. The deprecated `period` argument is omitted: the callback
# already checks once per epoch by default.
checkpoint = ModelCheckpoint("model.h5", monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False,
                             mode='auto')
# Multiply the learning rate by 0.2 when the validation loss has not improved for 5 epochs,
# never going below 1e-5
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001, mode='auto')

# Train for 30 epochs with 20% of the training samples set aside for validation
history = model.fit(x_train, y_train, epochs=30, batch_size=5, validation_split=0.2, verbose=1,
                    callbacks=[checkpoint, lr_reducer])

In [None]:
# Load the checkpoint with the lowest validation loss.
# The public tf.keras loader is used so the restored model comes from the same Keras
# implementation that built and saved it; the `load_model` imported from the private
# `tensorflow.python.keras` package is a separate code path and can fail on tf.keras checkpoints.
model = tf.keras.models.load_model("model.h5")
# Evaluate loss on the test set
model.evaluate(x_test, y_test)

In [13]:
# Predict on the test windows, then map the last window's prediction and its target back from
# the (0, 1) scale to prices. The scaler expects a column vector, hence the (-1, 1) reshape.
prediction_results = model.predict(x_test)
unscaled_target = scaler_y.inverse_transform(np.reshape(y_test[-1], (-1, 1)))
unscaled_pred = scaler_y.inverse_transform(np.reshape(prediction_results[-1], (-1, 1)))

In [None]:
# Build a table of actual vs. predicted close prices for the held-out days.
# `.copy()` makes comparedDF an independent frame, so adding a column neither triggers a
# chained-assignment warning nor risks writing into df_stockData.
comparedDF = df_stockData[['Close']].iloc[train_size:].copy()
# unscaled_pred is a column vector; flatten it to one value per row
comparedDF['Predicted'] = np.asarray(unscaled_pred).ravel()
comparedDF

In [None]:
# Plot the tail of the training closes followed by the actual and predicted closes of the
# held-out days. The training slice runs one row past train_size, so it also covers the first
# held-out day.
prediction_graph = plt.figure(figsize=(16, 8))
recent_training_close = df_stockData['Close'].iloc[train_size - 20:train_size + 1]
plt.plot(recent_training_close)
plt.plot(comparedDF[['Close', 'Predicted']])
plt.title("Prediction results for {}({})".format(stock_name, ticker))
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price (USD)', fontsize=18)
plt.legend(['Training', 'Test (actual)', 'Predictions'], loc='lower left')
plt.show()
# Convert the matplotlib figure to a plotly figure (rebinding prediction_graph) and save it as PNG.
# NOTE(review): write_image presumably needs a plotly static-image backend to be installed -- confirm.
prediction_graph = tls.mpl_to_plotly(prediction_graph)
prediction_graph.write_image("{}_prediction.png".format(ticker))

In [None]:
# View the Root Mean Square Error (RMSE) between actual and predicted prices.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(unscaled_target, unscaled_pred))
print("RMSE : {}".format(rmse))

In [None]:
# Call the outlook bullish when the predicted closes rise over the horizon AND the last predicted
# close is above the last known (training) close; otherwise report bearish.
predicted_closes = np.asarray(unscaled_pred).ravel()
# Look the close up by column name rather than by position 3, which only worked while 'Close'
# happened to be the fourth column of the downloaded frame.
last_known_close = float(df_stockData['Close'].iloc[train_size - 1])
first_predicted_close = float(predicted_closes[0])
final_predicted_close = float(predicted_closes[numFutureDays - 1])
if first_predicted_close < final_predicted_close and final_predicted_close > last_known_close:
    print("Bullish phase predicted for {}".format(stock_name))
else:
    print("Bearish phase predicted for {}".format(stock_name))