In [73]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import InputLayer, LSTM, Dense, Conv1D, Flatten, GRU, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError, mean_squared_error as mse, mean_absolute_percentage_error as mape, mean_absolute_error as mae
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras import regularizers
from tpot import TPOTRegressor
from bayes_opt import BayesianOptimization
import absl.logging

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn import metrics
from statsmodels.tsa.seasonal import seasonal_decompose
from tempfile import TemporaryFile

import os
import re
import time
import datetime
import statistics
import random
import pandas as pd
import seaborn as sns
import keras_tuner as kt
from pandas_datareader import data as pdr
from datetime import date, timedelta
from copy import deepcopy
import yfinance as yf
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px  # (version 4.7.0 or higher)
import plotly.graph_objects as go

In [74]:
# global variables
# Look-back window in calendar days; passed as `days` to load_frame.
timeframe = 9000
# Truthy -> FeatureDropper.transform also runs remove_correlations_PCA.
enable_pca = 0
# Truthy -> FeatureScaler uses StandardScaler, otherwise MinMaxScaler.
standard_scaling = 0
# NOTE(review): train_model/train_model_tpot receive future_window as an argument,
# so this module-level value looks unused in the visible cells — confirm.
future_window = 10
# Number of consecutive rows per input window (the win_size argument of create_dataset*).
win_size = 5
# Used as max_epochs for the Hyperband tuner and as epochs for tuner.search / model.fit.
epochs = 10
# Batch size for tuner.search and model.fit.
batch_size = 128
# Fraction of the shuffled stock list used for training; the remainder is validation.
thresh = 0.7

In [75]:
def collapse_columns(df):
    """Return a copy of ``df`` whose MultiIndex columns are flattened to strings.

    Each column tuple is joined with a double underscore, e.g.
    ``("Close", "AAPL")`` becomes ``"Close__AAPL"``. A frame whose columns are
    not a ``pd.MultiIndex`` is returned as an unchanged copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to flatten. It is never mutated.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with single-level column names.
    """
    df = df.copy()
    if isinstance(df.columns, pd.MultiIndex):
        # Cast every level to str first: str.join raises TypeError on
        # non-string level values (ints, Timestamps, ...).
        df.columns = ["__".join(map(str, levels)) for levels in df.columns]
    return df

In [76]:
def set_verbosity():
    """Reduce log noise from absl and TensorFlow.

    absl logging is limited to ERROR and above; TensorFlow's v1-compat logger
    is set to numeric level 30 (the value of the standard ``logging.WARNING``).
    """
    absl_level = absl.logging.ERROR
    tf_level = 30
    absl.logging.set_verbosity(absl_level)
    tf.compat.v1.logging.set_verbosity(tf_level)

In [77]:
# Download the daily AAPL price history for the configured look-back window.
end = date.today()
# Use the shared `timeframe` config constant instead of repeating the literal 9000.
start = end - timedelta(days=timeframe)
yf.pdr_override()

data = yf.download('AAPL', start, end)
data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-07-06,0.263393,0.271205,0.260045,0.271205,0.230828,270950400
1998-07-07,0.271205,0.275670,0.267857,0.272321,0.231777,241472000
1998-07-08,0.274554,0.294085,0.273996,0.290737,0.247452,932814400
1998-07-09,0.294085,0.300223,0.280692,0.282924,0.240802,566608000
1998-07-10,0.287388,0.291295,0.283482,0.286272,0.243651,302523200
...,...,...,...,...,...,...
2023-02-14,152.119995,153.770004,150.860001,153.199997,153.199997,61707600
2023-02-15,153.110001,155.500000,152.880005,155.330002,155.330002,65669300
2023-02-16,153.509995,156.330002,153.350006,153.710007,153.710007,68167900
2023-02-17,152.350006,153.000000,150.850006,152.550003,152.550003,59095900


In [78]:
def load_frame(days, stock):
    """Download and clean the daily price history for ``stock``.

    The download covers the last ``days`` calendar days up to today. The frame
    is resampled to a daily calendar, its columns are flattened with
    ``collapse_columns``, and every row containing a NaN is dropped.

    Parameters
    ----------
    days : number
        Look-back window in calendar days, counted back from today.
    stock : str
        Ticker passed straight to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        Price frame without any NaN values.
    """
    today = date.today()
    window_start = today - timedelta(days=days)
    yf.pdr_override()

    raw = yf.download(stock, window_start, today)

    # ALWAYS resample before shifting so we don't get the wrong shift amount if there are missing rows/timestamps
    daily = collapse_columns(raw.resample('D').first())
    cleaned = daily.dropna(how='any', axis='rows')

    assert not cleaned.isna().any().any()  # Make sure there are no NaNs left

    return cleaned

In [79]:
# There's a lot of multicollinearity in this data. Ideally, we should remove collinear features, as they will
# skew results.
# After calling this function, simply remove these correlated columns from the dataset (better to not have any of them).
# PCA is another option for removing it.

def remove_correlation(data, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``data`` is computed with
    ``DataFrame.corr``. A column is reported when the absolute correlation
    between it and any column positioned before it exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        Absolute correlation above which a column counts as correlated.

    Returns
    -------
    set
        Column names to drop; the first column of each correlated group is kept.
    """
    correlation_matrix = data.corr()
    columns = correlation_matrix.columns
    correlated_cols = set()
    for i in range(len(columns)):
        for j in range(i):
            # Positional access is required here: correlation_matrix[i, j] would be
            # treated as a column-label lookup on the DataFrame and raise KeyError.
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                correlated_cols.add(columns[i])

    return correlated_cols

def remove_correlations_PCA(X, n_components=1):
    """Standardise ``X`` and project it onto its leading principal components.

    A full PCA is fitted first so that the explained-variance ratios can be
    printed; use them to decide how many components are worth keeping.

    Parameters
    ----------
    X : array-like
        Feature matrix accepted by ``StandardScaler.fit_transform``.
    n_components : int, default 1
        Number of principal components to keep in the returned projection.

    Returns
    -------
    numpy.ndarray
        The standardised data projected onto ``n_components`` components.
    """
    X_std = StandardScaler().fit_transform(X)

    # Keep the fitted estimator: fit_transform would return only the transformed
    # array, which has no explained_variance_ratio_ attribute.
    full_pca = PCA().fit(X_std)

    # Use these two indicators to see which components have the most effect on the system
    print(np.cumsum(full_pca.explained_variance_ratio_))
    print(full_pca.explained_variance_ratio_)

    # Set n_components to the number of useful components observed above
    return PCA(n_components=n_components).fit_transform(X_std)

In [80]:
# This class is the first step of the preprocessing pipeline, and is used to remove columns that are unnecessary
class FeatureDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that drops the 'Volume' and 'Adj Close' columns.

    When the module-level ``enable_pca`` flag is truthy the remaining columns
    are additionally passed through ``remove_correlations_PCA``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` without the unused columns; the input is not modified."""
        # Non-inplace drop: the caller's frame must stay intact so re-running a
        # cell on the same frame gives the same result.
        X = X.drop(columns=['Volume', 'Adj Close'], errors='ignore')
        if enable_pca:
            X = remove_correlations_PCA(X)
        return X

In [81]:
# This class is the final part of the preprocessing pipeline, and is used to scale the price columns
class FeatureScaler(BaseEstimator, TransformerMixin):
    """Pipeline step that scales 'Open', 'High', 'Low' and 'Close' independently.

    The module-level ``standard_scaling`` flag selects ``StandardScaler``
    (truthy) or ``MinMaxScaler`` with range (0, 1) (falsy).
    """

    def fit(self, X, y=None):
        """Nothing is learned here; the scalers are fitted inside ``transform``."""
        return self

    def transform(self, X):
        """Scale the four price columns on a copy of ``X``.

        Returns
        -------
        tuple
            ``(scaled_frame, open_scaler, high_scaler, low_scaler, close_scaler)``.
            The fitted scalers are returned so predictions can be mapped back
            with ``inverse_transform``.
        """
        def new_scaler():
            # StandardScaler accepts no feature_range argument (only MinMaxScaler
            # does); passing one raises TypeError.
            if standard_scaling:
                return StandardScaler()
            return MinMaxScaler(feature_range=(0, 1))

        X = X.copy()  # leave the caller's frame untouched
        scalers = []
        for column in ('Open', 'High', 'Low', 'Close'):
            scaler = new_scaler()
            X[column] = scaler.fit_transform(X[[column]])
            scalers.append(scaler)

        # Local names avoid shadowing the builtin `open`.
        open_scaler, high_scaler, low_scaler, close_scaler = scalers
        return X, open_scaler, high_scaler, low_scaler, close_scaler

In [82]:
def create_dataset_tpot(data, future_window, win_size):
    """Build flattened sliding-window samples for a tabular regressor.

    Each sample concatenates ``win_size`` consecutive rows of ``data`` into one
    flat feature vector. The label of the window starting at row ``i`` is
    column index 3 of row ``i + win_size + future_window``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y, future_X)``: the labelled windows, their labels, and the
        windows for every start row in ``range(len(data) - win_size)``
        (including those that have no label yet).
    """
    values = data.to_numpy()

    def flat_window(start):
        # Join the win_size rows beginning at `start` into a single flat list.
        rows = list(values[start:start + win_size])
        return list(np.concatenate(rows).flat)

    offset = win_size + future_window
    n_labelled = len(values) - offset

    samples = [flat_window(start) for start in range(n_labelled)]
    targets = [values[start + offset][3] for start in range(n_labelled)]
    all_windows = [flat_window(start) for start in range(len(values) - win_size)]

    return np.array(samples), np.array(targets), np.array(all_windows)

In [83]:
def create_dataset(data, future_window, win_size):
    """Build sliding-window samples that keep the (rows, features) layout.

    Each sample is a block of ``win_size`` consecutive rows of ``data``. The
    label of the window starting at row ``i`` is the complete row
    ``i + win_size + future_window``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y, future_X)``: the labelled windows, their label rows, and the
        windows for every start row in ``range(len(data) - win_size)``
        (including those that have no label yet).
    """
    values = data.to_numpy()
    offset = win_size + future_window
    n_labelled = len(values) - offset

    samples = [list(values[start:start + win_size]) for start in range(n_labelled)]
    targets = [values[start + offset] for start in range(n_labelled)]
    all_windows = [list(values[start:start + win_size])
                   for start in range(len(values) - win_size)]

    return np.array(samples), np.array(targets), np.array(all_windows)

In [84]:
def kt_model(hp, input_shape=None):
    """Build and compile the tunable stacked-LSTM regressor for KerasTuner.

    Architecture: two LSTM layers (each followed by dropout), a flatten, up to
    five shrinking dense layers, and a 4-unit linear output.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Hyperparameter container supplied by the tuner.
    input_shape : tuple, optional
        ``(timesteps, features)`` of one sample. Defaults to
        ``(win_size, 4)``: the configured window length and the four price
        columns that the 4-unit output layer also predicts.

    Returns
    -------
    keras.Model
        Compiled model (MSE loss, Adam optimiser).
    """
    if input_shape is None:
        # Previously the shape was read from a module-level `X`, which no cell
        # defines at module scope, so building failed on a fresh kernel.
        input_shape = (win_size, 4)

    hp_activation = hp.Choice('activation', values=['relu', 'tanh'])
    hp_learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    hp_reg = hp.Float("reg", min_value=1e-4, max_value=1e-2, sampling="log")
    hp_dropout = hp.Float("dropout", min_value=1e-3, max_value=0.5, sampling="linear")
    hp_neuron_pct = hp.Float('NeuronPct', min_value=1e-3, max_value=1.0, sampling='linear')
    hp_neuron_shrink = hp.Float('NeuronShrink', min_value=1e-3, max_value=1.0, sampling='linear')

    hp_l_layer_1 = hp.Int('l_layer_1', min_value=1, max_value=100, step=10)
    hp_l_layer_2 = hp.Int('l_layer_2', min_value=1, max_value=100, step=10)
    hp_max_neurons = hp.Int('neurons', min_value=10, max_value=200, step=10)

    # Width of the first dense layer, as a fraction of the tuned maximum.
    neuron_count = int(hp_neuron_pct * hp_max_neurons)
    layers = 0

    model = Sequential()
    model.add(InputLayer(input_shape))
    model.add(LSTM(hp_l_layer_1, return_sequences=True, activity_regularizer=regularizers.l1(hp_reg)))
    model.add(Dropout(hp_dropout))
    model.add(LSTM(hp_l_layer_2, return_sequences=True, activity_regularizer=regularizers.l1(hp_reg)))
    model.add(Dropout(hp_dropout))
    model.add(Flatten())

    # Add dense layers that shrink by hp_neuron_shrink each time, stopping once a
    # layer would have 5 or fewer units or five layers have been added.
    while neuron_count > 5 and layers < 5:

        model.add(Dense(units=neuron_count, activation=hp_activation))
        model.add(Dropout(hp_dropout))
        layers += 1
        neuron_count = int(neuron_count * hp_neuron_shrink)

    model.add(Dense(4, 'linear'))

    model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=hp_learning_rate),
                metrics=['mean_squared_error', 'mean_absolute_error', 'mean_absolute_percentage_error'])

    return model
    


In [85]:
def train_model(stocks, future_window):
    """Tune, train and save the LSTM model on windows built from several stocks.

    The ticker list is shuffled and split by the module-level ``thresh``: the
    first part feeds training, the rest validation. Hyperparameters are
    searched with Hyperband, the best configuration is rebuilt and fitted on
    the full training split, and the model is saved under
    ``models/model_<timestamp>``.

    Parameters
    ----------
    stocks : sequence of str
        Tickers to download. The caller's sequence is not modified.
    future_window : int
        Label offset forwarded to ``create_dataset``.

    Raises
    ------
    ValueError
        If ``stocks`` is empty or the split leaves no validation stock.
    """
    set_verbosity()
    pipe = Pipeline([('Dropper', FeatureDropper()), ('Scaler', FeatureScaler())])

    # Shuffle a copy so the caller's list keeps its order.
    stocks = list(stocks)
    if not stocks:
        raise ValueError("stocks must contain at least one ticker")
    random.shuffle(stocks)

    split = int(len(stocks) * thresh)
    train_stocks = stocks[:max(split, 1)]  # always keep at least one training stock
    val_stocks = stocks[split:]
    if not val_stocks:
        raise ValueError("thresh leaves no stock for validation")

    windows = {}
    for stock in stocks:
        frame = load_frame(timeframe, stock)
        # The pipeline also returns the fitted scalers; they are not needed for training.
        frame, *_ = pipe.fit_transform(frame)
        X, y, _ = create_dataset(frame, future_window, win_size)
        windows[stock] = (X, y)

    X_train = np.concatenate([windows[s][0] for s in train_stocks], axis=0)
    y_train = np.concatenate([windows[s][1] for s in train_stocks], axis=0)
    # Each validation stock is included exactly once (the first one used to be
    # concatenated twice).
    X_val = np.concatenate([windows[s][0] for s in val_stocks], axis=0)
    y_val = np.concatenate([windows[s][1] for s in val_stocks], axis=0)

    # Filesystem-friendly timestamp: whitespace, ':', '-' and '.' become '_'.
    timestamp = re.sub(r"[\s:.\-]", "_", str(datetime.datetime.now()))

    tuner = kt.Hyperband(kt_model, objective='mean_squared_error', max_epochs=epochs, factor=3,
                         directory='models/kt_dir', project_name='kt_model_' + timestamp, overwrite=True)

    monitor = EarlyStopping(monitor='loss', min_delta=1e-5, patience=5, verbose=0, mode='auto',
                            restore_best_weights=True)

    # NOTE(review): the search runs on the first shuffled stock only, as before —
    # presumably to keep tuning cheap; confirm this is intended.
    search_X, search_y = windows[stocks[0]]
    tuner.search(search_X, search_y, verbose=1, epochs=epochs, batch_size=batch_size, callbacks=[monitor])

    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    model = tuner.hypermodel.build(best_hps)
    model.fit(X_train, y_train, verbose=1, epochs=epochs, validation_data=(X_val, y_val),
              callbacks=[monitor], batch_size=batch_size)
    model.save('models/model_' + timestamp)

In [86]:
def train_model_tpot(stocks, future_window):
    """Search a TPOT regression pipeline on flattened windows from several stocks.

    The ticker list is shuffled and split by the module-level ``thresh`` into
    training and validation stocks. TPOT is fitted on the training split, its
    score on the validation split is printed, and the best pipeline is
    exported to ``models/tpot_model_<timestamp>.py``.

    Parameters
    ----------
    stocks : sequence of str
        Tickers to download. The caller's sequence is not modified.
    future_window : int
        Label offset forwarded to ``create_dataset_tpot``.

    Raises
    ------
    ValueError
        If ``stocks`` is empty or the split leaves no validation stock.
    """
    set_verbosity()
    pipe = Pipeline([('Dropper', FeatureDropper()), ('Scaler', FeatureScaler())])

    # Shuffle a copy so the caller's list keeps its order.
    stocks = list(stocks)
    if not stocks:
        raise ValueError("stocks must contain at least one ticker")
    random.shuffle(stocks)

    split = int(len(stocks) * thresh)
    train_stocks = stocks[:max(split, 1)]  # always keep at least one training stock
    val_stocks = stocks[split:]
    if not val_stocks:
        raise ValueError("thresh leaves no stock for validation")

    windows = {}
    for stock in stocks:
        frame = load_frame(timeframe, stock)
        # The pipeline also returns the fitted scalers; they are not needed for training.
        frame, *_ = pipe.fit_transform(frame)
        X, y, _ = create_dataset_tpot(frame, future_window, win_size)
        windows[stock] = (X, y)

    X_train = np.concatenate([windows[s][0] for s in train_stocks], axis=0)
    y_train = np.concatenate([windows[s][1] for s in train_stocks], axis=0)
    # Each validation stock is included exactly once (the first one used to be
    # concatenated twice).
    X_val = np.concatenate([windows[s][0] for s in val_stocks], axis=0)
    y_val = np.concatenate([windows[s][1] for s in val_stocks], axis=0)

    # Filesystem-friendly timestamp: whitespace, ':', '-' and '.' become '_'.
    timestamp = re.sub(r"[\s:.\-]", "_", str(datetime.datetime.now()))

    teapot = TPOTRegressor(generations=5, population_size=20, cv=5, verbosity=2)
    # Fit on the whole training split; previously the split was built but TPOT
    # only ever saw the first stock.
    teapot.fit(X_train, y_train)
    print(teapot.score(X_val, y_val))

    os.makedirs('models', exist_ok=True)  # export fails if the directory is missing
    teapot.export('models/tpot_model_' + timestamp + '.py')

In [14]:
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: module 'inspect' has no attribute 'getargspec'".
# The commented-out shim below aliases the missing name to inspect.getfullargspec;
# presumably it was tried as a workaround — confirm whether it is still needed or
# whether upgrading the dependency that calls getargspec is the better fix.
# import inspect

# if not hasattr(inspect, 'getargspec'):
#     inspect.getargspec = inspect.getfullargspec

# Tickers to train on; train_model is called with a future_window of 10.
stocks = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSLA', 'NVDA', 'XOM', 'META', 'JNJ', 'JPM']
train_model(stocks, 10)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


AttributeError: module 'inspect' has no attribute 'getargspec'

In [31]:
# Load the three saved models and index them by the name used as `period` later on.
models = {
    'model_7': load_model('models/model_7'),
    'model_30': load_model('models/model_30'),
    'model_90': load_model('models/model_90'),
}
model_7 = models['model_7']
model_30 = models['model_30']
model_90 = models['model_90']

# Look-back window (calendar days) for the download in the following cell.
days = 9000

In [32]:
# Download every ticker, scale it, and cache the model inputs plus the fitted
# scalers needed to map predictions back to prices.
stocks = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSLA', 'NVDA', 'XOM', 'META', 'JNJ', 'JPM']
pipe = Pipeline([('Dropper', FeatureDropper()), ('Scaler', FeatureScaler())])
cache = {}
inverters = {}
end = date.today()
start = end - timedelta(days=days)
yf.pdr_override()

for stock in stocks:

    data = yf.download(stock, start, end)
    data = data.resample('D').first()
    data = data.dropna(how='any', axis='rows')
    # *_scaler names keep the builtin `open` usable in the rest of the notebook.
    frame, open_scaler, high_scaler, low_scaler, close_scaler = pipe.fit_transform(data)
    inverters[stock] = [open_scaler, high_scaler, low_scaler, close_scaler]

    cache[stock] = {'data': data}
    for horizon in (7, 30, 90):
        # Only the unlabelled windows (third return value) are needed for inference.
        # win_size matches the window length used when the models were trained.
        cache[stock]['model_' + str(horizon)] = create_dataset(frame, horizon, win_size)[2]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [72]:
def prediction(pred, period, stock=None):
    """Predict one price column for a stock and pair the values with their dates.

    Parameters
    ----------
    pred : int
        Index of the output column to return; also selects the matching
        scaler from ``inverters[stock]`` (stored as open, high, low, close).
    period : str
        One of ``'model_7'``, ``'model_30'`` or ``'model_90'``.
    stock : str, optional
        Ticker to predict. When omitted, the module-level ``stock`` variable
        is used, which is what earlier revisions relied on implicitly.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Days'`` (timestamps) and ``'Value'`` (un-scaled predictions).

    Raises
    ------
    ValueError
        If ``period`` is unknown or no stock can be determined.
    """
    horizons = {'model_7': 7, 'model_30': 30, 'model_90': 90}
    if period not in horizons:
        raise ValueError("unknown period %r; expected one of %s" % (period, sorted(horizons)))
    window = horizons[period]

    if stock is None:
        # Backward compatibility: fall back to the loop variable left behind by
        # the cell that filled `cache`.
        stock = globals().get('stock')
        if stock is None:
            raise ValueError("no stock given and no module-level `stock` is defined")

    inputs = cache[stock][period]
    preds = models[period].predict(inputs)
    outputs = inverters[stock][pred].inverse_transform(preds[:, pred].reshape(-1, 1))

    dates = list(cache[stock]['data'].index)
    last = dates[-1]
    # Extend the calendar by `window` days so the newest predictions get a date.
    dates.extend(last + pd.DateOffset(days=i + 1) for i in range(window))
    # Window i is labelled at row i + window length + horizon; the window length
    # is read from the cached input instead of hard-coding 5.
    dates = dates[window + inputs.shape[1]:]

    return pd.DataFrame(data={'Days': pd.Series(dates), 'Value': pd.Series(outputs.flatten())})




SyntaxError: 'return' outside function (14864263.py, line 20)

In [92]:
set_verbosity()

# The former `pipe = ...`, `random.shuffle(stocks)` and `cache = {}` lines were
# leftovers copied from train_model; resetting `cache` here wiped the inputs
# that prediction() reads.
frame = load_frame(timeframe, 'AAPL')

# Directory holding the exported prediction CSVs. Set the STOCK_CSV_DIR
# environment variable to override this machine-specific default.
csv_dir = os.environ.get("STOCK_CSV_DIR", r"C:\Users\gauld\Documents\4th year\Stock Predic\stocks_csvs")
pred = pd.read_csv(os.path.join(csv_dir, "model_7_AAPL.csv"))


[*********************100%***********************]  1 of 1 completed


In [98]:
# Inspect the predictions read from the CSV (columns: Dates, Open, High, Low, Close).
pred

Unnamed: 0,Dates,Open,High,Low,Close
0,1998-07-22,1.061040,0.621303,1.101952,0.858052
1,1998-07-23,1.065924,0.626505,1.106852,0.863169
2,1998-07-24,1.074071,0.634894,1.114999,0.871569
3,1998-07-27,1.071602,0.631999,1.112360,0.868777
4,1998-07-28,1.080270,0.640742,1.120905,0.877283
...,...,...,...,...,...
6189,2023-02-24,148.953160,149.333560,146.973900,148.287020
6190,2023-02-25,147.124600,147.406450,145.119810,146.410140
6191,2023-02-26,147.456450,147.681370,145.412610,146.655150
6192,2023-02-27,148.930250,149.150860,146.878620,148.086070


In [117]:
# Compare actual and predicted AAPL opening prices over the last `partition` rows.
# NOTE: the recorded run could not render the figure because nbformat>=4.2.0 was
# not installed in that environment.
partition = 70

fig = go.Figure(go.Scatter(x=frame.index[-partition:], y=frame['Open'].iloc[-partition:],
                           line_color='red', name='Actual'))
fig.add_trace(go.Scatter(x=pred['Dates'].iloc[-partition:], y=pred['Open'].iloc[-partition:],
                         line_color='cyan', name='predicted'))
# Real labels replace the "Plot Title" / "X Axis Title" / "Y Axis Title" placeholders.
fig.update_layout(template='plotly_dark', plot_bgcolor='rgba(0,0,0,0)',
    title="AAPL open price: actual vs. predicted",
    xaxis_title="Date", yaxis_title="Open price", paper_bgcolor='rgba(0,0,0,0)')
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [122]:
# Candlestick chart of the downloaded prices with the predicted close drawn on top.
fig = go.Figure(data=[
    go.Candlestick(
        x=frame.index,
        open=frame['Open'],
        high=frame['High'],
        low=frame['Low'],
        close=frame['Close'],
    ),
    go.Scatter(x=pred['Dates'], y=pred['Close'], line_color='cyan', name='predicted'),
])
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed