In [1]:
pip install yfinance

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import yfinance as yf
import numpy as np
from functools import reduce

## Stock Feature Selection: Adj Close, Volume, RSI, ATR

In [6]:
tickers = pd.read_csv('tickers.csv')
tickers.sort_values(by='Market Cap', ascending=False, inplace=True)
top18_tickers = tickers[:20]
top18_tickers.reset_index(inplace=True)
top18_tickers = top18_tickers.drop([7,8])
stocks = top18_tickers['Symbol'].to_list()
start_date = '2013-01-01'
end_date = '2019-01-01'
data = yf.download("AAPL", start=start_date, end=end_date)
data = data.reset_index()
dates = data['Date']

[*********************100%%**********************]  1 of 1 completed


In [97]:
def get_log_returns(list_of_stocks_tickers, start_date, end_date, interval='1d'):
    stocks = list()
    for ticker in list_of_stocks_tickers:
        data = yf.download(ticker, start='2000-01-01', end=end_date, interval=interval)
        data = {ticker: np.log(data['Adj Close']) - np.log(data['Adj Close'].shift(1))}
        log_return = pd.DataFrame(data=data)
        stocks.append(log_return)
        break
    all_stocks = reduce(lambda df1, df2: pd.merge(df1, df2, on='Date'), stocks)
    all_stocks['Date'] = log_return.index
    return all_stocks

In [98]:
returns = get_log_returns(stocks, start_date, end_date)

[*********************100%%**********************]  1 of 1 completed


In [38]:
def calculate_rsi(prices, n=14):
    deltas = np.diff(prices)
    seed = deltas[:n+1]
    up = seed[seed >= 0].sum()/n
    down = -seed[seed < 0].sum()/n
    rs = up/down
    rsi = np.zeros_like(prices)
    rsi[:n] = 100. - 100./(1.+rs)

    for i in range(n, len(prices)):
        delta = deltas[i-1]  # The diff is 1 shorter

        if delta > 0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up*(n-1) + upval)/n
        down = (down*(n-1) + downval)/n

        rs = up/down
        rsi[i] = 100. - 100./(1.+rs)

    return rsi

In [53]:
def calculate_rsi(data, period=14):
    # Calculate price differences
    price_diff = np.diff(data)
    
    # Calculate initial seed values
    seed = price_diff[:period + 1]
    positive_seed = seed[seed >= 0].sum()
    negative_seed = -seed[seed < 0].sum()
    
    # Calculate initial RS and RSI
    initial_rs = positive_seed / negative_seed
    initial_rsi = 100 - 100 / (1 + initial_rs)
    
    # Initialize up and down values
    up = positive_seed / period
    down = negative_seed / period
    
    # Initialize RSI array
    rsi = np.zeros_like(data)
    rsi[:period] = initial_rsi

    for i in range(period, len(data)):
        # Calculate delta for the current period
        delta = price_diff[i - 1]
        
        if delta > 0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta
        
        # Update up and down values
        up = (up * (period - 1) + upval) / period
        down = (down * (period - 1) + downval) / period
        
        # Calculate RS and RSI for the current period
        current_rs = up / down
        rsi[i] = 100 - 100 / (1 + current_rs)

    return rsi


In [54]:
def calculate_atr(data, period=14):
    data = np.array(data)
    
    high = data[:, 0]
    low = data[:, 1]
    close = data[:, 2]
    
    tr = np.maximum(high - low, np.abs(high - np.roll(close, 1)), np.abs(low - np.roll(close, 1)))
    
    atr = np.zeros_like(tr)
    atr[period] = np.mean(tr[:period])
    
    for i in range(period + 1, len(tr)):
        atr[i] = ((period - 1) * atr[i - 1] + tr[i]) / period
    
    return atr

## Model Param

In [55]:
params = {
    "BATCH_SIZE": 50,
    "EPOCHS": 10,
    "LR": 0.00010000,
    "TIME_STEPS": 60
    }

TIME_STEPS = params['TIME_STEPS']
BATCH_SIZE = params['BATCH_SIZE']


def build_timeseries(mat, y_col_index):
    
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]

    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))

    print("Length of inputs", dim_0)

    for i in range(dim_0):
        x[i] = mat[i:TIME_STEPS+i]
        y[i] = mat[TIME_STEPS+i, y_col_index]

    print("length of time-series - inputs", x.shape)
    print("length of time-series - outputs", y.shape)


    return x, y
     

## Model

In [82]:
pip install keras_tuner

Collecting keras_tuner
  Obtaining dependency information for keras_tuner from https://files.pythonhosted.org/packages/11/f0/099faf9285ec8ac5acb9296ce8c55bce2ad4c6af14b3830f7157fe69128d/keras_tuner-1.4.0-py3-none-any.whl.metadata
  Downloading keras_tuner-1.4.0-py3-none-any.whl.metadata (5.4 kB)
Collecting keras-core (from keras_tuner)
  Obtaining dependency information for keras-core from https://files.pythonhosted.org/packages/95/f7/b8dcff937ea64f822f0d3fe8c6010793406b82d14467cd0e9eecea458a40/keras_core-0.1.7-py3-none-any.whl.metadata
  Downloading keras_core-0.1.7-py3-none-any.whl.metadata (4.3 kB)
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Collecting rich (from keras-core->keras_tuner)
  Obtaining dependency information for rich from https://files.pythonhosted.org/packages/c1/d1/23ba6235ed82883bb416f57179d1db2c05f3fb8e5d83c18660f9ab6f09c9/rich-13.5.3-py3-none-any.whl.metadata
  Downloading rich-13.5.3-py3-none-any.whl.metadata (1

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.2.2 requires pyqt5<5.13, which is not installed.
spyder 5.2.2 requires pyqtwebengine<5.13, which is not installed.


In [132]:
from sklearn.preprocessing import MinMaxScaler
import keras
import tensorflow as tf
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Input, Activation,concatenate, Attention, Bidirectional,GlobalAveragePooling1D
from keras import optimizers
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import LeakyReLU
import matplotlib.pyplot as plt
import keras_tuner as kt
plt.style.use('fivethirtyeight')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [143]:
def model_builder(hyperparameters):
    lstm_model = Sequential()
    
    hyperparameters_units = hyperparameters.Int('units', min_value=50, max_value=100, step=TIME_STEPS)
    lstm_model.add(Bidirectional(LSTM(units=hyperparameters_units, 
                        input_shape=(x_t.shape[1], x_t.shape[2]), 
                        return_sequences=True,
                        kernel_initializer='he_normal')))
    lstm_model.add(GlobalAveragePooling1D())
    lstm_model.add(Dense(60,activation='relu'))
    lstm_model.add(Dense(20,activation='relu'))
    lstm_model.add(Dropout(0.05))
    lstm_model.add(Dense(1))
    
    hyperparameters_learning_rate = hyperparameters.Choice('learning_rate', values=[0.01, 0.05, 0.1])
    
    lstm_model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam(learning_rate=hyperparameters_learning_rate))
    
    return lstm_model

In [142]:
mse = {}
total_predicted_returns = pd.DataFrame()
for ticker in stocks:
    df = yf.download(ticker, start='2000-01-01', end=end_date, interval='1d')
    df['RSI'] = calculate_rsi(df['Adj Close'])
    df['ATR'] = calculate_atr(df[['High', 'Low', 'Close']])
    stock_returns = returns[[ticker]].reset_index()
    stock_returns['Date'] = stock_returns['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))
    df.reset_index(inplace=True)
    df_stock = df[['RSI', 'ATR', 'Date', 'Volume']]
    stock_returns.Date = pd.to_datetime(stock_returns.Date)
    df_stock.Date = pd.to_datetime(df_stock.Date)
    data = pd.merge(df_stock[['RSI', 'ATR', 'Date', 'Volume']],  stock_returns , on='Date')[1:].rename(columns={ticker : 'Returns'})
    print(data)
    # Split the data we try by years first
    data = data[data['Date'] < '2019-01-01']
    df_train = data[data['Date'] < '2018-01-01']
    predicted_period = data[(data['Date'] >= '2018-01-01') & (data['Date'] < '2019-01-01')][['Date']]
    print(len(data) - len(predicted_period) - 60)
    df_test = data[len(data) - len(predicted_period) - 60:]
    train_cols = ["RSI", "ATR", "Volume", "Returns"]
    x = df_train[train_cols].values
    print(x)
    
    #scaling
    min_max_scaler = MinMaxScaler(feature_range = (0, 1))
    x_train = min_max_scaler.fit_transform(x)
    x_test = min_max_scaler.transform(df_test[train_cols])
    
    x_t, y_t = build_timeseries(x_train, 1)
    print("Training Size", x_t.shape, y_t.shape)
    
    x_t_test, y_t_test = build_timeseries(x_test, 1)
    print("Test Size", x_t_test.shape, y_t_test.shape)
    
    x_left, x_val = train_test_split(x_t, test_size=0.2, shuffle=False)
    y_left, y_val = train_test_split(y_t, test_size=0.2, shuffle=False)
    
    tuner = kt.BayesianOptimization(
        model_builder,
        objective='val_loss',
        max_trials=5)
    tuner.search(x_t, y_t, epochs=5, validation_data=(x_val,y_val))
    lstm_model  = tuner.get_best_models()[0]  
    print(ticker,lstm_model.summary())
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    #Model Training
    history_lstm = lstm_model.fit(x_t, y_t, epochs=params["EPOCHS"], verbose=1, batch_size=BATCH_SIZE, callbacks=[callback],
                                shuffle=False)
    y_pred_lstm = lstm_model.predict(x_t_test, batch_size=BATCH_SIZE)
    print(y_pred_lstm)
    y_pred_lstm = y_pred_lstm.flatten()
    print(y_pred_lstm)
    error_lstm = mean_squared_error(y_t_test, y_pred_lstm)
    mse[ticker] = error_lstm
    print(ticker,error_lstm)
    
    y_pred_lstm_org = (y_pred_lstm * min_max_scaler.data_range_[1]) + min_max_scaler.data_min_[1]   #Inverse Transform 
    predicted_returns = pd.Series(y_pred_lstm_org).to_frame(ticker)
    predicted_period = predicted_period.reset_index().drop(columns=['index'])
    result = predicted_returns.join(predicted_period)
    if  total_predicted_returns.empty:
        total_predicted_returns= result 
    else: 
        total_predicted_returns = pd.merge(result, total_predicted_returns, on='Date')
    break

[*********************100%%**********************]  1 of 1 completed
            RSI       ATR       Date     Volume   Returns
1     50.204728  0.000000 2000-01-04  512377600 -0.088077
2     50.204728  0.000000 2000-01-05  778321600  0.014527
3     50.204728  0.000000 2000-01-06  767972800 -0.090514
4     50.204728  0.000000 2000-01-07  460734400  0.046281
5     50.204728  0.000000 2000-01-10  505064000 -0.017744
...         ...       ...        ...        ...       ...
4774  22.983531  1.395621 2018-12-24  148676800 -0.026215
4775  36.492534  1.483613 2018-12-26  234330000  0.068052
4776  35.824970  1.497283 2018-12-27  212468400 -0.006511
4777  35.923979  1.461227 2018-12-28  169165600  0.000512
4778  37.872295  1.412747 2018-12-31  140014000  0.009619

[4778 rows x 5 columns]
4467
[[ 5.02047281e+01  0.00000000e+00  5.12377600e+08 -8.80772597e-02]
 [ 5.02047281e+01  0.00000000e+00  7.78321600e+08  1.45269400e-02]
 [ 5.02047281e+01  0.00000000e+00  7.67972800e+08 -9.05139454e-02]
 ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stock.Date = pd.to_datetime(df_stock.Date)




ValueError: Received incompatible tensor with shape (20,) when attempting to restore variable with shape (1,) and name dense_1/bias:0.

In [135]:
y_pred_lstm

array([0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346475,
       0.18346475, 0.18346475, 0.18346475, 0.18346475, 0.18346

In [80]:
data

Unnamed: 0,RSI,ATR,Date,Volume,Returns
1,76.776775,0.000000,2013-01-03,9598300,-0.001413
2,76.776775,0.000000,2013-01-04,11631800,0.011385
3,76.776775,0.000000,2013-01-07,7548800,-0.002099
4,76.776775,0.000000,2013-01-08,9825300,0.000140
5,76.776775,0.000000,2013-01-09,7672800,0.004471
...,...,...,...,...,...
1505,22.357953,3.818837,2018-12-24,7531900,-0.041851
1506,32.109730,3.885349,2018-12-26,9253000,0.031018
1507,33.731023,3.917824,2018-12-27,9918700,0.005509
1508,33.558416,3.780837,2018-12-28,6537200,-0.001099


In [76]:
stock_returns

Unnamed: 0,Date,AAPL
0,2013-01-02,
1,2013-01-03,-0.012703
2,2013-01-04,-0.028249
3,2013-01-07,-0.005900
4,2013-01-08,0.002687
...,...,...
1505,2018-12-24,-0.026215
1506,2018-12-26,0.068052
1507,2018-12-27,-0.006511
1508,2018-12-28,0.000512


In [62]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,RSI,ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-01-02,19.779285,19.821428,19.343929,19.608213,16.791183,560518000,23.009925,0.000000
2013-01-03,19.567142,19.631071,19.321428,19.360714,16.579241,352965200,23.009925,0.000000
2013-01-04,19.177500,19.236786,18.779642,18.821428,16.117434,594333600,23.009925,0.000000
2013-01-07,18.642857,18.903570,18.400000,18.710714,16.022627,484156400,23.009925,0.000000
2013-01-08,18.900356,18.996071,18.616072,18.761070,16.065748,458707200,23.009925,0.000000
...,...,...,...,...,...,...,...,...
2018-12-24,37.037498,37.887501,36.647499,36.707500,35.278675,148676800,22.983636,1.395621
2018-12-26,37.075001,39.307499,36.680000,39.292500,37.763062,234330000,36.492623,1.483613
2018-12-27,38.959999,39.192501,37.517502,39.037498,37.517986,212468400,35.825061,1.497283
2018-12-28,39.375000,39.630001,38.637501,39.057499,37.537209,169165600,35.924069,1.461227
