In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt

from datetime import date, datetime, timedelta

In [4]:
df_stock_ta = pd.read_csv('temp/stock_ta.csv')
df_stock_cdl = pd.read_csv('temp/stock_cdl.csv')
df_ind_ta = pd.read_csv('temp/ind_ta.csv')
df_ind_cdl = pd.read_csv('temp/ind_cdl.csv')
df_stock_news = pd.read_csv('temp/stock_news.csv')
df_ind_news = pd.read_csv('temp/ind_news.csv')

df_tar = pd.read_csv('temp/target.csv')
df_tar = df_tar[['report_date', 'ticker', 'label']]
# df_stock_ta = df_stock_ta.drop(['open', 'high', 'low', 'close', 'volume', 'adjusted_close'], axis = 1)
# df_ind_ta = df_ind_ta.drop(['open', 'high', 'low', 'close', 'volume', 'adjusted_close'], axis = 1)

# Prepare Cross Industrial Feature
df_inds = df_ind_cdl.merge(df_ind_news, on = ['inds', 'report_date'], how = 'left')
df_inds = df_inds.merge(df_ind_ta, on = ['report_date', 'inds'])
df_inds = df_inds.fillna(method = 'ffill')

# LSTM prediction

In [5]:
period = 7; train_size = 0.8; prepare_3d = False
require_features = ['cdl', 'ta', 'news']
inds_num = len(df_inds['inds'].unique())

price_cols = ['open', 'high', 'low', 'close', 'volume']
cdl_cols = ['morning_star', 'evening_star', 'hammer',
       'inverted_hammer', 'bullish_engulfing', 'bearish_engulfing',
       'shooting_star', 'hanging_man']
news_cols = ['compound', 'neg', 'neu', 'pos', 'new_cases',
       'total_cases', 'total_deaths', 'new_deaths', 'total_deaths_nd']
ta_cols = df_stock_ta.drop(['report_date', 'ticker'], axis = 1).columns

In [None]:
# Prepare Train test dataset preparation
X_train, y_train, X_test, y_test = [], [], [], []

stocks = df_tar['ticker'].unique()
for stock in stocks:
    
    print('Current Stock: ', stock)
    temp_stock_cdl = df_stock_cdl[df_stock_cdl['ticker'] == stock]
    temp_stock_ta = df_stock_ta[df_stock_ta['ticker'] == stock]
    temp_stock_news = df_stock_news[df_stock_news['ticker'] == stock]
    temp_stock_tar = df_tar[df_tar['ticker'] == stock]
    
    # Merge results
    data = temp_stock_tar.merge(temp_stock_cdl, on = ['report_date', 'ticker'])
    data = data.merge(temp_stock_ta, on = ['report_date', 'ticker'])
    data = data.merge(temp_stock_news, on = ['report_date', 'ticker'], how = 'left')
    data = data.fillna(method = 'ffill')
    
    check = True
    for i in range(period, len(data)):
        
        temp = []
        # Create historical data (within period)
        temp_period = data.iloc[i - period:i,]
        if temp_period.isnull().sum().sum() > 0:
            continue

        # Create stock feature space
        feature_dfs = []
        feature_dfs.append(temp_period[price_cols])
        
        if 'cdl' in require_features:
            feature_dfs.append(temp_period[cdl_cols])
            
        if 'news' in require_features:
            feature_dfs.append(temp_period[news_cols])
            
        if 'ta' in require_features:
            feature_dfs.append(temp_period[ta_cols])
            
        temp_stock_feature = pd.concat(feature_dfs, axis = 1).to_numpy()
        temp.append(temp_stock_feature)
        
        # Get industrial feature space
        for ind in sorted(list(df_inds['inds'].unique())):
            temp_period_inds = df_inds[df_inds['inds'] == ind].reset_index(drop = True).iloc[i - period:i,]
            
            feature_ind_dfs = []
            feature_ind_dfs.append(temp_period_inds[price_cols])
            if 'cdl' in require_features:
                feature_ind_dfs.append(temp_period_inds[cdl_cols])

            if 'news' in require_features:
                feature_ind_dfs.append(temp_period_inds[news_cols])

            if 'ta' in require_features:
                feature_ind_dfs.append(temp_period_inds[ta_cols])
                
            temp_ind_feature = pd.concat(feature_ind_dfs, axis = 1).to_numpy()
            temp.append(temp_ind_feature)
        
        # Create label 
        y_label = temp_period.iloc[-1]['label']
        
        if i <= int((len(data) - period) * train_size):
            
#             if check:
#                 print('train', i)
#                 print('max report_date: ', max(temp_period['report_date']), 'min report_date: ', min(temp_period['report_date']))
#                 print('Whole feature shape: ', np.array(temp).shape)
#                 print('label: ', y_label)
#                 check = False

            X_train.append(temp)
            y_train.append(y_label)
        else: 
            X_test.append(temp)
            y_test.append(y_label)
        
X_train, y_train = np.array(X_train).astype('float32'), np.array(y_train)
X_test, y_test = np.array(X_test).astype('float32'), np.array(y_test)

Current Stock:  MMM
Current Stock:  AOS
Current Stock:  ABT
Current Stock:  ABBV
Current Stock:  ABMD
Current Stock:  ACN
Current Stock:  ATVI
Current Stock:  ADM
Current Stock:  ADBE
Current Stock:  ADP
Current Stock:  AAP
Current Stock:  AES
Current Stock:  AFL
Current Stock:  A
Current Stock:  APD
Current Stock:  AKAM
Current Stock:  ALK
Current Stock:  ALB
Current Stock:  ARE
Current Stock:  ALGN
Current Stock:  ALLE
Current Stock:  LNT
Current Stock:  ALL
Current Stock:  GOOGL
Current Stock:  GOOG
Current Stock:  MO
Current Stock:  AMZN
Current Stock:  AMCR
Current Stock:  AMD
Current Stock:  AEE
Current Stock:  AAL
Current Stock:  AEP
Current Stock:  AXP
Current Stock:  AIG
Current Stock:  AMT
Current Stock:  AWK
Current Stock:  AMP
Current Stock:  ABC
Current Stock:  AME
Current Stock:  AMGN
Current Stock:  APH
Current Stock:  ADI
Current Stock:  ANSS
Current Stock:  AON
Current Stock:  APA
Current Stock:  AAPL
Current Stock:  AMAT
Current Stock:  APTV
Current Stock:  ANET
Curre

Current Stock:  RCL
Current Stock:  SPGI
Current Stock:  CRM
Current Stock:  SBAC
Current Stock:  SLB
Current Stock:  STX
Current Stock:  SEE
Current Stock:  SRE
Current Stock:  NOW
Current Stock:  SHW
Current Stock:  SBNY
Current Stock:  SPG
Current Stock:  SWKS
Current Stock:  SJM
Current Stock:  SNA
Current Stock:  SEDG
Current Stock:  SO
Current Stock:  LUV
Current Stock:  SWK
Current Stock:  SBUX
Current Stock:  STT
Current Stock:  STE
Current Stock:  SYK
Current Stock:  SIVB
Current Stock:  SYF
Current Stock:  SNPS
Current Stock:  SYY
Current Stock:  TMUS
Current Stock:  TROW
Current Stock:  TTWO
Current Stock:  TPR
Current Stock:  TGT


In [None]:
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

In [None]:
# Divide to multiple combinations
cond_dict = {}
feature_idx_dict = {name: ind for ind, name in enumerate(pd.concat(feature_dfs, axis = 1).columns)}
adj_news_cols = ['compound', 'pos', 'new_deaths', 'new_deaths_nd']

# Candlestick + TA + News
feature_idx = [idx for name, idx in feature_idx_dict.items() \
                   if name in price_cols + cdl_cols + adj_news_cols + list(ta_cols)]
cond_dict['All'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick + News
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols + adj_news_cols]
cond_dict['Candlestick+News'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick + TA
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols + list(ta_cols)]
cond_dict['Candlestick+TA'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick Only
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols]
cond_dict['CandlestickOnly'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

## LSTM

In [27]:
def lstm(input_shape):
    
    # LSTM model
    model = models.Sequential()
    model.add(layers.LSTM(50, activation = 'relu', input_shape = input_shape))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [28]:
for feature_type, data in cond_dict.items():

    print(f'---- {feature_type} ----')
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], period, -1)
        temp_test = temp_test.reshape(temp_test.shape[0], period, -1)

    model = lstm((temp_train.shape[1], temp_train.shape[2]))
    history = model.fit(temp_train, y_train, epochs=4, validation_split=0.2)

    print('Training Accuracy')
    print(model.evaluate(temp_train, y_train))

    print('Testing Accuracy')
    print(model.evaluate(temp_test, y_test))

---- All ----
Epoch 1/4


2022-11-07 14:59:47.519982: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-07 15:02:25.570370: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.6084175705909729, 0.6754153966903687]
Testing Accuracy
[1.2247940301895142, 0.4569525718688965]
---- Candlestick+News ----
Epoch 1/4


2022-11-07 15:21:51.576969: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-07 15:27:54.683929: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.6149066686630249, 0.6621825695037842]
Testing Accuracy
[1.127211332321167, 0.3932639956474304]
---- Candlestick+TA ----
Epoch 1/4


2022-11-07 15:49:01.520396: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-07 15:53:07.365478: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.568978488445282, 0.7044143080711365]
Testing Accuracy
[0.9519954323768616, 0.4995999038219452]
---- CandlestickOnly ----
Epoch 1/4


2022-11-07 16:12:05.435772: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-07 16:16:44.303175: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.5042543411254883, 0.7499565482139587]
Testing Accuracy
[0.974292516708374, 0.5430710315704346]


## Simple RNN

In [2]:
# Simple RNN
def simpleRNN(input_shape):
    
    # LSTM model
    model = models.Sequential()
    model.add(layers.SimpleRNN(50, input_shape = input_shape, activation = 'relu'))

    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = simpleRNN((temp_train.shape[1], temp_train.shape[2]))
model.summary()

NameError: name 'temp_train' is not defined

In [30]:
# Simple RNN
def simpleRNN(input_shape):
    
    # LSTM model
    model = models.Sequential()
    model.add(layers.SimpleRNN(50, input_shape = input_shape, activation = 'relu'))

    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

prepare_3d = False
for feature_type, data in cond_dict.items():

    print(f'---- {feature_type} ----')
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], period, -1)
        temp_test = temp_test.reshape(temp_test.shape[0], period, -1)

    model = simpleRNN((temp_train.shape[1], temp_train.shape[2]))
    history = model.fit(temp_train, y_train, epochs=5)

    print('Training Accuracy')
    print(model.evaluate(temp_train, y_train))

    print('Testing Accuracy')
    print(model.evaluate(temp_test, y_test))