In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt

from datetime import date, datetime, timedelta

In [2]:
df_stock_ta = pd.read_csv('temp/stock_ta.csv')
df_stock_cdl = pd.read_csv('temp/stock_cdl.csv')
df_ind_ta = pd.read_csv('temp/ind_ta.csv')
df_ind_cdl = pd.read_csv('temp/ind_cdl.csv')
df_stock_news = pd.read_csv('temp/stock_news.csv')
df_ind_news = pd.read_csv('temp/ind_news.csv')

df_tar = pd.read_csv('temp/target.csv')
df_tar = df_tar[['report_date', 'ticker', 'label']]
# df_stock_ta = df_stock_ta.drop(['open', 'high', 'low', 'close', 'volume', 'adjusted_close'], axis = 1)
# df_ind_ta = df_ind_ta.drop(['open', 'high', 'low', 'close', 'volume', 'adjusted_close'], axis = 1)

# Prepare Cross Industrial Feature
df_inds = df_ind_cdl.merge(df_ind_news, on = ['inds', 'report_date'], how = 'left')
df_inds = df_inds.merge(df_ind_ta, on = ['report_date', 'inds'])
df_inds = df_inds.fillna(method = 'ffill')

In [3]:
period = 7; train_size = 0.8; prepare_3d = False
require_features = ['cdl', 'ta', 'news']
inds_num = len(df_inds['inds'].unique())

price_cols = ['open', 'high', 'low', 'close', 'volume']
cdl_cols = ['morning_star', 'evening_star', 'hammer',
       'inverted_hammer', 'bullish_engulfing', 'bearish_engulfing',
       'shooting_star', 'hanging_man']
news_cols = ['compound', 'neg', 'neu', 'pos', 'new_cases',
       'total_cases', 'total_deaths', 'new_deaths', 'total_deaths_nd']
ta_cols = df_stock_ta.drop(['report_date', 'ticker'], axis = 1).columns

In [4]:
# Prepare Train test dataset preparation
X_train, y_train, X_test, y_test = [], [], [], []

stocks = df_tar['ticker'].unique()
for stock in stocks:
    
    print('Current Stock: ', stock)
    temp_stock_cdl = df_stock_cdl[df_stock_cdl['ticker'] == stock]
    temp_stock_ta = df_stock_ta[df_stock_ta['ticker'] == stock]
    temp_stock_news = df_stock_news[df_stock_news['ticker'] == stock]
    temp_stock_tar = df_tar[df_tar['ticker'] == stock]
    
    # Merge results
    data = temp_stock_tar.merge(temp_stock_cdl, on = ['report_date', 'ticker'])
    data = data.merge(temp_stock_ta, on = ['report_date', 'ticker'])
    data = data.merge(temp_stock_news, on = ['report_date', 'ticker'], how = 'left')
    data = data.fillna(method = 'ffill')
    
    check = True
    for i in range(period, len(data)):
        
        temp = []
        # Create historical data (within period)
        temp_period = data.iloc[i - period:i,]
        if temp_period.isnull().sum().sum() > 0:
            continue

        # Create stock feature space
        feature_dfs = []
        feature_dfs.append(temp_period[price_cols])
        
        if 'cdl' in require_features:
            feature_dfs.append(temp_period[cdl_cols])
            
        if 'news' in require_features:
            feature_dfs.append(temp_period[news_cols])
            
        if 'ta' in require_features:
            feature_dfs.append(temp_period[ta_cols])
            
        temp_stock_feature = pd.concat(feature_dfs, axis = 1).to_numpy()
        temp.append(temp_stock_feature)
        
        # Get industrial feature space
        for ind in sorted(list(df_inds['inds'].unique())):
            temp_period_inds = df_inds[df_inds['inds'] == ind].reset_index(drop = True).iloc[i - period:i,]
            
            feature_ind_dfs = []
            feature_ind_dfs.append(temp_period_inds[price_cols])
            if 'cdl' in require_features:
                feature_ind_dfs.append(temp_period_inds[cdl_cols])

            if 'news' in require_features:
                feature_ind_dfs.append(temp_period_inds[news_cols])

            if 'ta' in require_features:
                feature_ind_dfs.append(temp_period_inds[ta_cols])
                
            temp_ind_feature = pd.concat(feature_ind_dfs, axis = 1).to_numpy()
            temp.append(temp_ind_feature)
        
        # Create label 
        y_label = temp_period.iloc[-1]['label']
        
        if i <= int((len(data) - period) * train_size):
            
#             if check:
#                 print('train', i)
#                 print('max report_date: ', max(temp_period['report_date']), 'min report_date: ', min(temp_period['report_date']))
#                 print('Whole feature shape: ', np.array(temp).shape)
#                 print('label: ', y_label)
#                 check = False

            X_train.append(temp)
            y_train.append(y_label)
        else: 
            X_test.append(temp)
            y_test.append(y_label)
        
X_train, y_train = np.array(X_train).astype('float32'), np.array(y_train)
X_test, y_test = np.array(X_test).astype('float32'), np.array(y_test)

Current Stock:  MMM
Current Stock:  AOS
Current Stock:  ABT
Current Stock:  ABBV
Current Stock:  ABMD
Current Stock:  ACN
Current Stock:  ATVI
Current Stock:  ADM
Current Stock:  ADBE
Current Stock:  ADP
Current Stock:  AAP
Current Stock:  AES
Current Stock:  AFL
Current Stock:  A
Current Stock:  APD
Current Stock:  AKAM
Current Stock:  ALK
Current Stock:  ALB
Current Stock:  ARE
Current Stock:  ALGN
Current Stock:  ALLE
Current Stock:  LNT
Current Stock:  ALL
Current Stock:  GOOGL
Current Stock:  GOOG
Current Stock:  MO
Current Stock:  AMZN
Current Stock:  AMCR
Current Stock:  AMD
Current Stock:  AEE
Current Stock:  AAL
Current Stock:  AEP
Current Stock:  AXP
Current Stock:  AIG
Current Stock:  AMT
Current Stock:  AWK
Current Stock:  AMP
Current Stock:  ABC
Current Stock:  AME
Current Stock:  AMGN
Current Stock:  APH
Current Stock:  ADI
Current Stock:  ANSS
Current Stock:  AON
Current Stock:  APA
Current Stock:  AAPL
Current Stock:  AMAT
Current Stock:  APTV
Current Stock:  ANET
Curre

Current Stock:  RCL
Current Stock:  SPGI
Current Stock:  CRM
Current Stock:  SBAC
Current Stock:  SLB
Current Stock:  STX
Current Stock:  SEE
Current Stock:  SRE
Current Stock:  NOW
Current Stock:  SHW
Current Stock:  SBNY
Current Stock:  SPG
Current Stock:  SWKS
Current Stock:  SJM
Current Stock:  SNA
Current Stock:  SEDG
Current Stock:  SO
Current Stock:  LUV
Current Stock:  SWK
Current Stock:  SBUX
Current Stock:  STT
Current Stock:  STE
Current Stock:  SYK
Current Stock:  SIVB
Current Stock:  SYF
Current Stock:  SNPS
Current Stock:  SYY
Current Stock:  TMUS
Current Stock:  TROW
Current Stock:  TTWO
Current Stock:  TPR
Current Stock:  TGT
Current Stock:  TEL
Current Stock:  TDY
Current Stock:  TFX
Current Stock:  TER
Current Stock:  TSLA
Current Stock:  TXN
Current Stock:  TXT
Current Stock:  TMO
Current Stock:  TJX
Current Stock:  TSCO
Current Stock:  TT
Current Stock:  TDG
Current Stock:  TRV
Current Stock:  TRMB
Current Stock:  TFC
Current Stock:  TWTR
Current Stock:  TYL
Current

In [5]:
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

X_train shape:  (109349, 12, 7, 38)
y_train shape:  (109349,)
X_test shape:  (42488, 12, 7, 38)
y_test shape:  (42488,)


In [6]:
# Divide to multiple combinations
cond_dict = {}
feature_idx_dict = {name: ind for ind, name in enumerate(pd.concat(feature_dfs, axis = 1).columns)}
adj_news_cols = ['compound', 'pos', 'new_deaths', 'new_deaths_nd']

# Candlestick + TA + News
feature_idx = [idx for name, idx in feature_idx_dict.items() \
                   if name in price_cols + cdl_cols + adj_news_cols + list(ta_cols)]
cond_dict['All'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick + News
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols + adj_news_cols]
cond_dict['Candlestick+News'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick + TA
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols + list(ta_cols)]
cond_dict['Candlestick+TA'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick Only
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols]
cond_dict['CandlestickOnly'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

## LSTM

In [13]:
def lstm(input_shape):
    
    # LSTM model
    model = models.Sequential()
    model.add(layers.LSTM(50, activation = 'relu', input_shape = input_shape))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = lstm((temp_train.shape[1], temp_train.shape[2]))
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 50)                41400     
                                                                 
 dense_9 (Dense)             (None, 1)                 51        
                                                                 
Total params: 41,451
Trainable params: 41,451
Non-trainable params: 0
_________________________________________________________________


In [8]:
for feature_type, data in cond_dict.items():

    print(f'---- {feature_type} ----')
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], period, -1)
        temp_test = temp_test.reshape(temp_test.shape[0], period, -1)

    model = lstm((temp_train.shape[1], temp_train.shape[2]))
    history = model.fit(temp_train, y_train, epochs=4, validation_split=0.2)

    print('Training Accuracy')
    print(model.evaluate(temp_train, y_train))

    print('Testing Accuracy')
    print(model.evaluate(temp_test, y_test))

---- All ----
Metal device set to: Apple M1 Pro


2022-11-14 17:39:00.162826: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-14 17:39:00.163847: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/4


2022-11-14 17:39:01.727953: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-11-14 17:39:02.396735: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-14 17:41:40.233341: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.5767002701759338, 0.7019909024238586]
Testing Accuracy
[0.9563307166099548, 0.48361894488334656]
---- Candlestick+News ----
Epoch 1/4


2022-11-14 17:50:45.127483: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-14 17:53:33.739140: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.5040032863616943, 0.750505268573761]
Testing Accuracy
[1.5789481401443481, 0.4912681579589844]
---- Candlestick+TA ----
Epoch 1/4


2022-11-14 18:03:04.685664: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-14 18:05:32.896665: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.5862230062484741, 0.693723738193512]
Testing Accuracy
[0.8096153140068054, 0.5195349454879761]
---- CandlestickOnly ----
Epoch 1/4


2022-11-14 18:14:38.747588: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-11-14 18:17:18.776678: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Training Accuracy
[0.5052719116210938, 0.7500114440917969]
Testing Accuracy
[1.7391356229782104, 0.49738752841949463]


## Simple RNN

In [9]:
# Simple RNN
def simpleRNN(input_shape):
    
    # LSTM model
    model = models.Sequential()
    model.add(layers.SimpleRNN(50, input_shape = input_shape, activation = 'relu'))

    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = simpleRNN((temp_train.shape[1], temp_train.shape[2]))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 50)                10350     
                                                                 
 dense_4 (Dense)             (None, 1)                 51        
                                                                 
Total params: 10,401
Trainable params: 10,401
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.get_config()

{'name': 'sequential_8',
 'layers': [{'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 7, 156),
    'dtype': 'float32',
    'sparse': False,
    'ragged': False,
    'name': 'simple_rnn_4_input'}},
  {'class_name': 'SimpleRNN',
   'config': {'name': 'simple_rnn_4',
    'trainable': True,
    'batch_input_shape': (None, 7, 156),
    'dtype': 'float32',
    'return_sequences': False,
    'return_state': False,
    'go_backwards': False,
    'stateful': False,
    'unroll': False,
    'time_major': False,
    'units': 50,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None}},
    'recurrent_initializer': {'class_name': 'Orthogonal',
     'config': {'gain': 1.0, 'seed': None}},
    'bias_initializer': {'class_name': 'Zeros', 'config': {}},
    'kernel_regularizer': None,
    'recurrent_regularizer': None,
    'bias_regularizer': None,
    'activity_regularizer': None,
    'kernel_constra

In [10]:
# Simple RNN
def simpleRNN(input_shape):
    
    # LSTM model
    model = models.Sequential()
    model.add(layers.SimpleRNN(50, input_shape = input_shape, activation = 'relu'))

    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

prepare_3d = False
for feature_type, data in cond_dict.items():

    print(f'---- {feature_type} ----')
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], period, -1)
        temp_test = temp_test.reshape(temp_test.shape[0], period, -1)

    model = simpleRNN((temp_train.shape[1], temp_train.shape[2]))
    history = model.fit(temp_train, y_train, epochs=5)

    print('Training Accuracy')
    print(model.evaluate(temp_train, y_train))

    print('Testing Accuracy')
    print(model.evaluate(temp_test, y_test))

---- All ----
Epoch 1/5


2022-11-14 18:25:56.198380: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Accuracy
   4/3418 [..............................] - ETA: 1:05 - loss: 0.5284 - accuracy: 0.6562 

2022-11-14 19:03:47.332399: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[0.5149776935577393, 0.7457315325737]
Testing Accuracy
[0.8708304762840271, 0.5759038329124451]
---- Candlestick+News ----
Epoch 1/5


2022-11-14 19:05:57.544164: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Accuracy
   1/3418 [..............................] - ETA: 18:11 - loss: 0.8465 - accuracy: 0.4688

2022-11-14 19:38:58.385086: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[0.5019160509109497, 0.7512094378471375]
Testing Accuracy
[0.6561606526374817, 0.6898418664932251]
---- Candlestick+TA ----
Epoch 1/5


2022-11-14 19:52:46.910521: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Accuracy


2022-11-14 20:07:33.148386: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[0.6306959986686707, 0.6498458981513977]
Testing Accuracy
[0.7148867249488831, 0.5322914719581604]
---- CandlestickOnly ----
Epoch 1/5


2022-11-14 22:31:17.844724: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Accuracy
   1/3418 [..............................] - ETA: 15:32 - loss: 0.8315 - accuracy: 0.4688

2022-11-15 00:27:41.659504: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[0.5027225613594055, 0.750523567199707]
Testing Accuracy
[0.8874157071113586, 0.5774336457252502]


In [15]:
def cnn(input_shape):
    
    # function for creating a vgg block
    def vgg_block(layer_in, n_filters, n_conv):
        # add convolutional layers
        for _ in range(n_conv):
            layer_in = layers.Conv2D(n_filters, (3,3), padding='same', activation='relu')(layer_in)
        # add max pooling layer
        layer_in = layers.MaxPooling2D((2,2), strides=(2,2))(layer_in)
        return layer_in

    # define model input
    visible = layers.Input(shape=input_shape)

    # add vgg module
    layer = vgg_block(visible, 16, 2)
    layer = vgg_block(layer, 32, 2)
    layer = vgg_block(layer, 64, 3)

    layer = layers.Flatten()(layer)
    layer = layers.Dense(1028, activation = 'relu')(layer)
    layer = layers.Dense(64, activation = 'relu')(layer)
    layer = layers.Dense(1, activation = 'sigmoid')(layer)

    model = models.Model(inputs = visible, outputs = layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

temp_train = data['train']
temp_test = data['test']

train_shape = temp_train.shape
temp_train = temp_train.reshape(train_shape[0], train_shape[3], train_shape[1], train_shape[2])
test_shape = temp_test.shape
temp_test = temp_test.reshape(test_shape[0], test_shape[3], test_shape[1], test_shape[2])
print(temp_train.shape, temp_test.shape)

dim = (temp_train.shape[1], temp_train.shape[2], temp_train.shape[3])
model = cnn(dim)
model.summary()

(109349, 13, 12, 7) (42488, 13, 12, 7)
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 13, 12, 7)]       0         
                                                                 
 conv2d_7 (Conv2D)           (None, 13, 12, 16)        1024      
                                                                 
 conv2d_8 (Conv2D)           (None, 13, 12, 16)        2320      
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 6, 6, 16)         0         
 2D)                                                             
                                                                 
 conv2d_9 (Conv2D)           (None, 6, 6, 32)          4640      
                                                                 
 conv2d_10 (Conv2D)          (None, 6, 6, 32)          9248      
                    