In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt

from datetime import date, datetime, timedelta

In [2]:
df_stock_ta = pd.read_csv('temp/stock_ta.csv')
df_stock_cdl = pd.read_csv('temp/stock_cdl.csv')
df_ind_ta = pd.read_csv('temp/ind_ta.csv')
df_ind_cdl = pd.read_csv('temp/ind_cdl.csv')
df_stock_news = pd.read_csv('temp/stock_news.csv')
df_ind_news = pd.read_csv('temp/ind_news.csv')

df_tar = pd.read_csv('temp/target.csv')
df_tar = df_tar[['report_date', 'ticker', 'label']]
# df_stock_ta = df_stock_ta.drop(['open', 'high', 'low', 'close', 'volume', 'adjusted_close'], axis = 1)
# df_ind_ta = df_ind_ta.drop(['open', 'high', 'low', 'close', 'volume', 'adjusted_close'], axis = 1)

# Prepare Cross Industrial Feature
df_inds = df_ind_cdl.merge(df_ind_news, on = ['inds', 'report_date'], how = 'left')
df_inds = df_inds.merge(df_ind_ta, on = ['report_date', 'inds'])
df_inds = df_inds.fillna(method = 'ffill')

In [3]:
period = 7; train_size = 0.8; prepare_3d = False
require_features = ['cdl', 'ta', 'news']
inds_num = len(df_inds['inds'].unique())

price_cols = ['open', 'high', 'low', 'close', 'volume']
cdl_cols = ['morning_star', 'evening_star', 'hammer',
       'inverted_hammer', 'bullish_engulfing', 'bearish_engulfing',
       'shooting_star', 'hanging_man']
news_cols = ['compound', 'neg', 'neu', 'pos', 'new_cases',
       'total_cases', 'total_deaths', 'new_deaths', 'total_deaths_nd']
ta_cols = df_stock_ta.drop(['report_date', 'ticker'], axis = 1).columns

In [4]:
# Prepare Train test dataset preparation
X_train, y_train, X_test, y_test = [], [], [], []

stocks = df_tar['ticker'].unique()
for stock in stocks:
    
    print('Current Stock: ', stock)
    temp_stock_cdl = df_stock_cdl[df_stock_cdl['ticker'] == stock]
    temp_stock_ta = df_stock_ta[df_stock_ta['ticker'] == stock]
    temp_stock_news = df_stock_news[df_stock_news['ticker'] == stock]
    temp_stock_tar = df_tar[df_tar['ticker'] == stock]
    
    # Merge results
    data = temp_stock_tar.merge(temp_stock_cdl, on = ['report_date', 'ticker'])
    data = data.merge(temp_stock_ta, on = ['report_date', 'ticker'])
    data = data.merge(temp_stock_news, on = ['report_date', 'ticker'], how = 'left')
    data = data.fillna(method = 'ffill')
    
    check = True
    for i in range(period, len(data)):
        
        temp = []
        # Create historical data (within period)
        temp_period = data.iloc[i - period:i,]
        if temp_period.isnull().sum().sum() > 0:
            continue

        # Create stock feature space
        feature_dfs = []
        feature_dfs.append(temp_period[price_cols])
        
        if 'cdl' in require_features:
            feature_dfs.append(temp_period[cdl_cols])
            
        if 'news' in require_features:
            feature_dfs.append(temp_period[news_cols])
            
        if 'ta' in require_features:
            feature_dfs.append(temp_period[ta_cols])
            
        temp_stock_feature = pd.concat(feature_dfs, axis = 1).to_numpy()
        temp.append(temp_stock_feature)
        
        # Get industrial feature space
        for ind in sorted(list(df_inds['inds'].unique())):
            temp_period_inds = df_inds[df_inds['inds'] == ind].reset_index(drop = True).iloc[i - period:i,]
            
            feature_ind_dfs = []
            feature_ind_dfs.append(temp_period_inds[price_cols])
            if 'cdl' in require_features:
                feature_ind_dfs.append(temp_period_inds[cdl_cols])

            if 'news' in require_features:
                feature_ind_dfs.append(temp_period_inds[news_cols])

            if 'ta' in require_features:
                feature_ind_dfs.append(temp_period_inds[ta_cols])
                
            temp_ind_feature = pd.concat(feature_ind_dfs, axis = 1).to_numpy()
            temp.append(temp_ind_feature)
        
        # Create label 
        y_label = temp_period.iloc[-1]['label']
        
        if i <= int((len(data) - period) * train_size):
            
#             if check:
#                 print('train', i)
#                 print('max report_date: ', max(temp_period['report_date']), 'min report_date: ', min(temp_period['report_date']))
#                 print('Whole feature shape: ', np.array(temp).shape)
#                 print('label: ', y_label)
#                 check = False

            X_train.append(temp)
            y_train.append(y_label)
        else: 
            X_test.append(temp)
            y_test.append(y_label)
        
X_train, y_train = np.array(X_train).astype('float32'), np.array(y_train)
X_test, y_test = np.array(X_test).astype('float32'), np.array(y_test)



Current Stock:  MMM
Current Stock:  AOS
Current Stock:  ABT
Current Stock:  ABBV
Current Stock:  ABMD
Current Stock:  ACN
Current Stock:  ATVI
Current Stock:  ADM
Current Stock:  ADBE
Current Stock:  ADP
Current Stock:  AAP
Current Stock:  AES
Current Stock:  AFL
Current Stock:  A
Current Stock:  APD
Current Stock:  AKAM
Current Stock:  ALK
Current Stock:  ALB
Current Stock:  ARE
Current Stock:  ALGN
Current Stock:  ALLE
Current Stock:  LNT
Current Stock:  ALL
Current Stock:  GOOGL
Current Stock:  GOOG
Current Stock:  MO
Current Stock:  AMZN
Current Stock:  AMCR
Current Stock:  AMD
Current Stock:  AEE
Current Stock:  AAL
Current Stock:  AEP
Current Stock:  AXP
Current Stock:  AIG
Current Stock:  AMT
Current Stock:  AWK
Current Stock:  AMP
Current Stock:  ABC
Current Stock:  AME
Current Stock:  AMGN
Current Stock:  APH
Current Stock:  ADI
Current Stock:  ANSS
Current Stock:  AON
Current Stock:  APA
Current Stock:  AAPL
Current Stock:  AMAT
Current Stock:  APTV
Current Stock:  ANET
Curre

Current Stock:  RCL
Current Stock:  SPGI
Current Stock:  CRM
Current Stock:  SBAC
Current Stock:  SLB
Current Stock:  STX
Current Stock:  SEE
Current Stock:  SRE
Current Stock:  NOW
Current Stock:  SHW
Current Stock:  SBNY
Current Stock:  SPG
Current Stock:  SWKS
Current Stock:  SJM
Current Stock:  SNA
Current Stock:  SEDG
Current Stock:  SO
Current Stock:  LUV
Current Stock:  SWK
Current Stock:  SBUX
Current Stock:  STT
Current Stock:  STE
Current Stock:  SYK
Current Stock:  SIVB
Current Stock:  SYF
Current Stock:  SNPS
Current Stock:  SYY
Current Stock:  TMUS
Current Stock:  TROW
Current Stock:  TTWO
Current Stock:  TPR
Current Stock:  TGT
Current Stock:  TEL
Current Stock:  TDY
Current Stock:  TFX
Current Stock:  TER
Current Stock:  TSLA
Current Stock:  TXN
Current Stock:  TXT
Current Stock:  TMO
Current Stock:  TJX
Current Stock:  TSCO
Current Stock:  TT
Current Stock:  TDG
Current Stock:  TRV
Current Stock:  TRMB
Current Stock:  TFC
Current Stock:  TWTR
Current Stock:  TYL
Current

In [8]:
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

X_train shape:  (109349, 12, 7, 38)
y_train shape:  (109349,)
X_test shape:  (42488, 12, 7, 38)
y_test shape:  (42488,)


In [9]:
# Divide to multiple combinations
cond_dict = {}
feature_idx_dict = {name: ind for ind, name in enumerate(pd.concat(feature_dfs, axis = 1).columns)}
adj_news_cols = ['compound', 'pos', 'new_deaths', 'new_deaths_nd']

# Candlestick + TA + News
feature_idx = [idx for name, idx in feature_idx_dict.items() \
                   if name in price_cols + cdl_cols + adj_news_cols + list(ta_cols)]
cond_dict['All'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick + News
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols + adj_news_cols]
cond_dict['Candlestick+News'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick + TA
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols + list(ta_cols)]
cond_dict['Candlestick+TA'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

# Candlestick Only
feature_idx = [idx for name, idx in feature_idx_dict.items() if name in price_cols + cdl_cols]
cond_dict['CandlestickOnly'] = {'train': X_train[:, :, :, feature_idx], 'test': X_test[:, :, :, feature_idx]}

## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Loop over each feature type and fit the model
for feature_type, data in cond_dict.items():
    
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], -1)
        temp_test = temp_test.reshape(temp_test.shape[0], -1)

    clf = LogisticRegression(random_state = 7600)
    clf.fit(temp_train, y_train)

    print(f'Training Accuracy for {feature_type}: ', round(clf.score(temp_train, y_train) * 100, 2), '%')
    print(f'Testing Accuracy for {feature_type}: ', round(clf.score(temp_test, y_test) * 100, 2), '%')
    print()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy for All:  64.26 %
Testing Accuracy for All:  60.69 %



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy for Candlestick+News:  73.62 %
Testing Accuracy for Candlestick+News:  62.68 %



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy for Candlestick+TA:  64.01 %
Testing Accuracy for Candlestick+TA:  61.04 %



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy for CandlestickOnly:  74.0 %
Testing Accuracy for CandlestickOnly:  62.08 %



## KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Loop over each feature type and fit the model
for feature_type, data in cond_dict.items():
    
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], -1)
        temp_test = temp_test.reshape(temp_test.shape[0], -1)

    clf = KNeighborsClassifier()
    clf.fit(temp_train, y_train)

    print(f'Training Accuracy for {feature_type}: ', round(clf.score(temp_train, y_train) * 100, 2), '%')
    print(f'Testing Accuracy for {feature_type}: ', round(clf.score(temp_test, y_test) * 100, 2), '%')
    print()

Training Accuracy for All:  78.49 %
Testing Accuracy for All:  49.98 %

Training Accuracy for Candlestick+News:  76.28 %
Testing Accuracy for Candlestick+News:  53.11 %

Training Accuracy for Candlestick+TA:  78.54 %
Testing Accuracy for Candlestick+TA:  51.52 %

Training Accuracy for CandlestickOnly:  76.84 %
Testing Accuracy for CandlestickOnly:  55.51 %



# Decision Tree

In [12]:
# Decission Tree
from sklearn.tree import DecisionTreeClassifier

prepare_3d = False
# Loop over each feature type and fit the model
for feature_type, data in cond_dict.items():
    
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], -1)
        temp_test = temp_test.reshape(temp_test.shape[0], -1)

    clf = DecisionTreeClassifier(random_state = 7600)
    clf.fit(temp_train, y_train)

    print(f'Training Accuracy for {feature_type}: ', round(clf.score(temp_train, y_train) * 100, 2), '%')
    print(f'Testing Accuracy for {feature_type}: ', round(clf.score(temp_test, y_test) * 100, 2), '%')
    print()

Training Accuracy for All:  100.0 %
Testing Accuracy for All:  56.31 %

Training Accuracy for Candlestick+News:  100.0 %
Testing Accuracy for Candlestick+News:  54.83 %

Training Accuracy for Candlestick+TA:  100.0 %
Testing Accuracy for Candlestick+TA:  57.68 %

Training Accuracy for CandlestickOnly:  100.0 %
Testing Accuracy for CandlestickOnly:  59.48 %



## Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Loop over each feature type and fit the model
for feature_type, data in cond_dict.items():
    
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], -1)
        temp_test = temp_test.reshape(temp_test.shape[0], -1)

    clf = RandomForestClassifier(random_state = 7600)
    clf.fit(temp_train, y_train)

    print(f'Training Accuracy for {feature_type}: ', round(clf.score(temp_train, y_train) * 100, 2), '%')
    print(f'Testing Accuracy for {feature_type}: ', round(clf.score(temp_test, y_test) * 100, 2), '%')
    print()

Training Accuracy for All:  100.0 %
Testing Accuracy for All:  61.82 %

Training Accuracy for Candlestick+News:  100.0 %
Testing Accuracy for Candlestick+News:  49.68 %

Training Accuracy for Candlestick+TA:  100.0 %
Testing Accuracy for Candlestick+TA:  62.57 %

Training Accuracy for CandlestickOnly:  100.0 %
Testing Accuracy for CandlestickOnly:  62.01 %



# XGBoost

In [35]:
import xgboost as xgb

param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
param['eval_metric'] = 'auc'
num_round = 100

# Loop over each feature type and fit the model
for feature_type, data in cond_dict.items():
    
    temp_train = data['train']
    temp_test = data['test']

    if not prepare_3d:
        temp_train = temp_train.reshape(temp_train.shape[0], -1)
        temp_test = temp_test.reshape(temp_test.shape[0], -1)

    dtrain = xgb.DMatrix(temp_train, label = y_train)
    dtest = xgb.DMatrix(temp_test, label = y_test)
        
    bst = xgb.train(param, dtrain, num_round)

    # make prediction
    preds = bst.predict(dtest)

    print(f'Training Accuracy for {feature_type}: ', round(np.mean((bst.predict(dtrain) > 0.5) == y_train) * 100, 2), '%')
    print(f'Testing Accuracy for {feature_type}: ', round(np.mean((preds > 0.5) == y_test) * 100, 2), '%')


Training Accuracy for All:  77.85 %
Testing Accuracy for All:  62.6 %
Training Accuracy for Candlestick+News:  75.44 %
Testing Accuracy for Candlestick+News:  62.44 %
Training Accuracy for Candlestick+TA:  77.9 %
Testing Accuracy for Candlestick+TA:  64.86 %
Training Accuracy for CandlestickOnly:  75.15 %
Testing Accuracy for CandlestickOnly:  64.27 %
