In [1]:
# install libraries for keras
# !pip3 install keras
# !pip3 install h5py

import pandas as pd  
import matplotlib.pyplot as plt
import datetime, random
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense, Lambda
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers.merge import concatenate as concat
from keras import backend as K
from keras.optimizers import Adam
%matplotlib inline

Using TensorFlow backend.


## Implementing CVAE on label 

<H2>Import data</H2>


In [None]:
# Load directly from GitHub
stock_data_file = 'https://raw.githubusercontent.com/CNuge/kaggle-code/master/stock_data/all_stocks_5yr.csv'
stocks_df = pd.read_csv(stock_data_file, parse_dates=['date'])
stocks_df.head()

In [13]:
print('Stocks found:', len(set(stocks_df['Name'])))
print('Date range:', str(np.min(stocks_df['date']))[:10], '-', str(np.max(stocks_df['date']))[:10])

Stocks found: 505
Date range: 2013-02-08 - 2018-02-07


**Create quantitative features**

In [14]:
# create a couple of quantitative features 
stocks_df['day'] = stocks_df['date'].dt.weekday
stocks_df['day_of_month'] = stocks_df['date'].dt.day
stocks_df.head()

Unnamed: 0,date,open,high,low,close,volume,Name,day,day_of_month
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL,4,8
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL,0,11
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL,1,12
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL,2,13
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL,3,14


**Simplify by only using DOW 30 stocks instead of full S&P 500**

In [16]:
stock_symbols = list(set(stocks_df['Name']))

dow_30_list = ['NKE', 'AXP', 'BA', 'C', 'CAT', 'DD', 'DIS', 'GE', 'CSCO', 'HD', 'CVX', 'V', 'IBM',
'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'GS', 'MRK', 'MSFT', 'PFE', 'PG', 'AAPL', 'UTX', 'VZ',
'WMT', 'XOM', 'TRV']
 
dow_30_symbols = (list(set(dow_30_list) & set(stock_symbols)))


In [17]:
# apply some more quantitative features but make sure it doesn't spill from 
# one stock to another
market_data = {}
for symbol in dow_30_symbols:
    # segregate data for that stock so it doesn't spill into other one
    temp_df = stocks_df[stocks_df['Name'] == symbol]
    open = temp_df['open']
    high = temp_df['high']
    low = temp_df['low']
    close = temp_df['close']
    volume = temp_df['volume']
    close_avg_3 =  pd.rolling_mean(temp_df['close'], 3)
    close_avg_5 =  pd.rolling_mean(temp_df['close'], 5)
    close_avg_10 =  pd.rolling_mean(temp_df['close'], 10)
    open_close = temp_df['close'] - temp_df['open']
    day_ = temp_df['day']
    day_of_month = temp_df['day_of_month']
    date = temp_df['date']
    # day of month
    
    market_data[symbol] = [open, high, low, close,
                                volume, close_avg_3, close_avg_5, close_avg_10, open_close, day_,
                               day_of_month, date]
                               

In [181]:
# rebuild data frame
fin_open = []
fin_high = []
fin_low = []
fin_close = []
fin_volume = []
fin_close_avg_3 = []
fin_close_avg_5 = []
fin_close_avg_10 = []
fin_open_close = []

fin_day_ = []
fin_day_of_month = []
fin_date = []
fin_symbol = []

fin_high_high_diff3 = []
fin_low_low_diff3 = []
fin_open_open_diff3 = []
fin_close_close_diff3 = []
fin_volume_volume_diff3 = []


for key, value in market_data.items():
    fin_open.extend(list(value[0]))
    fin_high.extend(list(value[1]))
    fin_low.extend(list(value[2]))
    fin_close.extend(list(value[3]))
    fin_volume.extend(list(value[4]))
    fin_close_avg_3.extend(list(value[5]))
    fin_close_avg_5.extend(list(value[6]))
    fin_close_avg_10.extend(list(value[7]))
    fin_open_close.extend(list(value[8]))
    fin_day_.extend(list(value[9]))
    fin_day_of_month.extend(list(value[10]))
    fin_date.extend(list(value[11]))
    fin_symbol.extend([key] * len(value[11]))

# build final data frame for CVAE
norm_stocks_df = pd.DataFrame({'symbol':fin_symbol, 
                            'open':fin_open,
                            'high':fin_high,
                            'low':fin_low,
                            'close':fin_close,
                            'raw_close':fin_close,
                            'volume':fin_volume,
                            'raw_volume':fin_volume,
                            'close_avg_3':fin_close_avg_3,
                            'close_avg_5':fin_close_avg_5,
                            'close_avg_10':fin_close_avg_10,
                            'open_close':fin_open_close,
                            'raw_open_close':fin_open_close,
                            'day':fin_day_,
                            'day_of_month':fin_day_of_month,
                            'date':fin_date})

# remove any rows with NaN or inf columns
norm_stocks_df = norm_stocks_df.replace([np.inf, -np.inf], np.nan)
norm_stocks_df = norm_stocks_df.dropna(how='any')

norm_stocks_df.head(5)

Unnamed: 0,close,close_avg_10,close_avg_3,close_avg_5,date,day,day_of_month,high,low,open,open_close,raw_close,raw_open_close,raw_volume,symbol,volume
9,38.52,37.804,37.986667,37.81,2013-02-22,4,22,38.52,37.71,37.76,0.76,38.52,0.76,20102745,KO,20102745
10,37.72,37.699,37.983333,37.87,2013-02-25,0,25,38.72,37.72,38.56,-0.84,37.72,-0.84,15879793,KO,15879793
11,38.11,37.649,38.116667,37.958,2013-02-26,1,26,38.19,37.775,37.95,0.16,38.11,0.16,15769957,KO,15769957
12,38.45,37.738,38.093333,38.102,2013-02-27,2,27,38.54,37.92,38.03,0.42,38.45,0.42,14277317,KO,14277317
13,38.72,37.889,38.426667,38.304,2013-02-28,3,28,38.97,38.42,38.47,0.25,38.72,0.25,19010564,KO,19010564


**Normalize And Impute Missing Data**

In [218]:
features = [f for f in list(norm_stocks_df) if f not in ['date', 'symbol', 'raw_close', 'raw_open_close', 'raw_volume']]

df = norm_stocks_df.copy()

# Take all numerical cols and normalize data b/w 0 and 1
df[features] = df[features].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

print(features)

df['symbol'].value_counts()

['close', 'close_avg_10', 'close_avg_3', 'close_avg_5', 'day', 'day_of_month', 'high', 'low', 'open', 'open_close', 'volume']


TRV     1250
CVX     1250
PG      1250
MRK     1250
DIS     1250
KO      1250
CSCO    1250
UTX     1250
HD      1250
NKE     1250
JNJ     1250
WMT     1250
AAPL    1250
INTC    1250
GS      1250
CAT     1250
VZ      1250
AXP     1250
JPM     1250
MSFT    1250
MMM     1250
GE      1250
C       1250
PFE     1250
V       1250
XOM     1250
MCD     1250
IBM     1250
BA      1250
Name: symbol, dtype: int64


**CVAE on label + one hot encoding**

In [183]:

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

label = df['symbol']

le= LabelEncoder().fit(label)
encoded_Y = le.transform(label)  # convert categorical labels to integers
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

print(dummy_y[:4])

num_symbols = max(encoded_Y)
print(num_symbols)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]]
28


In [184]:
le.classes_

array(['AAPL', 'AXP', 'BA', 'C', 'CAT', 'CSCO', 'CVX', 'DIS', 'GE', 'GS',
       'HD', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK',
       'MSFT', 'NKE', 'PFE', 'PG', 'TRV', 'UTX', 'V', 'VZ', 'WMT', 'XOM'],
      dtype=object)

**Split the dataset into train, validation and test sets**

Save a copy of the full data for a particular stock to investigate (in this case symbol 0, or AAPL)

In [240]:
###################################################
## save the stock of interest for anomaly testing
stock_symbol_identifier = 0
print('Generate stock data for:', 
      le.classes_[stock_symbol_identifier])
###################################################

df_stock = df[df['symbol']==le.classes_[stock_symbol_identifier]].tail(100)
stock_dummy = dummy_y[1150:1250]
stock_dummy.fill(0)
stock_dummy[:,stock_symbol_identifier] = 1

 
## Break into train and test dataset
features = [f for f in list(norm_stocks_df)]
X_train, X_test, y_train, y_test = train_test_split(df[features], dummy_y, test_size=0.20, random_state=26)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.10, random_state=89)

# take copy of test set with symbol and date features for anomaly investigation
X_test_with_symbol = X_test.copy()
features = [f for f in list(norm_stocks_df) if f not in ['date', 'symbol',  'raw_close', 'raw_open_close', 'raw_volume']]
X_test = X_test[features]
X_train = X_train[features]
X_valid = X_valid[features]

print("X shape: ", X_train.shape, X_valid.shape, X_test.shape)
print("Y shape: ", y_train.shape, y_valid.shape, y_test.shape)

Generate stock data for: AAPL
X shape:  (26100, 11) (2900, 11) (7250, 11)
Y shape:  (26100, 29) (2900, 29) (7250, 29)


In [241]:
features

['close',
 'close_avg_10',
 'close_avg_3',
 'close_avg_5',
 'day',
 'day_of_month',
 'high',
 'low',
 'open',
 'open_close',
 'volume']

In [242]:
X_train.tail()

Unnamed: 0,close,close_avg_10,close_avg_3,close_avg_5,day,day_of_month,high,low,open,open_close,volume
1954,0.225945,0.237173,0.228924,0.234242,0.5,0.333333,0.226425,0.228708,0.227291,0.362824,0.009354
7205,0.21539,0.219025,0.21637,0.218444,0.25,0.633333,0.212675,0.217799,0.213145,0.399712,0.054506
35230,0.21118,0.211297,0.212142,0.213852,0.0,0.233333,0.207477,0.213125,0.207938,0.410086,0.017189
36308,0.147911,0.151045,0.148307,0.150013,0.75,0.633333,0.145647,0.149498,0.146494,0.398559,0.035593
6455,0.182148,0.192645,0.184817,0.188841,1.0,0.866667,0.180318,0.183974,0.182047,0.381844,0.010108


In [243]:
## discard incomplete batches
m = 50 # batch size

num_batches_train = X_train.shape[0]//m
print(num_batches_train)
X_train_trun = X_train.head(num_batches_train*m)
y_train_trun = y_train[:num_batches_train*m,:]
print(X_train_trun.shape, y_train_trun.shape)

num_batches_valid = X_valid.shape[0]//m
X_valid_trun = X_valid.head(num_batches_valid*m)
y_valid_trun = y_valid[:num_batches_valid*m,:]
print(X_valid_trun.shape, y_valid_trun.shape)

num_batches_test = X_test.shape[0]//m
X_test_trun = X_test.head(num_batches_test*m)
y_test_trun = y_test[:num_batches_test*m,:]
print(X_test_trun.shape, y_test_trun.shape)

522
(26100, 11) (26100, 29)
(2900, 11) (2900, 29)
(7250, 11) (7250, 29)


<H2> I will Build next a CVAE</H2>