In [1]:
import sys
# NOTE(review): machine-specific absolute path -- the notebook only runs where this
# directory exists. Presumably it contains trading_util.py; confirm, and consider a
# configurable or relative path instead.
sys.path.append(r'C:\Users\Jameshuckle\Dropbox\My-Portfolio\AlgorithmicTrading\utils')
# Project-local helpers. In the cells visible here only download_data_local_check,
# prep_stock_data and prep_fx_data are actually called.
from trading_util import (download_data_local_check, prep_stock_data, prep_fx_data, calc_sharpe, calc_romad)

import numpy as np
import pandas as pd
import os
import pickle
from sklearn.preprocessing import StandardScaler
import mplfinance as mpf

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model

print('yup')

yup


In [2]:
# Which market the candlestick corpus is built from.
data_source = 'fx' # 'fx', 'stock'

if data_source == 'fx':
    ### FX data #######
    # Hourly FX history files, one per currency pair.
    fx_files = [
                 'EURUSD_1h_2003-2020.csv',
                 'USDJPY_1h_2003-2020.csv',
                 'NZDUSD_1h_2003-2020.csv',
                 'AUDUSD_1h_2003-2020.csv',
                 'USDCAD_1h_2003-2020.csv',
                 ]

    # Later cells iterate loaded_files.items() and resample each value, so this is
    # presumably a mapping of name -> OHLC DataFrame with a datetime index (verify
    # against trading_util).
    loaded_files = prep_fx_data(fx_files)

elif data_source == 'stock':
    ### stock data ######
    start = '2000-01-01'
    end = '2020-04-28'
    ## download data
    all_stock_data = download_data_local_check('SP500', start, end)
    loaded_files = prep_stock_data(all_stock_data, filter_start_date_tuple=None) #(2015,1,1)

else:
    # Fail here rather than with a NameError on loaded_files several cells later.
    raise ValueError(f"data_source must be 'fx' or 'stock', got {data_source!r}")

EURUSD_1h_2003-2010.csv
EURUSD_1h_2010-2020.csv
USDJPY_1h_2003-2010.csv
USDJPY_1h_2010-2020.csv
NZDUSD_1h_2003-2020.csv
AUDUSD_1h_2003-2020.csv
USDCAD_1h_2003-2020.csv


#### GloVe candlestick embeddings

In [3]:
def open_diff(numpy_ohlc):
    """Express each candle as relative moves.

    Column 0 becomes the gap between this candle's open and the previous
    candle's close, relative to that previous close (the first row stays 0).
    Columns 1-3 (high, low, close) become their distance from the same
    candle's open, relative to that open.

    Parameters
    ----------
    numpy_ohlc : 2-D numpy array whose columns 0..3 are open, high, low, close.

    Returns
    -------
    numpy array of the same shape as the input, filled with the relative moves.
    """
    hlc = [1, 2, 3]
    opens = numpy_ohlc[:, 0]
    previous_close = numpy_ohlc[:-1, 3]

    result = np.zeros(numpy_ohlc.shape)
    # gap: open vs previous close
    result[1:, 0] = (opens[1:] - previous_close) / previous_close
    # intra-candle moves: high, low, close vs open (open broadcast across columns)
    open_column = opens[:, None]
    result[:, hlc] = (numpy_ohlc[:, hlc] - open_column) / open_column
    return result

def remove_dates(raw_data):
    """Split a frame into (frame with a fresh 0..n-1 index, its original index).

    The input frame itself is left untouched; ``reset_index`` returns a new one.
    """
    return raw_data.reset_index(drop=True), raw_data.index

def diff(raw_data):
    """Run ``open_diff`` on an OHLC array and label the four result columns.

    Returns a DataFrame with columns open_diff, high_diff, low_diff, close_diff.
    """
    column_names = ['open_diff','high_diff','low_diff','close_diff']
    return pd.DataFrame(open_diff(raw_data), columns=column_names)

def scale(diff_data, train=True):
    """Standardise the four diff columns with a StandardScaler.

    With ``train`` truthy a new scaler is fitted on ``diff_data`` and stored in
    the module-level ``data_scaler`` (replacing any previous one). Otherwise the
    scaler already held in ``data_scaler`` is applied without refitting.

    Returns a DataFrame with columns open_scale, high_scale, low_scale, close_scale.
    """
    global data_scaler
    if train:
        data_scaler = StandardScaler()
        standardised = data_scaler.fit_transform(diff_data)
    else:
        standardised = data_scaler.transform(diff_data)
    scaled_names = ['open_scale','high_scale','low_scale','close_scale']
    return pd.DataFrame(standardised, columns=scaled_names)

def scale_bins(scale_data, num_bins=5):
    """Discretise the scaled OHLC columns and build a per-candle label string.

    For each of open_scale, high_scale, low_scale and close_scale this adds
    ``<col>_bins`` (the bin number as a string) and ``<col>_bins_label`` (the
    interval the value fell into), plus ``label``: the four bin numbers
    concatenated in open/high/low/close order.

    Parameters
    ----------
    scale_data : DataFrame containing the four ``*_scale`` columns. It is not
        modified; the extra columns are added to a copy.
    num_bins : forwarded to ``pd.cut`` as its ``bins`` argument (an int or a
        sequence of bin edges).

    Returns
    -------
    A new DataFrame: the input columns followed by the bin, interval and label
    columns.

    Notes
    -----
    Bin numbers are joined without a separator, so labels are only unambiguous
    while every bin number is a single digit (at most 10 bins).
    Values that fall outside the bins (or NaN) make the int conversion raise.
    """
    cols = ['open_scale','high_scale','low_scale','close_scale']
    # Work on a copy so the caller's frame does not silently grow extra columns.
    binned = scale_data.copy()

    for col in cols:
        binned[f'{col}_bins'] = pd.cut(binned[col], num_bins, labels=False)
    for col in cols:
        binned[f'{col}_bins_label'] = pd.cut(binned[col], num_bins)

    bin_cols = [f'{col}_bins' for col in cols]
    for bin_col in bin_cols:
        # int first so a float-typed code such as 3.0 becomes '3', not '3.0'
        binned[bin_col] = binned[bin_col].astype(int).astype(str)
    binned['label'] = binned[bin_cols].agg(''.join, axis=1)
    return binned

def create_candlestick_corpus(raw_data, train=True, pandas_with_dates=True):
    """Attach a candlestick label to every row of one OHLC series.

    Pipeline: relative diffs -> standard scaling -> fixed-edge binning -> label.

    Parameters
    ----------
    raw_data : OHLC data. With ``pandas_with_dates`` it is a DataFrame whose
        index is set aside and restored on the result; otherwise it is passed
        straight to ``diff`` and then wrapped in a DataFrame with columns
        Open/High/Low/Close.
    train : forwarded to ``scale`` (fit a new scaler vs reuse the stored one).
    pandas_with_dates : selects the DataFrame path (True) or the array path.

    Returns
    -------
    The OHLC columns plus ``label``: a DataFrame carrying the original index
    when ``pandas_with_dates`` is true, otherwise a numpy array.
    """
    # Bin edges applied to the standardised values of each of the four columns.
    bin_edges = [-np.inf, -1.5, -1, -0.6, -0.1, 0.1, 0.6, 1, 1.5, np.inf]

    if pandas_with_dates:
        raw_data, dates = remove_dates(raw_data)
        diff_data = diff(raw_data.to_numpy())
    else:
        diff_data = diff(raw_data)
        raw_data = pd.DataFrame(raw_data)
        raw_data.columns = ['Open','High','Low','Close']

    scaled = scale(diff_data, train=train)
    labels = scale_bins(scaled, num_bins=bin_edges)['label']
    data = pd.concat([raw_data, labels], axis=1)

    if pandas_with_dates:
        data.index = dates
        return data
    return data.to_numpy()

def create_candlestick_corpus_all():
    """Build one labelled candlestick corpus from every entry of ``loaded_files``.

    Reads the module-level ``loaded_files`` (iterated with ``.items()``; each
    value must offer Open/High/Low/Close columns and support ``resample``).
    Each instrument is resampled to daily candles, empty days are dropped, and
    the candles are diffed and standardised per instrument. The instruments are
    then stacked in iteration order and binned together.

    Side effect: ``scale(..., train=True)`` runs once per instrument, so the
    module-level ``data_scaler`` ends up holding the scaler fitted on the last
    instrument.

    Returns
    -------
    DataFrame with columns Open, High, Low, Close, label and a unique 0..n-1
    index covering all instruments.
    """
    daily_ohlc = {'Open':'first','High':'max','Low':'min','Close':'last'}
    all_raw_data = []
    all_scale_data = []
    for file, data in loaded_files.items():
        raw_data = data[['Open','High','Low','Close']].resample('1D').agg(daily_ohlc).dropna()
        raw_data = raw_data.reset_index(drop=True)
        diff_data = diff(raw_data.to_numpy())
        all_raw_data.append(raw_data)
        all_scale_data.append(scale(diff_data, train=True))

    # ignore_index: each per-instrument frame is indexed 0..n, so a plain concat
    # would repeat index labels and the column-wise concat below would depend on
    # those duplicates lining up. Both stacks get the same fresh RangeIndex.
    raw_data = pd.concat(all_raw_data, axis=0, ignore_index=True)
    scale_data = pd.concat(all_scale_data, axis=0, ignore_index=True)
    scale_data_bins = scale_bins(scale_data, num_bins=[-np.inf, -1.5, -1, -0.6, -0.1, 0.1, 0.6, 1, 1.5, np.inf])
    data = pd.concat([raw_data, scale_data_bins['label']], axis=1)
    return data

def plot_candlestick_types(candle_one_str, candle_two_str, num_candles):
    """Plot the first ``num_candles`` examples of two candlestick labels.

    Reads the module-level ``all_data`` frame, which must provide a ``label``
    column and the open_diff/high_diff/low_diff/close_diff columns.
    NOTE(review): ``all_data`` is not assigned in any cell visible here (the
    only assignment is commented out), so it has to be defined before calling.

    Parameters
    ----------
    candle_one_str, candle_two_str : label strings to look up (e.g. '4444').
    num_candles : maximum number of examples taken per label.
    """
    style = mpf.make_mpf_style(base_mpf_style='yahoo', rc={'font.size':20})
    candle_one_filter = all_data.query('label == @candle_one_str').head(num_candles)
    candle_two_filter = all_data.query('label == @candle_two_str').head(num_candles)
    print('there are',len(candle_one_filter), 'candle_one')
    print('there are',len(candle_two_filter), 'candle_two')
    filtered = pd.concat([candle_one_filter, candle_two_filter], axis= 0)[['open_diff','high_diff','low_diff','close_diff']]
    filtered.columns = ['Open','High','Low','Close']
    # high/low/close diffs are relative to the open; add it back so the four
    # values sit on one common scale for drawing
    filtered[['High','Low','Close']] = (filtered[['High','Low','Close']].T + filtered['Open']).T
    # Pass the style that was built above; previously it was created and discarded.
    mpf.plot(filtered, type='candle', style=style, figscale=2)

# Earlier single-series experiments, kept for reference but disabled.
# NOTE(review): all_data_steps is not defined in the cells visible here.
# #all_data = all_data_steps(raw_data, train=True)
# #plot_candlestick_types(candle_one_str='4444', candle_two_str='8868', num_candles=10)
# corpus_data = create_candlestick_corpus(raw_data, train=True, pandas_with_dates=True)

# Build the labelled corpus across every loaded instrument and preview it.
corpus_data = create_candlestick_corpus_all()
corpus_data.head()

Unnamed: 0,Open,High,Low,Close,label
0,1.12284,1.12338,1.1216,1.12169,4263
1,1.12161,1.13009,1.12014,1.12924,4667
2,1.12921,1.14506,1.12723,1.14234,4858
3,1.14218,1.14323,1.13265,1.13494,3211
4,1.13507,1.15077,1.13006,1.1482,5848


In [4]:
corpus_data.shape

(25858, 5)

In [5]:
# The corpus is the sequence of candlestick label strings ("words").
corpus = corpus_data['label'].to_numpy()
# Sort the vocabulary before numbering it: iterating a bare set of strings yields a
# different order in every kernel session (string hashing is randomised), which
# would make the word -> id mapping, and everything trained on it, unrepeatable.
words_to_index = {word: idx for idx, word in enumerate(sorted(set(corpus)))}
# The same corpus expressed as integer ids.
corpus_ids = [words_to_index[word] for word in corpus]

In [6]:
def asymetric_window_co_occurrence_matrix(window=10, ids=None):
    """Accumulate position-weighted co-occurrence counts over a sliding window.

    Every run of ``window`` consecutive ids is visited. The last id of the run
    is the target; each earlier id at position p (0-based) contributes a weight
    of (p + 1) / window, so candles closer to the target count more. The weight
    is added to both ``result[target][candle]`` and ``result[candle][target]``
    (a candle equal to the target therefore receives it twice).

    Parameters
    ----------
    window : number of ids per window, target included. Must be >= 1.
    ids : sequence of integer word ids. Defaults to the module-level
        ``corpus_ids`` so existing calls keep working.

    Returns
    -------
    dict mapping id -> {other id -> accumulated weight}.

    Raises
    ------
    ValueError if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')
    if ids is None:
        ids = corpus_ids

    co_occur = {}
    # +1 so the final full window is included; without it the last id of the
    # corpus never takes part in any window.
    for start in range(len(ids) - window + 1):
        candles_window = ids[start: start + window]
        target_word = candles_window[-1]
        target_row = co_occur.setdefault(target_word, {})
        for score, candle in enumerate(candles_window[:-1]):
            # Normalise by the actual window size rather than a fixed 10.
            weight = (score + 1) / window
            candle_row = co_occur.setdefault(candle, {})
            target_row[candle] = target_row.get(candle, 0) + weight
            candle_row[target_word] = candle_row.get(target_word, 0) + weight
    return co_occur

# Weighted co-occurrence counts over the module-level corpus_ids, 10 candles per window.
co_occur = asymetric_window_co_occurrence_matrix(window=10)

In [7]:
# co_occur_matrix = pd.DataFrame(co_occur)
# co_occur_matrix = co_occur_matrix.loc[co_occur_matrix.columns]
# co_occur_matrix.fillna(0, inplace=True)

In [8]:
def create_glove_model(vocab_size, vector_size):
    """Build a two-input Keras model that scores a pair of word ids.

    The output is dot(vector_i, vector_j) + bias_i + bias_j, where the two
    words use separate embedding tables for both their vectors and their biases.

    Parameters
    ----------
    vocab_size : number of rows in each embedding table.
    vector_size : width of the word-vector embeddings (biases have width 1).

    Returns
    -------
    An uncompiled ``Model`` taking ``[w_i, w_j]`` and returning the score.

    NOTE(review): a later cell reads weights via ``model.layers[2]`` and
    ``model.layers[3]``, so the order in which layers are created and wired
    here should not be changed casually.
    """
    # One integer id per sample for each side of the pair.
    w_i = layers.Input(shape=(1,))
    w_j = layers.Input(shape=(1,))

    # Separate vector tables for the two inputs; Flatten removes the length-1 axis.
    emb_i = layers.Flatten()(layers.Embedding(vocab_size, vector_size, input_length=1)(w_i))
    emb_j = layers.Flatten()(layers.Embedding(vocab_size, vector_size, input_length=1)(w_j))

    ij_dot = layers.Dot(axes=-1)([emb_i, emb_j])
    
    # Per-word scalar biases, implemented as width-1 embeddings.
    b_i = layers.Flatten()(layers.Embedding(vocab_size, 1, input_length=1)(w_i))
    b_j = layers.Flatten()(layers.Embedding(vocab_size, 1, input_length=1)(w_j))

    pred = layers.Add()([ij_dot, b_i, b_j])

    model = Model(inputs=[w_i, w_j], outputs=pred)
    return model
    
def glove_loss(y_true, y_pred):
    """GloVe weighted least-squares loss, summed over the batch.

    loss = sum( f(x) * (y_pred - log(x))^2 ), with x = y_true (the co-occurrence
    value) and the GloVe weighting f(x) = (min(x, x_max) / x_max) ** alpha, i.e.
    rare pairs are down-weighted and the weight is capped at 1 from x_max on.

    Parameters
    ----------
    y_true : tensor of co-occurrence values; must be strictly positive because
        its logarithm is taken.
    y_pred : tensor of model outputs (dot product plus the two biases).

    Returns
    -------
    Scalar tensor: the summed weighted squared error.
    """
    alpha = 0.75
    x_max = 100
    # minimum, not maximum: the ratio has to saturate at 1 for frequent pairs.
    # With maximum the weight was >= 1 everywhere and grew without bound.
    f_x = K.pow(K.minimum(y_true, x_max) / x_max, alpha)
    loss = f_x * K.square(y_pred - K.log(y_true))
    return K.sum(loss)

In [9]:
def create_input(co_occur):
    """Flatten a nested co-occurrence dict into three parallel training arrays.

    Each stored entry ``co_occur[a][b] = x`` is emitted twice, back to back:
    once as (a, b, x) and once as (b, a, x).

    Returns
    -------
    (first ids, second ids, co-occurrence values) as numpy arrays of equal
    length: two rows per stored entry.
    """
    first, second, x_ijs = [], [], []

    for main_id, context_row in co_occur.items():
        for context_id, x_ij in context_row.items():
            # (main, context) followed by (context, main)
            first.extend((main_id, context_id))
            second.extend((context_id, main_id))
            x_ijs.extend((x_ij, x_ij))

    return np.array(first), np.array(second), np.array(x_ijs)

In [10]:
# Parallel arrays for training: the two word ids of each pair and its co-occurrence value.
first_indices, second_indices, frequencies = create_input(co_occur)

In [11]:
# Width of each candlestick embedding vector (also used in the output file name).
vector_size = 4
# Size the embedding tables by the whole vocabulary. The ids fed to the model come
# from words_to_index, and a word that never lands in a co-occurrence window has no
# key in co_occur, so len(co_occur) can be smaller than the largest id + 1.
model = create_glove_model(vocab_size=len(words_to_index), vector_size=vector_size)

In [26]:
# NOTE(review): the learning rate is very small; presumably chosen to keep the summed
# (not averaged) loss stable -- confirm before reusing.
adam = tf.keras.optimizers.Adam(learning_rate=1e-6)
# Train against the custom GloVe objective defined in glove_loss.
model.compile(loss=glove_loss, optimizer=adam)

In [27]:
# Inputs are the two id arrays; the target is the co-occurrence value of each pair.
model.fit([first_indices, second_indices], frequencies, epochs=20, batch_size=1000)

Train on 214770 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x20ea33d0d08>

In [24]:
# Final vector per word = sum of the weights of layers 2 and 3.
# NOTE(review): presumably these are the two vector Embedding layers built in
# create_glove_model; the lookup is positional, so verify with model.summary().
embeddings = model.layers[2].get_weights()[0] + model.layers[3].get_weights()[0]
# Pair each label string with its row. zip stops at the shorter input, so the table
# must have at least len(words_to_index) rows for every word to receive a vector.
candlestick_embeddings = dict(zip(list(words_to_index.keys()), embeddings))
#candlestick_embeddings

In [25]:
# Persist the label -> vector dict in the working directory; vector_size is part of
# the file name so runs with different widths do not overwrite each other.
with open(f'candlestick_embeddings_{vector_size}.pkl','wb') as f:
    pickle.dump(candlestick_embeddings, f)