In [1]:
import numpy as np
import pandas as pd
from glob import glob
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
from pandarallel import pandarallel
import matplotlib.dates as mdates
import time
pandarallel.initialize()
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from keras.layers import Flatten, Dense, Conv1D, MaxPooling1D
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input, Flatten, Dense, Conv2D, BatchNormalization, LeakyReLU, Dropout, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K 

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from tensorflow.compat.v1 import ConfigProto, InteractiveSession

# Report how many GPUs TensorFlow can see.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Open a TF1-compat interactive session with the allow_growth GPU option set
# (per the TensorFlow docs this allocates GPU memory on demand instead of
# reserving it all up front).
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)


Num GPUs Available:  1


In [3]:
# Load the pickled price DataFrame.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
PRICE_PICKLE_PATH = '../MSFT_2015_2020.pickle'
df = pd.read_pickle(PRICE_PICKLE_PATH)

In [22]:
# One zero-padded 'HH:MM:SS' label per second for hours 09 through 15
times = [
    '{:02d}:{:02d}:{:02d}'.format(hour, minute, second)
    for hour in range(9, 16)
    for minute in range(60)
    for second in range(60)
]
# Drop the first 1801 labels (09:00:00 .. 09:30:00) so the grid starts at 09:30:01
times = times[1801:]

In [24]:
# Calendar days whose daily median row has no NaN, as 'YYYY-MM-DD' strings
daily_median = df.resample('1D').median()
trading_day_index = daily_median.dropna().index
days = trading_day_index.strftime('%Y-%m-%d').values

In [None]:
def _add_log_returns(frame):
    """Add 'logprice', 'deltalog' (x100) and 'deltalog2' columns derived from PRICE."""
    frame['logprice'] = np.log(frame['PRICE'])
    # Log-price change between consecutive rows, scaled to percent
    frame['deltalog'] = frame['logprice'].diff() * 100
    frame['deltalog2'] = frame['deltalog'] ** 2
    return frame


def make_regular_df(df, time_grid=None):
    """Make a regularly spaced dataframe from an irregular one-day dataframe of prices.

    Parameters
    ----------
    df : pandas.DataFrame
        One day of data with a datetime-like index, in chronological order,
        and a ``PRICE`` column. The caller's frame is left untouched.
    time_grid : sequence of str, optional
        'HH:MM:SS' labels of the regular grid. Defaults to the module-level
        ``times`` list.

    Returns
    -------
    (regular_df, irregular_df) : tuple of pandas.DataFrame
        ``regular_df`` has one row per grid label; where there is no trade the
        previous observation is carried forward (rows before the first
        available observation stay NaN). ``irregular_df`` keeps every original
        row, re-indexed by its 'HH:MM:SS' time. Both get the extra columns
        ``logprice``, ``deltalog`` and ``deltalog2``.
    """
    if time_grid is None:
        time_grid = times

    # Work on a copy: the input is typically a slice of a larger frame and
    # writing a new column into it would mutate (or warn about) the parent.
    irregular_df = df.copy()
    # Make time index
    irregular_df.index = pd.Index(
        pd.DatetimeIndex(irregular_df.index).strftime('%H:%M:%S'), name='time'
    )

    # Several trades can share one second; reindex needs unique labels, so
    # keep the last trade of each second as that second's value.
    last_per_second = irregular_df[~irregular_df.index.duplicated(keep='last')]

    # Forward-fill on the union of observed and grid labels so that the last
    # trade before the first grid label seeds the start of the grid, then
    # keep only the grid labels.
    combined_index = last_per_second.index.union(pd.Index(time_grid))
    regular_df = last_per_second.reindex(combined_index).ffill().reindex(time_grid)

    regular_df = _add_log_returns(regular_df)
    irregular_df = _add_log_returns(irregular_df)
    return regular_df, irregular_df

def worker(day, margin=1000):
    """Worker to loop over days: load one day and return its regular frame.

    Parameters
    ----------
    day : str
        Day label used to index both '../days.h5' and '../data.h5'.
    margin : int, optional
        Extra rows taken on each side of the estimated row window, in case
        the per-day counts do not line up exactly with the data file.

    Returns
    -------
    pandas.DataFrame
        The regular frame produced by ``make_regular_df`` for that day.
    """
    daily_counts = pd.read_hdf('../days.h5')
    # int(): iloc slice bounds must be integers, the stored count may not be
    counts = int(daily_counts.loc[day].iloc[0])
    # to be faster: find where we are approximately
    rows_through_day = int(daily_counts.loc[:day].sum().iloc[0])
    start = max(rows_through_day - counts - margin, 0)
    # Pad the end as well as the start; .loc[day] below trims to the exact day
    stop = rows_through_day + margin
    # NOTE(review): this reads the whole file on every call; if the store is in
    # table format, read_hdf(start=..., stop=...) could load only the window.
    hdf_df = pd.read_hdf('../data.h5')
    oneday = hdf_df.iloc[start:stop].loc[day]
    regular_df, _ = make_regular_df(oneday)
    return regular_df

# Build one regular frame per day in parallel worker processes
reg_dfs = Parallel(n_jobs=23)(delayed(worker)(i) for i in days)
# make_regular_df names the log-price column 'logprice' (lowercase);
# stack one row of log prices per day
X = np.array([w['logprice'].values for w in reg_dfs])

In [None]:
# Reshape to (n_days, 1, 1, 23399) to match the model's Input((1,1,23399))
n_days = X.shape[0]
X = X.reshape(n_days, 1, 1, 23399)

In [None]:
# Build output data (RVOL): square root of the daily sum of squared 5-minute
# log changes of the median price, scaled by 100 * sqrt(252)
five_min_price = df[['PRICE']].resample('5min').median()
five_min_logret = np.log(five_min_price).diff()
RV = np.sqrt((five_min_logret**2).resample('1D').sum()) * 100 * np.sqrt(252)
RV = RV[RV.PRICE>0]
RV.columns = ['RealVol']
# `days` holds 'YYYY-MM-DD' strings; convert them so the lookup against the
# datetime index does not depend on implicit string coercion
Y = RV.reindex(pd.to_datetime(days)).values.flatten()

In [None]:
# Hold out 33% of the samples for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [None]:
# NOTE(review): with Keras' default channels_last layout this input is a 1x1
# "image" with 23399 channels, so the 3x3 kernels never slide along time -
# confirm this is the intended layout.
input_layer = Input((1,1,23399))

# Four identical Conv2D -> BatchNorm -> LeakyReLU stages; only the filter
# count and the stride differ between them.
x = input_layer
for n_filters, stride in [(128, 1), (128, 2), (256, 1), (256, 2)]:
    x = Conv2D(filters=n_filters, kernel_size=3, strides=stride, padding='same')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU()(x)

x = Flatten()(x)

# Dense head with dropout
x = Dense(128)(x)
x = BatchNormalization()(x)
x = LeakyReLU()(x)
x = Dropout(rate = 0.5)(x)

# Single non-negative output.
# NOTE(review): a ReLU on the regression output has zero gradient whenever
# the pre-activation is negative - consider 'softplus' if training stalls at 0.
x = Dense(1)(x)
output_layer = Activation('relu')(x)

model = Model(input_layer, output_layer)

# `learning_rate` replaces the deprecated `lr` keyword
opt = Adam(learning_rate=0.0005)
model.compile(loss='mse', optimizer=opt, metrics=['MeanSquaredError'])

# The held-out split is used as validation data during training
model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=100,
    shuffle=True,
    validation_data=(X_test, y_test),
)

In [None]:
# Predicted vs. actual targets for both splits on one set of axes.
# model.predict runs in batches, so the whole split is not pushed through
# the network in a single forward pass.
fig, ax = plt.subplots()
ax.scatter(model.predict(X_train, verbose=0).flatten(), y_train, label='train')
ax.scatter(model.predict(X_test, verbose=0).flatten(), y_test, label='test')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.legend();