In [2]:

import pandas as pd
import numpy as np # linear algebra
import glob
import os
import gc
import pickle

from joblib import Parallel, delayed

from sklearn import model_selection
import lightgbm as lgb

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
#%matplotlib inline
import plotly.express as px
from IPython.display import display
import seaborn as sns

from scipy.stats import binned_statistic
from scipy.stats import kurtosis, skew

from numba import jit


# Root folder holding the `book_{dset}.parquet` / `trade_{dset}.parquet` directories.
# Set the OPTIVER_DATA_DIR environment variable to point at another copy of the data;
# when it is unset the original author's local path is used, so existing runs are unaffected.
data_dir = os.environ.get(
    'OPTIVER_DATA_DIR',
    '/home/optimusprime/Desktop/peeterson/optiver/Optiver-Realized-Volatility-Prediction/data',
)




In [3]:
# ## analyzed effect of log transformation on the skew and kurtosis and mean.
# book_data = pd.read_parquet(os.path.join(data_dir, 'book_{}.parquet/stock_id={}/'.format('train', 13)))
# book_data['log_bid_price1'] = book_data['bid_price1'].apply(np.log)
# book_data.hist(column=['bid_price1','log_bid_price1'], bins=100, figsize=(20,10))

# print(book_data['bid_price1'].describe())
# print(book_data['log_bid_price1'].describe())
# print(kurtosis(book_data['bid_price1']), skew(book_data['bid_price1'] ))
# print(kurtosis(book_data['log_bid_price1']), skew(book_data['log_bid_price1'] ))

In [4]:

def find_equilibrium_price(book_data, lvl, iterations=22):
    """Bisect, row by row, for a log price between the best bid and the best ask.

    For a candidate price ``p`` every quoted level contributes
    ``size / (1000 * (p - log_price)) ** (lvl + 1)``.  The two bid levels and the two
    ask levels are combined (the ask group is multiplied by ``(-1) ** lvl``) and the
    sign of the negated total decides which half of the current interval is kept.

    Args:
        book_data: Column container (e.g. a DataFrame) providing ``log_ask_price1/2``,
            ``log_bid_price1/2``, ``ask_size1/2`` and ``bid_size1/2``.
        lvl: Non-negative integer; the exponent applied to each inverse distance is
            ``lvl + 1``.
        iterations: Number of bisection steps; each step halves the search interval.

    Returns:
        NumPy array with one value per row: the midpoint of the final interval.
    """
    log_ask2 = np.array(book_data['log_ask_price2'])
    log_ask1 = np.array(book_data['log_ask_price1'])
    log_bid1 = np.array(book_data['log_bid_price1'])
    log_bid2 = np.array(book_data['log_bid_price2'])

    ask_size2 = np.array(book_data['ask_size2'])
    ask_size1 = np.array(book_data['ask_size1'])
    bid_size1 = np.array(book_data['bid_size1'])
    bid_size2 = np.array(book_data['bid_size2'])

    power = lvl + 1
    ask_sign = (-1)**lvl

    # The search starts on the whole best-bid / best-ask interval.
    upper, lower = log_ask1, log_bid1

    for _ in range(iterations):
        candidate = (upper + lower)/2.0

        # Inverse (scaled) distance from the candidate to each quoted level.
        # While the candidate lies between best bid and best ask, the ask-side
        # values are negative and the bid-side values are positive.
        pull_ask2 = 1.0/( 1000*( candidate - log_ask2 ) )
        pull_ask1 = 1.0/( 1000*( candidate - log_ask1 ) )
        pull_bid1 = 1.0/( 1000*( candidate - log_bid1 ) )
        pull_bid2 = 1.0/( 1000*( candidate - log_bid2 ) )

        bid_term = bid_size2*pull_bid2**power + bid_size1*pull_bid1**power
        ask_term = ask_size1*pull_ask1**power + ask_size2*pull_ask2**power
        imbalance = -( bid_term + ask_sign*ask_term )

        # imbalance >= 0: keep the lower half (upper bound drops to the candidate);
        # imbalance <  0: keep the upper half (lower bound rises to the candidate).
        half_width = (upper - lower)/2.0
        upper = upper + (-half_width)*(imbalance >= 0)
        lower = lower + half_width*(imbalance < 0)

    return (upper + lower)/2.0
   

In [5]:
def diff(list_stock_prices):
    """Return the first difference (each element minus the one before it).

    Thin wrapper around the pandas ``diff`` method of the object passed in, so it can
    be handed to ``groupby(...).apply``; the first element has no predecessor and
    comes back as NaN.
    """
    first_difference = list_stock_prices.diff(periods=1)
    return first_difference

In [6]:
@jit()
def bucketized_summed_data(seconds_arr, time_id, data, buk_width, n_buks, time_ids_size):
    """Sum ``data`` into fixed-width time buckets, one output row per run of equal time ids.

    Args:
        seconds_arr: Timestamp of every input row; the bucket column is
            ``int(seconds // buk_width)``.
        time_id: Time id of every input row.  The output row advances by one each time
            the value differs from the previous row, so rows of one time id are
            expected to be contiguous.
        data: Value of every input row.
        buk_width: Bucket width, in the same unit as ``seconds_arr``.
        n_buks: Number of bucket columns in the output.
        time_ids_size: Number of output rows.

    Returns:
        Float array of shape ``(time_ids_size, n_buks)`` holding the per-bucket sums;
        buckets that receive no input row stay 0.
    """
    sums = np.zeros( (time_ids_size, n_buks) )

    row = 0
    n_rows = seconds_arr.shape[0]
    for i in range(n_rows):
        # Compare with the previous input row (row 0 is compared with itself).
        prev = i - 1 if i > 0 else 0
        if time_id[i] != time_id[prev]:
            row += 1

        col = int(seconds_arr[i]//buk_width)
        sums[row, col] += data[i]

    return sums



  @jit()


In [7]:
@jit(nopython=True)
def end_bucket(buk_width, buk_sum:float, buk_weight:float, last_val, last_weight, last_time)->float: 
    """Close a bucket and return its weighted average.

    The last observation (``last_val`` with ``last_weight``, seen at ``last_time``) is
    extended up to the end of its bucket, added to the running ``buk_sum`` /
    ``buk_weight`` totals, and the ratio of the two totals is returned.  A small
    constant (1e-8) in the denominator keeps the division defined when the total
    weight is zero.
    """
    # Time left between the last observation and the end of its bucket.
    remaining = buk_width - last_time%buk_width

    tail_weight  = 1.0*last_weight*remaining
    total_weight = buk_weight + tail_weight
    total_sum    = buk_sum + tail_weight*last_val

    return float(total_sum/(total_weight + 1e-8))

In [8]:
@jit()
def bucketized_time_weighted_avg_data(seconds_arr, time_id_arr, data,weights, buk_width, n_buks, time_ids_size):
    """Time-weighted average of ``data`` per fixed-width bucket, one output row per run of equal time ids.

    Every input row is treated as holding its value from its own timestamp until the
    next row arrives: when a new row is read, the previous value is accumulated with
    weight ``prev_weight * elapsed_time``.  A bucket is closed with ``end_bucket``,
    which extends the last value to the end of that bucket.  Buckets in which no row
    arrives are filled with the last value seen (not a weighted average).

    Args:
        seconds_arr: Timestamp of every input row.
        time_id_arr: Time id of every input row; the output row advances by one each
            time the value differs from the previous row, so rows of one time id are
            expected to be contiguous.
        data: Value of every input row.
        weights: Weight of every input row (multiplied by the holding time).
        buk_width: Bucket width, in the same unit as ``seconds_arr``.
        n_buks: Number of bucket columns in the output.
        time_ids_size: Number of output rows.

    Returns:
        Float array of shape ``(time_ids_size, n_buks)``.
    """

    z = np.zeros( (time_ids_size,n_buks) )

    # State of the most recent input row (the value currently "in force").
    prev_time   = 0
    prev_weight = 0.0
    prev_val    = 0.0

    # Running weighted sum / total weight of the bucket currently being filled.
    buk_sum = 0.0
    buk_weight = 0.0

    t_id  = 0  # output row (index of the current time id)
    buk = 0  # output column (index of the current bucket)
    for idx in range(seconds_arr.shape[0]): # one iteration per input row

        if time_id_arr[idx] != time_id_arr[max(idx-1,0)]: # transition to new time id
            # Close the open bucket of the previous time id ...
            z[t_id, buk] = float(end_bucket(buk_width, buk_sum, buk_weight, prev_val, prev_weight, prev_time))
            buk += 1

            # ... and fill its remaining buckets with the last value seen.
            while buk < z.shape[1]:
                z[t_id, buk] = prev_val
                buk += 1
            t_id += 1
            buk = 0

            # NOTE(review): prev_val / prev_weight are NOT reset here, so if the first row
            # of the new time id is not at second 0, the gap before it is filled with the
            # previous time id's last value -- confirm every time id starts at second 0.
            prev_time  = 0
            buk_sum    = 0.0
            buk_weight = 0.0

        if int(seconds_arr[idx]//buk_width) != int(prev_time//buk_width): # transition to new bucket

            z[t_id, buk] = float(end_bucket(buk_width, buk_sum, buk_weight, prev_val, prev_weight, prev_time)) # end the previous bucket
            buk += 1 # move to next bucket

            # Buckets skipped entirely (no row arrived) take the last value seen.
            while buk < seconds_arr[idx]//buk_width:
                z[t_id, buk] = prev_val
                buk += 1

            # Restart accumulation at the start of the bucket the current row falls in, so
            # the carried-over value is weighted from the bucket start up to this row.
            prev_time  = buk_width*(seconds_arr[idx]//buk_width)
            buk_sum    = 0.0
            buk_weight = 0.0

        # Accumulate the previous value over the time it was in force inside this bucket.
        buk_sum    += prev_val*prev_weight*(seconds_arr[idx] - prev_time)  # in the same bucket
        buk_weight +=          prev_weight*(seconds_arr[idx] - prev_time)  # in the same bucket

        # The current row becomes the value in force from now on.
        prev_time   = seconds_arr[idx] # in the same bucket
        prev_val    = data[idx] # in the same bucket
        prev_weight = weights[idx] # in the same bucket

    # Close the bucket that was still open when the input ended.
    z[t_id, buk] = end_bucket(buk_width, buk_sum, buk_weight, prev_val, prev_weight, prev_time)

    for buk in range(buk+1, z.shape[1]): # all remaining buckets of the last time id
        z[t_id, buk] = prev_val

    return z

  @jit()


In [9]:
def _per_time_id_change(book_data, column):
    """Row-to-row change of ``column`` inside each time_id, as a NumPy array in row order.

    The first row of every time_id (and any other undefined difference) is set to 0.
    """
    return book_data.groupby('time_id')[column].diff().fillna(0).values


def _add_book_row_features(book_data):
    """Add the per-snapshot price, spread, liquidity and return columns to ``book_data`` in place."""
    # Notional value quoted at each level (price * size).
    book_data['ask_volume1'] = book_data['ask_price1']*book_data['ask_size1']
    book_data['ask_volume2'] = book_data['ask_price2']*book_data['ask_size2']
    book_data['bid_volume1'] = book_data['bid_price1']*book_data['bid_size1']
    book_data['bid_volume2'] = book_data['bid_price2']*book_data['bid_size2']

    # Log prices: differences of these are log returns.
    book_data['log_ask_price1'] = np.log( book_data['ask_price1'] )
    book_data['log_ask_price2'] = np.log( book_data['ask_price2'] )
    book_data['log_bid_price1'] = np.log( book_data['bid_price1'] )
    book_data['log_bid_price2'] = np.log( book_data['bid_price2'] )

    # Weighted average price per level, computed on log prices
    # (each side's price is weighted by the size quoted on the opposite side).
    book_data['wap1_log_price'] = (
        ( book_data['log_bid_price1']*book_data['ask_size1'] + book_data['log_ask_price1']*book_data['bid_size1'] )
        / (book_data['bid_size1'] + book_data['ask_size1'])
    )
    book_data['wap2_log_price'] = (
        ( book_data['log_bid_price2']*book_data['ask_size2'] + book_data['log_ask_price2']*book_data['bid_size2'] )
        / (book_data['bid_size2'] + book_data['ask_size2'])
    )

    # Equilibrium prices found by bisection between best bid and best ask.
    book_data['wap_eqi_price0'] = find_equilibrium_price( book_data, lvl=0)
    book_data['wap_eqi_price1'] = find_equilibrium_price( book_data, lvl=1)

    def depth(volume_col, log_price_col):
        # Quoted notional divided by the (scaled) distance from the lvl=1 equilibrium price
        # to the quote.  The distance is signed, so ask-side values are negative whenever
        # the equilibrium price lies below the ask.
        return book_data[volume_col]/( 1000*(book_data['wap_eqi_price1'] - book_data[log_price_col]) )

    bid_depth1 = depth('bid_volume1', 'log_bid_price1')
    bid_depth2 = depth('bid_volume2', 'log_bid_price2')
    ask_depth1 = depth('ask_volume1', 'log_ask_price1')
    ask_depth2 = depth('ask_volume2', 'log_ask_price2')

    # Ask terms are subtracted because their signed distance is negative.
    book_data['liquidity1'] = bid_depth1 + bid_depth2 - ask_depth1 - ask_depth2
    # Level 1 minus level 2 distance-weighted notional, per side.
    book_data['ask_liq1_diff'] = ask_depth1 - ask_depth2
    book_data['bid_liq1_diff'] = bid_depth1 - bid_depth2

    book_data['spread']      = book_data['log_ask_price1'] - book_data['log_bid_price1']
    book_data['log_spread']  = np.log(book_data['spread'])
    book_data['log_spread2'] = np.log(book_data['log_ask_price2'] - book_data['log_bid_price2'])

    book_data['book_size1'] = book_data['ask_volume1'] + book_data['bid_volume1']
    book_data['book_size']  = book_data['ask_volume1'] + book_data['bid_volume1'] + book_data['ask_volume2'] + book_data['bid_volume2']

    # Log returns of the four price series, restarted at every time_id.
    book_data['wap1_log_price_ret'] = _per_time_id_change(book_data, 'wap1_log_price')
    book_data['wap2_log_price_ret'] = _per_time_id_change(book_data, 'wap2_log_price')
    book_data['wap_eqi_price0_ret'] = _per_time_id_change(book_data, 'wap_eqi_price0')
    book_data['wap_eqi_price1_ret'] = _per_time_id_change(book_data, 'wap_eqi_price1')

    # Level-2 WAP moves that happen while the level-1 WAP stays unchanged.
    book_data['wap2_log_price_ret_changes_n_wap1_log_price_ret_constant'] = (
        book_data['wap2_log_price_ret']*(book_data['wap1_log_price_ret'] == 0)
    )

    # First-order changes of log liquidity and log spread.
    book_data['log_liquidity1'] = np.log(book_data['liquidity1'])
    book_data['log_liquidity1_ret'] = _per_time_id_change(book_data, 'log_liquidity1')
    book_data['log_spread_ret']     = _per_time_id_change(book_data, 'log_spread')

    # WAP1 returns split by whether liquidity rose or fell on the same row.
    book_data['wap1_log_price_ret_pos_log_liq_ret'] = (book_data['log_liquidity1_ret'] > 0)*book_data['wap1_log_price_ret']
    book_data['wap1_log_price_ret_neg_log_liq_ret'] = (book_data['log_liquidity1_ret'] < 0)*book_data['wap1_log_price_ret']


def create_stock_data(st_id, dset):
    """Build the bucketized book and trade features of one stock.

    Reads ``book_{dset}.parquet/stock_id={st_id}/`` and
    ``trade_{dset}.parquet/stock_id={st_id}`` below the module-level ``data_dir``, derives
    per-row features, and aggregates them per time_id into 30 buckets of 20 seconds.

    Args:
        st_id: Stock id used in the parquet partition path.
        dset: Data set name used in the parquet directory name (e.g. ``'train'``).

    Returns:
        dict mapping feature name to a NumPy array.  ``'time_id'`` has shape
        ``(n_time_ids, 1)`` and lists the time ids found in the book data in ascending
        order; every other entry has shape ``(n_time_ids, 30)`` with rows in that same
        order.  Trade features are 0 for time ids that have no trades.
    """
    buk_width, n_buks = 20, 30  # 30 buckets of 20 seconds

    ############################## BOOK DATA ##########################################

    book_data = pd.read_parquet(os.path.join(data_dir, 'book_{}.parquet/stock_id={}/'.format(dset, st_id)))
    # The bucketizing helpers walk the rows once and start a new output row whenever the
    # time_id changes, so rows must be grouped by time_id and ordered in time.
    book_data = book_data.sort_values(['time_id', 'seconds_in_bucket']).reset_index(drop=True)
    book_data['st_id'] = st_id

    _add_book_row_features(book_data)

    ids = np.unique(np.array(book_data[['st_id', 'time_id']]), axis=0)  # sorted (st_id, time_id) pairs
    all_time_ids = ids[:, 1]
    n_time_ids = ids.shape[0]

    seconds = np.array(book_data['seconds_in_bucket'])
    time_ids = np.array(book_data['time_id'])
    unit_weights = np.ones(book_data.shape[0])

    def col(name):
        return np.array(book_data[name])

    def book_sum(values):
        # Per-bucket sum of a book-level row feature.
        return bucketized_summed_data(seconds, time_ids, values, buk_width, n_buks, n_time_ids)

    def book_twavg(values):
        # Per-bucket time-weighted average of a book-level row feature (unit weights).
        return bucketized_time_weighted_avg_data(seconds, time_ids, values, unit_weights,
                                                 buk_width, n_buks, n_time_ids)

    def amp_range(values):
        # exp(soft_max - soft_min), where soft_max / soft_min are log-mean-exp style
        # averages with scale +/-4000 that are dominated by the largest / smallest values.
        soft_max =  np.log( book_twavg(np.exp( 4000*values)) )/4000
        soft_min = -np.log( book_twavg(np.exp(-4000*values)) )/4000
        return np.exp(soft_max - soft_min)

    wap1_ret = col('wap1_log_price_ret')
    wap2_ret = col('wap2_log_price_ret')
    eqi0_ret = col('wap_eqi_price0_ret')
    eqi1_ret = col('wap_eqi_price1_ret')
    log_liq1_ret = col('log_liquidity1_ret')
    spread = col('spread')

    book_n_trade_data = {}
    book_n_trade_data['time_id'] = ids[:, 1:2]

    # Sum of absolute returns per bucket.
    book_n_trade_data['wap1_log_price_ret_abs_vol_buks'] = book_sum(np.abs(wap1_ret))
    book_n_trade_data['wap2_log_price_ret_abs_vol_buks'] = book_sum(np.abs(wap2_ret))

    # Sum of squared returns per bucket (realized variance) ...
    book_n_trade_data['wap1_log_price_ret_vol_buks'] = book_sum(wap1_ret**2)
    book_n_trade_data['wap2_log_price_ret_vol_buks'] = book_sum(wap2_ret**2)
    # ... and its square root (realized volatility).
    book_n_trade_data['wap1_log_price_ret_sqr_vol_buks'] = book_sum(wap1_ret**2)**0.5
    book_n_trade_data['wap2_log_price_ret_sqr_vol_buks'] = book_sum(wap2_ret**2)**0.5

    book_n_trade_data['wap2_logprice_ret_changes_n_wap1_logprice_ret_constant_sqr_vol_buks'] = book_sum(
        col('wap2_log_price_ret_changes_n_wap1_log_price_ret_constant')**2)**0.5

    book_n_trade_data['wap_eqi_price0_ret_abs_vol_buks'] = book_sum(np.abs(eqi0_ret))
    book_n_trade_data['wap_eqi_price0_ret_sqr_vol_buks'] = book_sum(eqi0_ret**2)**0.5

    # Realized volatility of WAP1 restricted to rows where liquidity rose / fell.
    book_n_trade_data['wap1_log_price_ret_pos_log_liq_ret_sqr_vol_buks'] = book_sum(
        col('wap1_log_price_ret_pos_log_liq_ret')**2)**0.5
    book_n_trade_data['wap1_log_price_ret_neg_log_liq_ret_sqr_vol_buks'] = book_sum(
        col('wap1_log_price_ret_neg_log_liq_ret')**2)**0.5

    book_n_trade_data['wap_eqi_price1_ret_sqr_vol_buks'] = book_sum(eqi1_ret**2)**0.5

    # Squared equilibrium-price returns scaled by exp(log-liquidity change): scale > 1 when
    # liquidity rose on that row, < 1 when it fell.
    liq_scaled_sq_ret = (np.exp(log_liq1_ret)*eqi1_ret)**2
    book_n_trade_data['exp_liq_*_wap_eqi_price1_ret_vol_buks'] = book_sum(liq_scaled_sq_ret)
    # Identical copy of the feature above, kept because the key is part of the returned dict.
    book_n_trade_data['exp_liq_*_wap_eqi_price1_ret_vol_buks_2'] = book_sum(liq_scaled_sq_ret)

    # Realized variance of WAP1 returns measured in units of the spread.
    book_n_trade_data['svol1'] = book_sum((wap1_ret/spread)**2)

    book_n_trade_data['log_liquidity1_ret_sqr_vol_buks'] = book_sum(log_liq1_ret**2)
    book_n_trade_data['log_spread_ret_sqr_vol_buks'] = book_sum(col('log_spread_ret')**2)

    # Number of book snapshots in each bucket.
    book_n_trade_data['book_delta_count'] = book_sum(unit_weights)

    # Time-weighted average prices per bucket.
    book_n_trade_data['wap1_log_price_wavg'] = book_twavg(col('wap1_log_price'))
    book_n_trade_data['wap2_log_price_wavg'] = book_twavg(col('wap2_log_price'))
    book_n_trade_data['wap_eqi_price0_wavg'] = book_twavg(col('wap_eqi_price0'))
    book_n_trade_data['wap_eqi_price1_wavg'] = book_twavg(col('wap_eqi_price1'))

    # Smoothed high/low range of the price inside each bucket.
    book_n_trade_data['wavg_wap1_log_price_amp_diff'] = amp_range(col('wap1_log_price'))
    book_n_trade_data['wavg_wap_eqi_price0_amp_diff'] = amp_range(col('wap_eqi_price0'))

    # Time-weighted averages of liquidity, spread and book size per bucket.
    book_n_trade_data['liquidity1_wavg'] = book_twavg(col('liquidity1'))
    book_n_trade_data['spread_wavg']     = book_twavg(spread)
    book_n_trade_data['inv_spread_wavg'] = book_twavg(spread**-1)
    book_n_trade_data['log_spread_wavg'] = book_twavg(col('log_spread'))
    book_n_trade_data['log_spread2']     = book_twavg(col('log_spread2'))
    book_n_trade_data['book_size1']      = book_twavg(col('book_size1'))
    book_n_trade_data['book_size']       = book_twavg(col('book_size'))

    ############################## TRADE DATA ##########################################

    trade_data = pd.read_parquet(os.path.join(data_dir, 'trade_{}.parquet/stock_id={}'.format(dset, st_id)))
    # Keep only time ids that also exist in the book, grouped and ordered like the book rows.
    trade_data = trade_data[trade_data['time_id'].isin(all_time_ids)]
    trade_data = trade_data.sort_values(['time_id', 'seconds_in_bucket'])

    trade_seconds  = np.array(trade_data['seconds_in_bucket'])
    trade_time_ids = np.array(trade_data['time_id'])
    trade_volume   = np.array(trade_data['size']*trade_data['price'])

    # A time id can have book updates but no trades.  bucketized_summed_data numbers its
    # output rows by counting time_id changes, so its rows only cover the traded time ids;
    # they are scattered back to the matching book rows (other rows stay 0) so that trade
    # and book features line up on the same time id.
    traded_time_ids = np.unique(trade_time_ids)
    traded_rows = np.searchsorted(all_time_ids, traded_time_ids)

    def trade_sum(values):
        compact = bucketized_summed_data(trade_seconds, trade_time_ids, values,
                                         buk_width, n_buks, traded_time_ids.shape[0])
        aligned = np.zeros( (n_time_ids, n_buks) )
        aligned[traded_rows] = compact
        return aligned

    # Per-bucket sums of traded notional raised to several powers (damping large trades).
    book_n_trade_data['trade_volume']      = trade_sum(trade_volume)
    book_n_trade_data['root_volume']       = trade_sum(trade_volume**.5)
    book_n_trade_data['cube_root_volume']  = trade_sum(trade_volume**(1/3))
    book_n_trade_data['volume_p2/3']       = trade_sum(trade_volume**(2/3))
    book_n_trade_data['quart_root_volume'] = trade_sum(trade_volume**.25)
    # Power 0 turns every trade into 1, so this is the number of trade rows per bucket.
    book_n_trade_data['trade_count']       = trade_sum(trade_volume**0)

    # Traded notional per unit of average liquidity.
    book_n_trade_data['tvpl1'] = book_n_trade_data['trade_volume']/book_n_trade_data['liquidity1_wavg']

    # Time-weighted average of the level-1 minus level-2 liquidity gap, per side.
    book_n_trade_data['ask_liq1_diff_wavg'] = book_twavg(col('ask_liq1_diff'))
    book_n_trade_data['bid_liq1_diff_wavg'] = book_twavg(col('bid_liq1_diff'))

    return book_n_trade_data


In [10]:
@jit()
def identify_missing_time_ids(all_time_ids, st_time_ids):
    """Flag which of the global time ids are present for a single stock.

    Both arrays are walked once, front to back, with a single forward-moving
    cursor into ``all_time_ids``; the match therefore relies on the two arrays
    sharing the same ordering.
    NOTE(review): the caller builds ``all_time_ids`` with ``np.unique`` (sorted
    ascending); confirm that each stock's ``time_id`` array is sorted too.

    Parameters
    ----------
    all_time_ids : 1-D array
        Unique time ids collected from all stocks.
    st_time_ids : 1-D array
        Time ids available for one particular stock.

    Returns
    -------
    1-D boolean array with the same length as ``all_time_ids``; ``True`` where
    the time id was found in ``st_time_ids``, ``False`` where it is missing.
    """
    n_all = all_time_ids.shape[0]
    present = np.zeros(n_all, dtype=np.bool_)  # every time id starts out as "missing"
    j = 0
    for i in range(st_time_ids.shape[0]):
        # Skip over global time ids that this stock does not have. The bound is
        # checked before indexing so the scan can never read past the end of
        # all_time_ids (compiled code would not raise on such a read).
        while j < n_all and all_time_ids[j] != st_time_ids[i]:
            j = j + 1
        if j >= n_all:
            # Global ids exhausted: the remaining stock ids cannot be matched.
            break
        present[j] = True
        j = j + 1
    return present

  @jit()


## BUCKETIZE RAW DATA 
##### BUCKETIZING IRREGULARLY SPACED DATA WITHIN THE FIRST 10 MINS FOR EACH STOCK TO MAKE IT COMPARABLE ACROSS STOCKS. CREATE GRANULARITY WITHIN THE FIRST 10 MINS.
##### This is a way to standardize data arriving at irregular seconds

In [11]:
def create_dataSet(st_ids,dset):
    """Assemble the bucketized features of all stocks into dense arrays.

    ``create_stock_data`` is run for every stock in parallel; each feature it
    returns is then stacked into one array indexed by (time id, stock, bucket).
    Time ids that a stock does not have are filled with that stock's mean of
    the feature, and any remaining NaNs are filled with the mean of that
    stock's slice.

    Parameters
    ----------
    st_ids : iterable
        Stock ids to process; they are sorted before use.
    dset : str
        Forwarded unchanged to ``create_stock_data`` (e.g. ``'train'``).

    Returns
    -------
    dict
        ``'time_ids'``  : sorted unique time ids across all stocks,
        ``'stock_ids'`` : the sorted stock ids,
        plus one array of shape ``(len(time_ids), len(stock_ids), 30)`` for
        every feature key returned by ``create_stock_data`` (except
        ``'time_id'``).

    Raises
    ------
    ValueError
        If ``st_ids`` is empty.
    """
    st_ids = sorted(st_ids)
    if not st_ids:
        raise ValueError('st_ids must contain at least one stock id')

    print('st_ids',st_ids)

    # Leave about five cores free, but never ask for fewer than one worker:
    # os.cpu_count() may return None, and joblib rejects n_jobs == 0.
    n_jobs = max(1, (os.cpu_count() or 1) - 5)

    # One dictionary of features per stock, in the same order as st_ids.
    all_stock_data = Parallel(n_jobs = n_jobs)( delayed(create_stock_data)(st_id, dset) for st_id in st_ids)

    # Sorted union of the time ids of all stocks; used to locate the time ids
    # that are missing for an individual stock.
    t_ids = np.unique(np.concatenate([np.asarray(ss['time_id']) for ss in all_stock_data]))

    num_buks = 30
    t_ids_size = t_ids.shape[0]
    st_ids_size = len(st_ids)

    final_data = {}
    final_data['time_ids' ] = t_ids
    final_data['stock_ids'] = np.array(st_ids)

    # The presence mask of a stock does not depend on the feature, so it is
    # computed once per stock instead of once per (feature, stock) pair.
    present_masks = [identify_missing_time_ids(t_ids, ss['time_id']) for ss in all_stock_data]

    for key in all_stock_data[0].keys(): # features common to all stocks
        if key == 'time_id':
            continue

        Z = np.zeros(( t_ids_size, st_ids_size, num_buks))

        for st in range(st_ids_size):
            ss = all_stock_data[st]
            b = present_masks[st]

            Z[ b, st, :] = ss[key] # features for the time ids the stock has

            Z[~b, st, :] = np.nanmean(ss[key]) # stock-level mean for missing time ids

            # Z[:, st, :] is a view, so this also fills NaN buckets in place.
            Z[:,st,:][np.isnan(Z[:,st,:])] = np.nanmean(Z[:,st,:])

        final_data[key] = Z

        gc.collect()

    del all_stock_data
    gc.collect()

    # Blend the wap1 and wap2 buckets as sqrt(wap1**2 + 0.25 * wap2**2); the
    # 0.25 weight is arbitrary according to the original author's note.
    # NOTE(review): the original comment named the '*_abs_vol_buks' keys while
    # the code reads the '*_vol_buks' keys; confirm which pair is intended.
    final_data['wap1_log_price_ret_vol_buks'] = ( final_data['wap1_log_price_ret_vol_buks']**2 + .25*final_data['wap2_log_price_ret_vol_buks']**2)**0.5

    return final_data

In [12]:
# Work from the folder that holds the cached liquidity features. The path is
# derived from data_dir so the location is configured in one place only.
# NOTE(review): the working directory is changed for the whole kernel; later
# cells may rely on it for relative paths.
os.chdir(os.path.join(data_dir, 'liquidity_features'))

# Build the bucketized training set only when the cache file is missing, so a
# fresh "Restart & Run All" works without un-commenting any code.
if not os.path.exists('train_buckets.pkl'):
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    train_buckets = create_dataSet(st_ids = list(np.unique(train['stock_id'])), dset = 'train')
    with open('train_buckets.pkl', 'wb') as fp:
        pickle.dump(train_buckets, fp)
        print('saved train_buckets.pkl successfully')

# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('train_buckets.pkl', 'rb') as fp:
    train_buckets = pickle.load(fp)
    print('train_buckets.pkl file')
train_buckets # per the original note: shape of (time_id=3830,stock_id= 112, bins=30)

st_ids [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 50, 51, 52, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 118, 119, 120, 122, 123, 124, 125, 126]




saved train_buckets.pkl successfully
train_buckets.pkl file


{'time_ids': array([    5,    11,    16, ..., 32758, 32763, 32767]),
 'stock_ids': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  26,  27,  28,
         29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  46,  47,  48,  50,  51,  52,  53,  55,  56,  58,
         59,  60,  61,  62,  63,  64,  66,  67,  68,  69,  70,  72,  73,
         74,  75,  76,  77,  78,  80,  81,  82,  83,  84,  85,  86,  87,
         88,  89,  90,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        118, 119, 120, 122, 123, 124, 125, 126]),
 'wap1_log_price_ret_abs_vol_buks': array([[[3.98205793e-05, 1.22545953e-03, 2.66341796e-03, ...,
          2.15556395e-03, 1.35412600e-03, 5.92733936e-04],
         [4.22617956e-03, 2.29907571e-03, 2.61343014e-03, ...,
          2.68443045e-03, 3.03103658e-03, 2.65585445e-03],
     

# CLUSTERING ANALYSIS