# Features from the paper

`Easley, D., López de Prado, M., O’Hara, M., Zhang, Z., 2020. Microstructure in the Machine Age. Rev. Financ. Stud. 34, 3316–3363`

In [76]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [77]:
# import some packages
import numpy as np
import pandas as pd
import os
from pathlib import Path

home = str(Path.home())

In [78]:
# some old functions:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price sequence.

    Parameters
    ----------
    list_stock_prices : pandas.Series, pandas.DataFrame, or list-like
        Prices in order. pandas objects are used as-is (their index is
        preserved); any other list-like (list, tuple, ndarray) is wrapped
        in a ``pandas.Series`` first.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        ``log(p_t) - log(p_{t-1})``; the first element is NaN because it
        has no predecessor.
    """
    prices = list_stock_prices
    if not isinstance(prices, (pd.Series, pd.DataFrame)):
        # np.log on a list/ndarray returns an ndarray, which has no .diff()
        prices = pd.Series(prices)
    return np.log(prices).diff()

In [79]:
# Locate the competition data under the user's home directory
data_folder = os.path.join(home, 'data', 'optiver-realized-volatility-prediction')

# For the prototype read a single stock only: the 'stock_id=0' sub-folder of
# the book dataset and of the trade dataset, in that order.
df_book, df_trades = (
    pd.read_parquet(os.path.join(data_folder, dataset, 'stock_id=0'))
    for dataset in ('book_train.parquet', 'trade_train.parquet')
)
# df_trades

In [111]:
# Inspect the trades frame loaded for the prototype stock
df_trades

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,21,1.002301,326,12
1,5,46,1.002778,128,4
2,5,50,1.002818,55,1
3,5,57,1.003155,121,5
4,5,68,1.003646,4,1
...,...,...,...,...,...
123438,32767,471,0.998659,200,3
123439,32767,517,0.998515,90,1
123440,32767,523,0.998563,1,1
123441,32767,542,0.998803,90,4


## Features

### 0. Some simple stats

In [53]:
# Quoted spread: distance between the best ask and the best bid
df_book['spread'] = df_book['ask_price1'].sub(df_book['bid_price1'])

# Mid price of the best quotes; the depth imbalance below is measured around it
df_book['mid_price'] = df_book['ask_price1'].add(df_book['bid_price1']) / 2

# Depth imbalance: size at the two bid levels, each divided by its distance
# from the mid price, over the same quantity for the two ask levels
df_book['depth_imbalance'] = (
    df_book['bid_size1'] / (df_book['mid_price'] - df_book['bid_price1'])
    + df_book['bid_size2'] / (df_book['mid_price'] - df_book['bid_price2'])
) / (
    df_book['ask_size1'] / (df_book['ask_price1'] - df_book['mid_price'])
    + df_book['ask_size2'] / (df_book['ask_price2'] - df_book['mid_price'])
)


In [62]:
# Check that the three derived columns were appended to the book frame
df_book.columns

Index(['time_id', 'seconds_in_bucket', 'bid_price1', 'ask_price1',
       'bid_price2', 'ask_price2', 'bid_size1', 'ask_size1', 'bid_size2',
       'ask_size2', 'spread', 'mid_price', 'depth_imbalance'],
      dtype='object')

### 1. Roll measure

$R_\tau = 2\sqrt{\left| \text{Cov} (\Delta P_\tau, \Delta P_{\tau-1}) \right|}$

### 2. Roll Impact
$\tilde{R}_\tau = \frac{R_\tau}{p_\tau V_\tau}$

### 3. Kyle's lambda (Market Impact)

$\lambda = \frac{| \Delta \text{Price}_t |}{\text{Volume}_t}$

### 4. Amihud ratio

$I_t = \frac{| r_t |}{\text{Volume}_t}$

In [24]:
# Per-trade inputs, each computed within its own time_id so that no return or
# price change spans two buckets.
df_trade_changes = df_trades.copy()
# transform (not apply) guarantees the result keeps the original row index;
# on recent pandas versions apply prepends the group key to the index, which
# breaks the column assignment.
df_trade_changes['log_return'] = df_trade_changes.groupby('time_id')['price'].transform(log_return)
df_trade_changes['d_price'] = df_trade_changes.groupby('time_id')['price'].diff()
df_trade_changes['d_price_l1'] = df_trade_changes.groupby('time_id')['d_price'].shift(1)

grp = df_trade_changes.groupby('time_id')

# Roll measure: 2 * sqrt(|Cov(dP_t, dP_{t-1})|), one value per time_id.
# The absolute value keeps the square root real when the covariance is positive.
# NOTE(review): buckets with too few trades presumably give NaN here and are
# the source of the np.cov RuntimeWarnings in the cell output - confirm.
covariance = grp.apply(lambda x: x['d_price'].cov(x['d_price_l1']))
df_measures = pd.DataFrame({'roll_measure': 2 * np.sqrt(np.abs(covariance))})

# Roll impact: Roll measure scaled by the traded value (sum of price * size)
df_measures['roll_impact'] = df_measures['roll_measure'] / grp.apply(lambda x: np.sum(x['price'] * x['size']))

# Kyle's lambda: total absolute price change per unit of traded volume
df_measures['mkt_impact'] = grp.apply(lambda x: np.sum(np.abs(x['d_price'])) / np.sum(x['size']))

# Amihud ratio: absolute cumulative log return per unit of traded volume
df_measures['amihud'] = grp.apply(lambda x: np.abs(np.sum(x['log_return'])) / np.sum(x['size']))

# df_measures.roll_measure.describe()

df_measures

  return np.cov(a, b, ddof=ddof)[0, 1]
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,roll_measure,roll_impact,mkt_impact,sum_return
time_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,0.000204,6.382619e-08,0.000003,4.216972e-07
11,0.000206,1.597455e-07,0.000003,6.228642e-07
16,0.000457,2.118416e-07,0.000004,1.175238e-06
31,0.000516,2.630821e-07,0.000002,1.152450e-06
62,0.000065,3.642724e-08,0.000002,1.531390e-07
...,...,...,...,...
32751,0.000207,6.012802e-08,0.000002,4.437514e-07
32753,0.000239,5.259393e-08,0.000001,1.871173e-07
32758,0.000248,5.843691e-08,0.000002,3.439502e-07
32763,0.000186,5.758722e-08,0.000003,1.342682e-07


# Functions

In [103]:
import sys

def fin_metrics_book_data(df, bucket_length=600):
    """Time-weighted average quoted spread and depth imbalance of a book slice.

    Every snapshot is weighted by the number of seconds until the next
    snapshot; the last snapshot is weighted by the time remaining until
    ``bucket_length``. Both weighted sums are divided by ``bucket_length``.

    Parameters
    ----------
    df : pandas.DataFrame
        Book snapshots with the columns ``seconds_in_bucket``,
        ``bid_price1/2``, ``ask_price1/2``, ``bid_size1/2`` and
        ``ask_size1/2``. Any index is accepted; rows are processed by
        position. Assumes the rows belong to one bucket and are sorted by
        ``seconds_in_bucket`` - TODO confirm against callers.
    bucket_length : int or float, default 600
        Length of the bucket in seconds.

    Returns
    -------
    list
        ``[spread, depth_imb]``; ``[nan, nan]`` when ``df`` has no rows.

    Raises
    ------
    SystemExit
        If ``df`` has no ``bid_price1`` column.
    """
    if 'bid_price1' not in df.columns:
        sys.exit("Book data format requred")

    if len(df) == 0:
        # nothing to average over
        return [np.nan, np.nan]

    # Seconds each snapshot stays in force. Computed positionally so the result
    # does not depend on the index labels of the slice that was passed in
    # (a label-based write to row ``len(df) - 1`` hits the wrong row, or adds
    # a new one, whenever the labels do not run 0..n-1).
    seconds = df['seconds_in_bucket'].to_numpy(dtype=float)
    time_length = np.diff(seconds, append=bucket_length)

    # quoted spread
    spread = df['ask_price1'] - df['bid_price1']

    # depth imbalance: size over distance-from-mid on the bid side, relative to
    # the same quantity on the ask side (two book levels each)
    mid_price = (df['ask_price1'] + df['bid_price1']) / 2
    bid_depth = (df['bid_size1'] / (mid_price - df['bid_price1'])
                 + df['bid_size2'] / (mid_price - df['bid_price2']))
    ask_depth = (df['ask_size1'] / (df['ask_price1'] - mid_price)
                 + df['ask_size2'] / (df['ask_price2'] - mid_price))
    depth_imbalance = bid_depth / ask_depth

    # time-weighted averages over the bucket
    avg_spread = (spread * time_length).sum() / bucket_length
    avg_depth_imb = (depth_imbalance * time_length).sum() / bucket_length

    return [avg_spread, avg_depth_imb]

In [108]:
# Sanity check on a single bucket: [time-weighted spread, time-weighted depth imbalance]
fin_metrics_book_data(df_book[df_book.time_id == 5])

[0.0008681410551071167, 6.917597782193109]

In [125]:
def fin_metrics_trades_data(df):
    """Compute four trade-based liquidity measures for a slice of trades.

    Parameters
    ----------
    df : pandas.DataFrame
        Trades with at least the columns ``price``, ``size`` and
        ``order_count`` (the latter is only used to recognise the format).

    Returns
    -------
    list
        ``[roll_measure, roll_impact, mkt_impact, amihud]`` where

        * roll_measure = 2 * sqrt(|Cov(dP_t, dP_{t-1})|)
        * roll_impact  = roll_measure / sum(price * size)
        * mkt_impact   = sum(|dP_t|) / sum(size)
        * amihud       = |sum(log returns)| / sum(size)

    Raises
    ------
    SystemExit
        If ``df`` has no ``order_count`` column.
    """
    if 'order_count' not in df.columns:
        sys.exit("Trades data format requred")

    prices = df['price']
    sizes = df['size']

    # trade-to-trade changes and their one-step lag
    price_change = prices.diff()
    lagged_change = price_change.shift(1)
    log_ret = log_return(prices)

    total_volume = np.sum(sizes)

    roll_measure = 2 * np.sqrt(np.abs(price_change.cov(lagged_change)))
    roll_impact = roll_measure / np.sum(prices * sizes)
    mkt_impact = np.sum(np.abs(price_change)) / total_volume
    amihud = np.abs(np.sum(log_ret)) / total_volume

    return [roll_measure, roll_impact, mkt_impact, amihud]

In [126]:
# Sanity check on a single bucket: [roll_measure, roll_impact, mkt_impact, amihud]
fin_metrics_trades_data(df_trades[df_trades.time_id==5])

[0.00020361441923319408,
 6.382618677401788e-08,
 3.3940710935925642e-06,
 4.2169716369715756e-07]