## The Hit Rate Curve Model

The hit rate curve model is essential in market making. It quantifies the probability of getting traded given the assets, quoted price and etc. In this notebook, we'll calibrate a simple hit-rate curve model by using logistic regression.

In [2]:
import pandas as pd 

orderBook = pd.read_csv("../data/lakeAPIData/orderbook/BTC-USDT_Jan2024.csv.gzip", compression="gzip")
tradeRecords = pd.read_csv("../data/lakeAPIData/trades/BTC-USDT_Jan2024.csv.gzip", compression="gzip")

In [3]:


def orderBookFeatures(orderBook:pd.DataFrame)->pd.DataFrame:
    
    orderBook['bidOffer'] = orderBook['ask_0_price'] - orderBook['bid_0_price']
    orderBook['midPrice'] = (orderBook['ask_0_price'] + orderBook['bid_0_price'])/2.0
    for i in range(20):
        orderBook['ask_' + str(i) + '_Bo'] = (orderBook['ask_' + str(i) + '_price'] - orderBook['midPrice'])/orderBook['bidOffer']
        orderBook['bid_' + str(i) + '_Bo'] = (-orderBook['bid_' + str(i) + '_price'] + orderBook['midPrice'])/orderBook['bidOffer']
        orderBook = orderBook.drop(columns=['bid_' + str(i) + '_price','ask_' + str(i) + '_price'])
    return orderBook

def tradeFeatures(tradeRecords:pd.DataFrame)->pd.DataFrame:
    tradeRecords = tradeRecords[['side','quantity','price','received_time','symbol']]
    tradeRecords.rename(columns = {'quantity':'tradeSize','price':'tradePrice'}, inplace = True)
    return tradeRecords


ordb = orderBookFeatures(orderBook).convert_dtypes().sort_values(by='received_time')
trd = tradeFeatures(tradeRecords).convert_dtypes().sort_values(by='received_time')
ordb['received_time'] = pd.to_datetime(ord['received_time'] )
trd['received_time'] = pd.to_datetime(trd['received_time'] )
completeBook = pd.merge_asof(ordb, trd, on="received_time")
completeBook['tradedBo'] = completeBook.apply(lambda x : (x['tradePrice'] - x['midPrice'])/x['bidOffer'] if x['side'] == 'buy' else (x['midPrice'] - x['tradePrice'])/x['bidOffer'],axis = 1 )
completeBook['tradedBo'] = completeBook.apply(lambda x : (x['tradePrice'] - x['midPrice'])/x['bidOffer'] if x['side'] == 'buy' else (x['midPrice'] - x['tradePrice'])/x['bidOffer'],axis = 1 )
completeBook = completeBook[completeBook['tradedBo'] > 0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tradeRecords.rename(columns = {'quantity':'tradeSize','price':'tradePrice'}, inplace = True)


In [None]:
def generateDataSet(orderBook:pd.DataFrame)->pd.DataFrame:
    data = {'side':[], 'distance':[], 'traded':[], 'symbol':[], 'size':[]}
    symb = orderBook['symbol'].values[0]
    side = orderBook['side'].values[0]
    size = orderBook['tradeSize'].values[0]
    tradedBo = orderBook['tradedBo'].values[0]
    data['side'] += ['B' if side == 'sell' else 'S']
    data['symbol'] += [symb]
    data['distance'] += [tradedBo]
    data['traded'] += [1]
    data['size'] += [size]
    
    for i in range(3):
        distance = orderBook['bid_{}_Bo'.format(i)].values[0]
        size = orderBook['bid_{}_size'.format(i)].values[0]
        data['side'] += ['B']
        data['symbol'] += [symb]
        data['distance'] += [distance]
        data['traded'] += [0]
        data['size'] += [size]
    
    data = pd.DataFrame(data)
    return data 

dataSet = completeBook.apply(lambda x : generateDataSet(x), axis = 1)


In [33]:
data = completeBook[['tradedBo','tradeSize','side']].rename(columns = {'tradedBo':'dt', 'tradeSize':'size'})
data['side'] = data['side'].apply(lambda x: 'B' if x == 'sell' else 'S')
data['traded'] = True
for i in range(1):
    sampleB = completeBook[['bid_{}_size'.format(i),'bid_{}_Bo'.format(i)]].rename(columns = {'bid_{}_size'.format(i):'size', 'bid_{}_Bo'.format(i):'dt'})
    sampleB['traded'] = False
    sampleB['side'] = 'B'
    
    sampleS = completeBook[['ask_{}_size'.format(i),'ask_{}_Bo'.format(i)]].rename(columns = {'ask_{}_size'.format(i):'size', 'ask_{}_Bo'.format(i):'dt'})
    sampleS['traded'] = False
    sampleS['side'] = 'S'
    
    data = pd.concat([sampleB,sampleS,data])

data['mlt'] = 1
data.loc[data['side'] == 'B', 'mlt'] = -1
data['dt'] *= data['mlt']


In [41]:
from sklearn.linear_model import LogisticRegression

X = data[['size','dt']].values
y = data[['traded']].values

In [42]:
clf = LogisticRegression(random_state=0).fit(X, y)

  y = column_or_1d(y, warn=True)


In [48]:
r = clf.predict_proba(X)

In [60]:
import numpy as np

np.sort(r[:,1])

array([0.        , 0.        , 0.        , ..., 0.94021493, 0.9404629 ,
       0.94473573])