# Работа с данными из книги ордеров: NASDAQ ITCH

Основным источником рыночных данных является книга заказов, которая постоянно обновляется в режиме реального времени в течение дня, чтобы отражать всю торговую активность. Биржи обычно предлагают эти данные в режиме реального времени и могут предоставлять некоторые исторические данные бесплатно.

Торговая активность отражается в многочисленных сообщениях о торговых приказах, отправленных участниками рынка. Эти сообщения обычно соответствуют протоколу обмена электронной финансовой информацией (FIX) для обмена транзакциями с ценными бумагами и рыночными данными в реальном времени или собственному протоколу обмена.

In [None]:
from pathlib import Path
from collections import Counter
from datetime import timedelta
from datetime import datetime
from time import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

In [2]:
sns.set_style('whitegrid')

In [3]:
def format_time(t):
    m, s = divmod(t, 60)
    h, m = divmod(m, 60)
    return f'{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}'

In [4]:
data_path = Path('data')
itch_store = str(data_path / 'itch.h5')
order_book_store = data_path / 'order_book.h5'
date = '10302019'

In [5]:
stock = 'AAPL'
order_dict = {-1: 'sell', 1: 'buy'}

In [6]:
def get_messages(date, stock=stock):
    with pd.HDFStore(itch_store) as store:
        stock_locate = store.select('R', where='stock = stock').stock_locate.iloc[0]
        target = 'stock_locate = stock_locate'
        
        data = {}
        messages = ['A', 'F', 'E', 'C', 'X', 'D', 'U', 'P', 'Q']
        for m in messages:
            data[m] = store.select(m, where=target).drop('stock_locate', axis=1).assign(type=m)
    order_cols = ['order_reference_number', 'buy_sell_indicator', 'shares', 'price']
    orders = pd.concat([data['A'], data['F']], sort=False, ignore_index=True).loc[:, order_cols]
    for m in messages[2: -3]:
        data[m] = data[m].merge(orders, how='left')
    data['U'] = data['U'].merge(orders, how='left',
                               right_on='order_reference_number', 
                               left_on='original_order_reference_number')
    data['Q'].rename(columns={'cross_price': 'price'}, inplace=True)
    data['X']['shares'] = data['X']['cancelled_shares']
    data['X'] = data['X'].dropna(subset=['price'])

    data = pd.concat([data[m] for m in messages], ignore_index=True, sort=False)
    data['date'] = pd.to_datetime(date, format='%m%d%Y')
    data.timestamp = data['date'].add(data.timestamp)
    data = data[data.printable != 0]

    drop_cols = ['tracking_number', 'order_reference_number', 'original_order_reference_number',
                 'cross_type', 'new_order_reference_number', 'attribution', 'match_number',
                 'printable', 'date', 'cancelled_shares']
    return data.drop(drop_cols, axis=1).sort_values('timestamp').reset_index(drop=True)

In [11]:
messages = get_messages(date)
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061513 entries, 0 to 1061512
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   timestamp           1061513 non-null  datetime64[ns]
 1   buy_sell_indicator  993354 non-null   float64       
 2   shares              984974 non-null   float64       
 3   price               984974 non-null   float64       
 4   type                1061513 non-null  object        
 5   executed_shares     81642 non-null    float64       
 6   execution_price     714 non-null      float64       
 7   shares_x            70141 non-null    float64       
 8   price_x             70141 non-null    float64       
 9   shares_y            8383 non-null     float64       
 10  price_y             8383 non-null     float64       
dtypes: datetime64[ns](1), float64(9), object(1)
memory usage: 89.1+ MB


In [None]:
with pd.HDFStore(order_book_store) as store:
    key = f'{stock}/messages'
    store.put(key, messages)
    print(store.info())

In [7]:
def get_trades(m):
    trade_dict = {'executed_shares': 'shares', 'execution_price': 'price'}
    cols = ['timestamp', 'executed_shares']
    trades = pd.concat([m.loc[m.type == 'E', cols + ['price']].rename(columns=trade_dict),
                        m.loc[m.type == 'C', cols + ['execution_price']].rename(columns=trade_dict), 
                        m.loc[m.type == 'P', ['timestamp', 'price', 'shares']],
                        m.loc[m.type == 'Q', ['timestamp', 'price', 'shares']].assign(cross=1), 
                        ], sort=False).dropna(sunset=['price']).fillna(0)
    return trades.set_index('timestamp').sort_index().astype(int)

In [None]:
trades = get_trades(messages)
trades.info()

In [None]:
with pd.HDFStore(order_book_store) as store:
    store.put(f'{stock}/trades', trades)

In [13]:
def add_orders(orders, buysell, nlevels):
    new_order = []
    items = sorted(orders.copy().items())
    if buysell == 1:
        items = reversed(items)
    for i, (p, s) in enumerate(items, 1):
        new_order.append((p, s))
        if i == nlevels:
            break

In [None]:
def save_orders(orders, append=False):
    cols = ['price', 'shapes']
    for buysell, book in orders.items():
        df = (pd.concat([pd.DataFrame(data=data, 
                                      columns=cols)
                         .assing(timestamp=t) 
                         for t, data in book.items()]))
        key = f'{stock}/{order_dict[buysell]}'
        df.loc[:, ['price', 'shapes']] = df.loc[:, ['price', 'shapes']].astype(int)
        with pd.HDFStore(order_book_store) as store:
            if append:
                store.append(key, df.set_index('timestamp'), format='t')
            else:
                store.put(key, df.set_index('timestamp'))

In [None]:
order_book = {-1: {}, 1: {}}
current_orders = {-1: Counter(), 1: Counter()}
message_counter = Counter()
nlevels = 100

start = time()
for message in messages.itertuples():
    i = message[0]
    if i % 1e5 == 0 & i > 0:
        print(f'{i:,.0f}\t\t{format_time(time() - start)}')
        save_orders(order_book, append=True)
        order_book = {-1: {}, 1:{}}
        start = time()
        if np.isna(message.buy_sell_indicator):
            continue
        message_counter.update(message.type)
        
        buysell = message.buy_sell_indicator
        price, shares = None, None
        if message.type in ['A', 'F', 'U']:
            price = int(message.price)
            shares = int(message.shares)
            
            current_orders[buysell].update({price: shares})
            current_orders[buysell], new_order = add_orders(current_orders[buysell], buysell, nlevels)
            order_book[buysell][message.timestamp] = new_order
        
        if message.type in ['E', 'C', 'X', 'D', 'U']:
            if message.type == 'U':
                if not np.isnan(message.shares_replaced):
                    price = int(message.price_replaced)
                    shares = -int(message.shares_replaced)
            else:
                if not np.isnan(message.price):
                    price = int(message.price)
                    shares = -int(message.shares)

            if price is not None:
                current_orders[buysell].update({price: shares})
                if current_orders[buysell][price] <= 0:
                    current_orders[buysell].pop(price)
                current_orders[buysell], new_order = add_orders(current_orders[buysell], buysell, nlevels)
                order_book[buysell][message.timestamp] = new_order

In [None]:
message_counter = pd.Series(message_counter)
print(message_counter)

In [None]:
with pd.HDFStore(order_book_store) as store:
    print(store.info())
    buy = store[f'{stock}/buy'].reset_index().drop_duplicates()
    sell = store[f'{stock}/sell'].reset_index().drop_duplicates()

In [None]:
buy.price = buy.price.mul(1e-4)
sell.price = sell.price.mul(1e-4)

In [None]:
percentiles = [.01, .02, .1, .25, .75, .9, .98, .99]
pd.concat([buy.price.describe(percentiles=percentiles).to_frame('buy'), 
           sell.price.describe(percentiles=percentiles).to_frame('sell')], axis=1)

In [None]:
buy = buy[buy.price > buy.price.quantile(.01)]
sell = sell[sell.price > sell.price.quantile(.01)]

In [8]:
market_open = '0930'
market_close = '1600'

In [None]:
fig, ax = plt.subplot(figsize=(7, 5))
hist_kws = {'linewidth': 1, 'alpha': .5}
sns.distplot(buy[buy.price.between(240, 250)].set_index('timestamp').between_time(market_open, market_close).price,
            ax=ax, label='Buy', kde=False, hist_kws=hist_kws)
sns.distplot(sell[sell.price.between(240, 250)].set_index('timestamp').between_time(market_open, market_close).price,
            ax=ax, label='Sell', kde=False, hist_kws=hist_kws)

ax.legend(fontsize=10)
ax.set_title('Распределение цен лимитных ордеров')
ax.set_yticklabels([f'{int(y/1000):,}' for y in ax.get_yticks().tolist()])
ax.set_xticklabels([f'${int(x):,}' for x in ax.get_xticks().tolist()])
ax.set_xlabel('Цена')
ax.set_ylabel('Количество акций (\'000)')
sns.despine()
fig.tight_layout();

### Глубина книги ордеров

In [10]:
utc_offset = timedelta(hours=4)
depth = 100