In [2]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import vaex
import dask
#from utils import compute_trade_sign

import glob
from dask.distributed import Client
import time


def compute_trade_sign(events: pd.DataFrame) -> pd.DataFrame:
    """Classify every trade of an intraday trade frame as buy- or sell-initiated.

    The sign is assigned in two stages:
        1. Quote rule: trade price above the mid-price => buy (+1.0),
           trade price below the mid-price => sell (-1.0).
        2. Tick test for trades exactly at the mid-price (Lee & Ready,
           https://onlinelibrary.wiley.com/doi/full/10.1111/j.1540-6261.1991.tb02683.x):
           the trade inherits the direction of the most recent non-zero trade-price
           change (uptick or zero-uptick => buy, downtick or zero-downtick => sell).

    Args:
        events (pd.DataFrame): Intraday trade data, assumed to be time-ordered. Must
            contain the columns "trade_price", "bid-price" and "ask-price". Missing
            values are forward-filled; rows that still hold a missing value afterwards
            are dropped.

    Returns:
        pd.DataFrame: A new frame (the input is left untouched) holding the retained
        rows plus two columns: "mid", the mid-price, and "s", the trade sign as a
        float. "s" is 0.0 only for a trade at the mid-price that has no earlier price
        change to inherit a direction from.
    """
    # `assign` builds a new frame, so the caller's DataFrame is never modified.
    events = events.assign(mid=(events["bid-price"] + events["ask-price"]) * 0.5)
    # `DataFrame.ffill()` is the supported spelling of the deprecated fillna(method="ffill").
    events = events.ffill().dropna()

    # Stage 1: quote rule.
    quote_sign = np.sign(events["trade_price"] - events["mid"])
    at_mid = quote_sign == 0.0

    n_trades = len(events)
    # Guard the ratio so an empty frame reports 0.00% instead of dividing by zero.
    unclassified_pct = (at_mid.sum() / n_trades) * 100 if n_trades else 0.0
    print(
        "Percentage of unclassifiable trades",
        f"{unclassified_pct:.2f}%",
    )

    # Stage 2: tick test. The sign of the price change against the previous trade is
    # +1 (uptick), -1 (downtick) or 0 (zero tick); zero ticks are blanked out and then
    # inherit the last non-zero direction through the forward fill.
    tick = np.sign(events["trade_price"].diff())
    last_move = tick.where(tick != 0.0).ffill()

    # Trades at the mid-price take the tick-test direction. When no price change has
    # been seen yet there is nothing to inherit, so the sign stays 0.0.
    return events.assign(s=quote_sign.mask(at_mid, last_move.fillna(0.0)))

@dask.delayed
def load_and_compute_trade_sign(path, save = False):
    """Load one events file, compute its trade signs and optionally save the result.

    Decorated with `dask.delayed`, so calling it only builds a task; the work runs
    when the task is computed.

    Args:
        path: Path of the input file, opened with `vaex.open`.
        save: When True, the signed frame is exported as an Arrow file next to the
            input, with "events" in the file name replaced by "events_w_s".

    Returns:
        None. The result is only available through the saved file.
    """
    import os  # local import: only needed to rewrite the output file name

    print("Processing",path)
    df = vaex.open(path).to_pandas_df()
    df = compute_trade_sign(df)
    if save:
        # Rewrite the file name only. A plain str.replace on the whole path would
        # also rewrite any directory component that happens to contain "events".
        directory, filename = os.path.split(path)
        path = os.path.join(directory, filename.replace("events", "events_w_s"))
        print("Saving to ", path)
        vaex.from_pandas(df).export_arrow(path)

if __name__ == "__main__":

    # Running the client as a context manager closes it when the block exits, even
    # if a task raises. Without this every re-run of the cell leaves another client
    # (and the local cluster it started) behind.
    with Client(n_workers = 1, threads_per_worker=4) as client:

        # Start the Active Memory Manager on the scheduler.
        client.amm.start()

        datasets = glob.glob("../data/clean/DOW/*events.arrow")

        print(len(datasets))
        t1 = time.time()
        print("Computing trade sign of", len(datasets), "datasets")
        # One delayed task per file; nothing runs until dask.compute is called.
        all_promises = [
            load_and_compute_trade_sign(dataset, True) for dataset in datasets
        ]
        dask.compute(all_promises, optimize_graph=False)
        t2 = time.time()
        print("Computation took", (t2-t1), "seconds")

0
Computing trade sign of 0 datasets
Computation took 4.696846008300781e-05 seconds


In [8]:
# glob returns matches in arbitrary order; sorting makes `datasets[0]` (used in the
# next cell) point at the same file on every run.
datasets = sorted(glob.glob("../data/clean/DOW/*events.arrow"))

In [12]:
# Take the first matched dataset as the sample to inspect.
# Raises IndexError when the glob matched nothing; the run recorded under the first
# cell found 0 datasets, so check the working directory if this fails.
path = datasets[0]

In [13]:
# Open the sample file with vaex and materialise it as a pandas DataFrame, the same
# way load_and_compute_trade_sign loads its input.
df = vaex.open(path).to_pandas_df()

In [17]:
# Display the raw frame. In the recorded output some trades (e.g. rows 0 and 2) have
# no bid/ask quote attached, which is why the quotes are forward-filled later on.
df

Unnamed: 0,trade_price,trade_volume,bid-price,bid-volume,ask-price,ask-volume,index
0,56.580000,100.0,,,,,2010-01-04 14:30:00.970999552
1,56.566667,679.0,56.56,8.0,56.86,2.0,2010-01-04 14:30:01.020001024
2,56.565000,300.0,,,,,2010-01-04 14:30:01.032999936
3,56.570000,345.0,56.56,5.0,56.86,2.0,2010-01-04 14:30:01.248999680
4,56.575000,200.0,56.56,5.0,56.86,2.0,2010-01-04 14:30:01.273000192
...,...,...,...,...,...,...,...
1656405,54.915000,1800.0,54.90,242.0,54.92,30.0,2010-12-31 20:59:59.797999360
1656406,54.920000,100.0,54.90,242.0,54.92,29.0,2010-12-31 20:59:59.900999424
1656407,54.900000,500.0,54.90,207.0,54.94,14.0,2010-12-31 20:59:59.965999872
1656408,54.900000,1100.0,,,,,2010-12-31 20:59:59.970999552


In [19]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
# Column assignments made through `events` before it is rebound also show up in `df`.
events =df

In [21]:
# Quote rule, step by step (mirrors the start of compute_trade_sign):
#   mid = average of bid and ask, gaps forward-filled, rows still incomplete dropped,
#   s   = sign(trade_price - mid): +1 buy, -1 sell, 0 trade at the mid-price.
# `assign` builds a new frame, so `df` (which `events` aliases) keeps its raw columns,
# and `ffill()` replaces the deprecated fillna(method="ffill").
events = (
    events.assign(mid=(events["bid-price"] + events["ask-price"]) * 0.5)
    .ffill()
    .dropna()
    .assign(s=lambda frame: np.sign(frame["trade_price"] - frame["mid"]))
)

In [30]:
# Trades the quote rule cannot sign: trade price equal to the mid-price (s == 0).
events[(events.s==0)]

Unnamed: 0,trade_price,trade_volume,bid-price,bid-volume,ask-price,ask-volume,index,mid,s
28,56.780,100.0,56.75,2.0,56.81,2.0,2010-01-04 14:30:41.065999872,56.780,0.0
72,56.970,100.0,56.94,2.0,57.00,6.0,2010-01-04 14:32:02.780999936,56.970,0.0
73,57.000,200.0,56.94,6.0,57.06,2.0,2010-01-04 14:32:06.056000512,57.000,0.0
74,57.000,200.0,56.94,7.0,57.06,2.0,2010-01-04 14:32:06.064999424,57.000,0.0
116,56.940,100.0,56.93,1.0,56.95,1.0,2010-01-04 14:33:18.514000384,56.940,0.0
...,...,...,...,...,...,...,...,...,...
1656345,54.930,363.0,54.92,85.0,54.94,27.0,2010-12-31 20:59:47.103000064,54.930,0.0
1656346,54.930,200.0,54.92,85.0,54.94,27.0,2010-12-31 20:59:47.197999872,54.930,0.0
1656388,54.925,1100.0,54.92,29.0,54.93,87.0,2010-12-31 20:59:57.360999936,54.925,0.0
1656389,54.920,990.0,54.91,201.0,54.93,17.0,2010-12-31 20:59:57.589999872,54.920,0.0


In [32]:
# Inspect the first rows after forward-fill and dropna. In the recorded output row 0
# is gone (its quotes were missing with nothing earlier to fill from) and row 2
# carries the quotes of row 1.
events.head(50)

Unnamed: 0,trade_price,trade_volume,bid-price,bid-volume,ask-price,ask-volume,index,mid,s
1,56.566667,679.0,56.56,8.0,56.86,2.0,2010-01-04 14:30:01.020001024,56.71,-1.0
2,56.565,300.0,56.56,8.0,56.86,2.0,2010-01-04 14:30:01.032999936,56.71,-1.0
3,56.57,345.0,56.56,5.0,56.86,2.0,2010-01-04 14:30:01.248999680,56.71,-1.0
4,56.575,200.0,56.56,5.0,56.86,2.0,2010-01-04 14:30:01.273000192,56.71,-1.0
5,56.57,100.0,56.56,5.0,56.86,2.0,2010-01-04 14:30:01.347000832,56.71,-1.0
6,56.57,100.0,56.56,4.0,56.75,1.0,2010-01-04 14:30:01.455000320,56.655,-1.0
7,56.58,100.0,56.56,4.0,56.75,1.0,2010-01-04 14:30:01.544999936,56.655,-1.0
8,56.645,200.0,56.56,4.0,56.75,1.0,2010-01-04 14:30:01.558999808,56.655,-1.0
9,56.74,600.0,56.56,4.0,56.75,1.0,2010-01-04 14:30:01.665000192,56.655,1.0
10,56.725,272.0,56.56,4.0,56.75,1.0,2010-01-04 14:30:01.803000064,56.655,1.0


In [33]:
# Tick-test input: price change of every trade relative to the preceding trade.
# `shift(-1) - trade_price` evaluated at row i is price[i+1] - price[i]. Dropping the
# last (NaN) value and labelling the rest with the index from the second row onwards
# stores each change on the later trade of the pair; the first trade gets no entry.
uptick = pd.Series(
    (events["trade_price"].shift(-1) - events["trade_price"]).iloc[:-1].values,
    index=events.iloc[1:].index,
)

## important to set nan first, since False == 0.0 in pandas
# Encoding after the three assignments: NaN = price unchanged (zero tick),
# True = uptick, False = downtick. The order matters: once False has been written,
# a `== 0.0` mask would also match it. The recorded `uptick.head(50)` output reports
# dtype object, i.e. writing booleans into the float series upcast it.
uptick[uptick == 0.0] = np.nan
uptick[uptick > 0.0] = True
uptick[uptick < 0.0] = False


In [35]:
# Check the encoding: True / False for price moves, NaN for zero ticks.
uptick.head(50)

2     False
3      True
4      True
5     False
6       NaN
7      True
8      True
9      True
10    False
11    False
12     True
13      NaN
14     True
15    False
16    False
17     True
18     True
19      NaN
20     True
21    False
22      NaN
23      NaN
24     True
25     True
26     True
27     True
28     True
29     True
30    False
31     True
32     True
33      NaN
34     True
35     True
36    False
37     True
38    False
39     True
40    False
41      NaN
42     True
43      NaN
44      NaN
45    False
46      NaN
47      NaN
48      NaN
49      NaN
50     True
51      NaN
dtype: object

In [39]:
# Zero ticks inherit the direction of the last price change (zero-uptick or
# zero-downtick), which is what the forward fill does. The assignment aligns on the
# index, so the first row of `events`, which has no entry in `uptick`, becomes NaN.
events["uptick"] = uptick.ffill()


In [40]:
# Tick-test correction for the trades the quote rule left at 0:
# uptick True -> +1 (buy), uptick False -> -1 (sell). Rows outside `idx` receive NaN
# from index alignment when the result is stored in the new column.
idx = events.index[(events["s"] == 0.0).to_numpy()]
events["new_s"] = events.loc[idx, "uptick"].astype(int).mul(2).sub(1)


Unnamed: 0,trade_price,trade_volume,bid-price,bid-volume,ask-price,ask-volume,index,mid,s,uptick,new_s
200,57.03,100.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:51.093999872,57.035,-1.0,False,0.0
201,57.05,907.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:51.159000320,57.035,1.0,True,0.0
202,57.05,882.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:51.642000384,57.035,1.0,True,0.0
203,57.05,100.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:53.167999744,57.035,1.0,True,0.0
204,57.05,100.0,57.03,3.0,57.05,20.0,2010-01-04 14:35:53.985000192,57.04,1.0,True,0.0
205,57.05,1000.0,57.03,3.0,57.05,20.0,2010-01-04 14:35:53.993999872,57.04,1.0,False,0.0
206,57.05,1000.0,57.03,4.0,57.05,2.0,2010-01-04 14:35:54.005000960,57.04,1.0,True,0.0
207,57.04,338.0,57.03,1.0,57.05,2.0,2010-01-04 14:35:54.842000384,57.04,0.0,False,0.0
208,57.05,211.0,57.03,1.0,57.06,2.0,2010-01-04 14:35:55.197000192,57.045,1.0,True,0.0
209,57.06,100.0,57.03,1.0,57.06,2.0,2010-01-04 14:35:56.232000000,57.045,1.0,True,0.0


In [44]:
# Rows already signed by the quote rule have no tick-test correction (NaN from the
# index alignment in the cell that creates new_s); 0.0 makes new_s a no-op for them
# when it is added to s.
events["new_s"] = events["new_s"].fillna(0.0)


In [50]:
# Final sign = quote-rule sign + tick-test correction. The column sum gives the same
# Series as the row-wise `events.apply(..., axis=1)` without calling a Python lambda
# once per row.
oui = events["s"] + events["new_s"]

In [62]:
# Sanity check: trades whose combined sign is still 0, i.e. left unsigned by both the
# quote rule and the tick-test correction. The recorded output lists 134649 of them.
# NOTE(review): with uptick filled, the new_s formula yields +1 or -1 for every s == 0
# row, so this count suggests stale kernel state - re-run from a fresh kernel to confirm.
oui[oui==0]

207        0.0
215        0.0
237        0.0
345        0.0
346        0.0
          ... 
1656292    0.0
1656314    0.0
1656388    0.0
1656389    0.0
1656391    0.0
Length: 134649, dtype: float64

In [63]:
# Inspect a window around the first unsigned trade reported above (row 207).
# NOTE(review): in the recorded output row 207 has s == 0.0 and uptick == False but
# new_s == 0.0, whereas the new_s formula would give -1 for that row. The displayed
# values presumably come from out-of-order execution - TODO confirm with Restart & Run All.
events.loc[200:220]

Unnamed: 0,trade_price,trade_volume,bid-price,bid-volume,ask-price,ask-volume,index,mid,s,uptick,new_s
200,57.03,100.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:51.093999872,57.035,-1.0,False,0.0
201,57.05,907.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:51.159000320,57.035,1.0,True,0.0
202,57.05,882.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:51.642000384,57.035,1.0,True,0.0
203,57.05,100.0,57.02,1.0,57.05,44.0,2010-01-04 14:35:53.167999744,57.035,1.0,True,0.0
204,57.05,100.0,57.03,3.0,57.05,20.0,2010-01-04 14:35:53.985000192,57.04,1.0,True,0.0
205,57.05,1000.0,57.03,3.0,57.05,20.0,2010-01-04 14:35:53.993999872,57.04,1.0,False,0.0
206,57.05,1000.0,57.03,4.0,57.05,2.0,2010-01-04 14:35:54.005000960,57.04,1.0,True,0.0
207,57.04,338.0,57.03,1.0,57.05,2.0,2010-01-04 14:35:54.842000384,57.04,0.0,False,0.0
208,57.05,211.0,57.03,1.0,57.06,2.0,2010-01-04 14:35:55.197000192,57.045,1.0,True,0.0
209,57.06,100.0,57.03,1.0,57.06,2.0,2010-01-04 14:35:56.232000000,57.045,1.0,True,0.0


1