<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/3.0-mb-classical_rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Install the packages this notebook imports into the current runtime.
# NOTE(review): the captured output of this cell reports that no matching
# distribution was found for numpy==1.23.4 -- confirm the pin against the
# runtime's Python version.
# NOTE(review): fastparquet and typing_extensions are not pinned, unlike the
# other packages.
!pip install gcsfs==2022.10.0
!pip install numpy==1.23.4
!pip install pandas==1.5.1
!pip install fastparquet
!pip install scikit-learn==1.1.3
!pip install typing_extensions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement numpy==1.23.4 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0rc1, 1.13.0rc2, 1.13.0, 1.13.1, 1.13.3, 1.14.0rc1, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0rc1, 1.15.0rc2, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0rc1, 1.16.0rc2, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0rc1, 1.17.0rc2, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0rc1, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0rc1, 1.19.0rc2, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 1.19.5, 1.20.0rc1, 1.20.0rc2, 1.20.0, 1.20.1, 1.20.2, 1

In [26]:
import os
import random

import gcsfs
import google.auth
from google.colab import auth


import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

from typing_extensions import Literal
from typing import Callable, List, Tuple

In [9]:
# Authenticate the Colab user, fetch the default Google credentials and build
# a GCS filesystem handle for the "thesis" project with them.
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)
# URL scheme prefix of Google Cloud Storage paths.
fs_prefix = "gs://"


In [10]:
# set fixed seed
def seed_everything(seed):
    """
    Seed the random number sources used in this notebook for reproducibility.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random`` module as well as
    NumPy's global random state.

    Args:
        seed (int): seed value applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    # check_robustness breaks ties with np.random.choice, which draws from
    # NumPy's global state; without seeding it results differ between runs.
    np.random.seed(seed)


# fixed seed for this notebook; applied once before any data is loaded
seed = 42
seed_everything(seed)


In [11]:
# Columns read from the preprocessed parquet files: true label, timestamps,
# trade price and size, lagged / subsequent prices and the quotes with sizes.
columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_PRICE",
    "price_ex_lag",
    "price_all_lag",
    "price_ex_lead",
    "price_all_lead",
    "TRADE_SIZE",
    "ask_size_ex",
    "bid_size_ex",
    "BEST_BID",
    "BEST_ASK",
    "bid_ex",
    "ask_ex",
]

# Locations of the train / validation / test splits, in that order.
split_paths = (
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet",
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet",
)

# Read every split with identical options; unpacking keeps the three names.
train, val, test = (
    pd.read_parquet(path, engine="fastparquet", columns=columns)
    for path in split_paths
)

# The rules are evaluated on all three splits stacked into a single frame.
X_print = pd.concat([train, val, test])


### Robustness

In [12]:
# add baseline results: every row is labelled "Baseline" and gets the constant
# prediction 0; predict_rules() later overwrites both columns.
X_print["rule"] = "Baseline"
X_print["buy_sell_predicted"] = 0

# Disabled: time to maturity in months between quote date and expiration.
# # prepare columns for printing
# X_print["ttm"] = (
#     X_print["EXPIRATION"].dt.to_period("M")
#     - X_print["QUOTE_DATETIME"].dt.to_period("M")
# ).apply(lambda x: x.n)

# calendar year of the quote timestamp, used for the year buckets below
X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

# Trade size buckets. pd.cut uses right-closed intervals by default, so the
# first bucket is (-1, 1] although it is labelled "(0,1]".
bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

# Year buckets: (2004, 2007] and (2007, 2010] are pooled, later years are
# kept as single-year buckets.
bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

# Time-to-maturity buckets in months; currently only referenced by the
# disabled pd.cut call below.
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
# X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?


In [13]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Check robustness of rules by calculating the accuracy for a given
    criterion and rules.

    Reads the module-level frame ``X_print`` (columns ``rule``, ``buy_sell``,
    ``buy_sell_predicted`` and ``criterion``). Side effect: predictions that
    are still missing are replaced in ``X_print`` by a random -1 / 1, so
    repeated calls evaluate the same filled predictions.

    Example:
        rule               Baseline
        TRADE_SIZE_binned
        (0,1]              71.0966
        (1,3]              71.7664
        (3,5]              71.5195
        (5,11]             69.9428
        >11                68.8348

    Args:
        criterion (str, optional): column of ``X_print`` to group by.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: accuracy of rules in percent. Rule in columns and
        criterion values in rows.
    """
    # fill others randomly with equal weights; a single vectorised draw
    # replaces one Python-level np.random.choice call per row
    unclassified = X_print["buy_sell_predicted"].isna().to_numpy()
    if unclassified.any():
        X_print["buy_sell_predicted"] = np.where(
            unclassified,
            np.random.choice([-1, 1], size=len(X_print)),
            X_print["buy_sell_predicted"],
        )

    # accuracy is the share of exact matches between label and prediction,
    # so a grouped mean of the match indicator suffices
    is_correct = (X_print["buy_sell"] == X_print["buy_sell_predicted"]).to_numpy()

    # calculate the accuracy per rule and criterion value, rules in columns
    results = (
        X_print[["rule", criterion]]
        .assign(correct=is_correct)
        .groupby(["rule", criterion])["correct"]
        .mean()
        .unstack(level=0)
        .mul(100)
    )
    return results


In [14]:
def combine_results(revised: pd.DataFrame, base: pd.DataFrame) -> pd.DataFrame:
    """
    Format revised accuracies together with their change against a base rule.

    Columns of ``revised`` and ``base`` are matched by position. Every cell
    becomes "<revised accuracy> (<revised - base>)", both rounded to two
    decimals, and the table is transposed so that rules end up in rows. The
    layout follows Grauer et al.

    Example:
    TRADE_SIZE_binned	(0,1]	(1,3]	(3,5]	(5,11]	>11
    rule
    tick rule (all)	62.29 (10.38)	62.91 (10.27)	63.54 (11.08)	58.64 (5.62)	55.41 (-0.94)

    Args:
        revised (pd.DataFrame): accuracies of the revised rule.
        base (pd.DataFrame): accuracies of the base rule; its column labels
        are kept in the result.

    Returns:
        pd.DataFrame: formatted strings, rules in rows and criterion values
        in columns.
    """
    formatted = base.copy()
    for position in range(len(formatted.columns)):
        base_name = formatted.columns[position]
        revised_name = revised.columns[position]
        accuracy = revised[revised_name]
        change = accuracy - base[base_name]
        accuracy_text = accuracy.round(2).astype(str)
        change_text = change.round(2).astype(str)
        formatted[base_name] = accuracy_text + " (" + change_text + ")"
    return formatted.T


## Classical rules

In [15]:
# Rename the best bid / ask columns to the f"ask_{subset}" / f"bid_{subset}"
# pattern so that the quote based rules can address them via subset="best".
mapping = {"BEST_ASK": "ask_best", "BEST_BID": "bid_best"}
X_print.rename(columns=mapping, inplace=True)


In [16]:
def tick(subset: Literal["all", "ex"]) -> np.ndarray:
    """
    Classify a trade as a buy (sell) if its trade price is above (below) the closest different price of a previous trade.

    The previous price is read from the column ``price_{subset}_lag`` of the
    module-level frame ``X_print``.

    Args:
        subset (Literal["all", "ex"]): subset i. e., 'all' or 'ex'.

    Returns:
        np.ndarray: result of tick rule (1 = buy, -1 = sell). Can be np.NaN,
        if the prices are equal or the previous price is missing.
    """
    # FIXME: Discuss with C. Grauer how to handle cases where e. g.,
    # "price_ex_lag" is missing.
    return np.where(
        X_print["TRADE_PRICE"] > X_print[f"price_{subset}_lag"],
        1,
        np.where(X_print["TRADE_PRICE"] < X_print[f"price_{subset}_lag"], -1, np.nan),
    )


In [17]:
def rev_tick(subset: Literal["all", "ex"]) -> np.ndarray:
    """
    Reverse tick rule: compare the trade price with the price of a subsequent trade.

    A trade is a sell (-1) if the subsequent price in ``price_{subset}_lead``
    is above the trade price and a buy (1) if it is below.

    Args:
        subset (Literal["all", "ex"]): subset i. e., 'all' or 'ex'.

    Returns:
        np.ndarray: result of reverse tick rule. Can be np.NaN, if both
        prices are equal or the subsequent price is missing.
    """
    next_price = X_print[f"price_{subset}_lead"]
    trade_price = X_print["TRADE_PRICE"]
    price_rises_afterwards = next_price > trade_price
    price_falls_afterwards = next_price < trade_price
    buy_or_unknown = np.where(price_falls_afterwards, 1, np.nan)
    return np.where(price_rises_afterwards, -1, buy_or_unknown)


In [18]:
def quote(subset: Literal["best", "ex"]) -> np.ndarray:
    """
    Quote rule: compare the trade price with the midpoint of the bid and ask quote.

    Trades above the midpoint are buys (1), trades below it are sells (-1).
    Trades executed at the midspread are not classified.

    Args:
        subset (Literal["ex", "best"]): subset i. e., 'ex' or 'best'.

    Returns:
        np.ndarray: result of quote rule. Can be np.NaN.
    """
    ask = X_print[f"ask_{subset}"]
    bid = X_print[f"bid_{subset}"]
    midpoint = (ask + bid) * 0.5
    trade_price = X_print["TRADE_PRICE"]
    above_mid = trade_price > midpoint
    below_mid = trade_price < midpoint
    sell_or_unknown = np.where(below_mid, -1, np.nan)
    return np.where(above_mid, 1, sell_or_unknown)


In [19]:
def lr(subset: Literal["best", "ex"]) -> np.ndarray:
    """
    Lee and Ready algorithm: quote rule first, tick test for the remainder.

    Trades above (below) the midpoint are classified as buys (sells) by the
    quote rule on ``subset``. Trades the quote rule leaves unclassified are
    passed to the tick test, which is always evaluated on the 'ex' subset.

    Adapted from Lee and Ready (1991).
    Args:
        subset (Literal["ex", "best"]): subset i. e., 'ex' or 'best'.

    Returns:
        np.ndarray: result of the lee and ready algorithm with tick rule. Can be np.NaN.
    """
    by_quote = quote(subset)
    quote_undecided = np.isnan(by_quote)
    return np.where(quote_undecided, tick("ex"), by_quote)


In [65]:
def rev_lr(subset: Literal["best", "ex"]) -> np.ndarray:
    """
    Lee and Ready algorithm with the reverse tick test as fallback.

    Trades above (below) the midpoint are classified as buys (sells) by the
    quote rule on ``subset``. Trades the quote rule leaves unclassified are
    passed to the reverse tick test, which is always evaluated on the 'ex'
    subset.

    Adapted from Lee and Ready (1991).
    Args:
        subset (Literal["ex", "best"]): subset i. e., 'ex' or 'best'.

    Returns:
        np.ndarray: result of the lee and ready algorithm with reverse tick rule. Can be np.NaN.
    """
    by_quote = quote(subset)
    quote_undecided = np.isnan(by_quote)
    return np.where(quote_undecided, rev_tick("ex"), by_quote)


In [21]:
def emo(subset: Literal["best", "ex"]) -> np.ndarray:
    """
    EMO algorithm: quote rule for trades at the quotes, tick test otherwise.

    A trade executed at exactly one of the ask and bid quote of ``subset`` is
    classified by the quote rule, i. e., as a buy (sell) at the ask (bid).
    All other trades, including those where the price equals both quotes,
    are classified by the tick test on the 'ex' subset.

    Adapted from Ellis et al. (2000).
    Args:
        subset (Literal["ex", "best"]): subset i. e., 'ex' or 'best'.

    Returns:
        np.ndarray: result of the emo algorithm with tick rule. Can be np.NaN.
    """
    trade_price = X_print["TRADE_PRICE"]
    hits_ask = trade_price == X_print[f"ask_{subset}"]
    hits_bid = trade_price == X_print[f"bid_{subset}"]
    # exclusive or of two boolean masks: true if exactly one quote is hit
    hits_one_quote = hits_ask != hits_bid
    return np.where(hits_one_quote, quote(subset), tick("ex"))


In [22]:
def rev_emo(subset: Literal["best", "ex"]) -> np.ndarray:
    """
    EMO algorithm with the reverse tick test as fallback.

    A trade executed at exactly one of the ask and bid quote of ``subset`` is
    classified by the quote rule, i. e., as a buy (sell) at the ask (bid).
    All other trades, including those where the price equals both quotes,
    are classified by the reverse tick test on the 'ex' subset.

    Adapted from Ellis et al. (2000).
    Args:
        subset (Literal["ex", "best"]): subset i. e., 'ex' or 'best'.

    Returns:
        np.ndarray: result of the emo algorithm with reverse tick rule. Can be np.NaN.
    """
    trade_price = X_print["TRADE_PRICE"]
    hits_ask = trade_price == X_print[f"ask_{subset}"]
    hits_bid = trade_price == X_print[f"bid_{subset}"]
    # exclusive or of two boolean masks: true if exactly one quote is hit
    hits_one_quote = hits_ask != hits_bid
    return np.where(hits_one_quote, quote(subset), rev_tick("ex"))


In [23]:
def trade_size(subset: Literal["ex"]) -> np.ndarray:
    """
    Classify a trade as a buy (sell) if the trade size matches exactly the bid (ask) quote size.

    Trades are left unclassified if the bid and ask quote sizes are equal,
    as the match is ambiguous then, or if the trade size matches neither
    quote size.

    Adapted from Grauer et al. (2022).
    Args:
        subset (Literal["ex"]): subset i. e., 'ex'.

    Returns:
        np.ndarray: result of the trade size rule (1 = buy, -1 = sell). Can be np.NaN.
    """
    bid_size = X_print[f"bid_size_{subset}"]
    ask_size = X_print[f"ask_size_{subset}"]
    bid_eq_ask = ask_size == bid_size

    # `~` is the logical negation of a boolean mask; unary minus only works
    # through special-casing in pandas and fails on NumPy boolean arrays.
    ts_eq_bid = (X_print["TRADE_SIZE"] == bid_size) & ~bid_eq_ask
    ts_eq_ask = (X_print["TRADE_SIZE"] == ask_size) & ~bid_eq_ask

    return np.where(ts_eq_bid, 1, np.where(ts_eq_ask, -1, np.nan))


In [24]:
def depth(subset: Literal["ex"]) -> np.ndarray:
    """
    Depth rule: classify as buy (sell), if the ask size (bid size) exceeds the bid size (ask size).

    The rule is meant for midspread trades. The function itself compares the
    quote sizes of every row; restricting it to midspread trades is left to
    the rules applied before it.

    Adapted from (Grauer et al., 2022).
    Args:
        subset (Literal["ex"]): subset i. e., 'ex'.

    Returns:
        np.ndarray: result of the depth rule. Can be np.NaN, if both sizes
        are equal or one of them is missing.
    """
    ask_size = X_print[f"ask_size_{subset}"]
    bid_size = X_print[f"bid_size_{subset}"]
    deeper_ask = ask_size > bid_size
    deeper_bid = ask_size < bid_size
    sell_or_unknown = np.where(deeper_bid, -1, np.nan)
    return np.where(deeper_ask, 1, sell_or_unknown)


In [28]:
def predict_rules(layers: List[Tuple[Callable, str]], name: str = "default") -> None:
    """
    Apply a stack of rules and store the prediction in ``X_print``.

    The rules are applied in list order. A rule only fills the rows that all
    previous rules left unclassified (NaN). The columns ``rule`` and
    ``buy_sell_predicted`` of the module-level frame ``X_print`` are
    overwritten.

    Args:
        layers (List[Tuple[Callable,str]]): rules to be combined, each given
        as the rule function and the subset passed to it.
        name (str, optional): name of rule combination. Defaults to "default".
    """
    X_print["rule"] = name
    # start with every trade unclassified
    X_print["buy_sell_predicted"] = np.nan
    for rule_fn, rule_subset in layers:
        still_unclassified = X_print["buy_sell_predicted"].isna()
        layer_prediction = rule_fn(rule_subset)
        X_print["buy_sell_predicted"] = np.where(
            still_unclassified, layer_prediction, X_print["buy_sell_predicted"]
        )


In [71]:
# Trade size rule first, tick rule on the 'all' subset for the remainder;
# accuracy in percent per trade size bucket.
predict_rules(layers=[(trade_size, "ex"), (tick, "all")], name="tradesize + tick (all)")
ts_tick = check_robustness("TRADE_SIZE_binned")
ts_tick


rule,tradesize + tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",62.342988
"(1,3]",63.010958
"(3,5]",63.61166
"(5,11]",58.69885
>11,55.438648


In [47]:
# see p. 30
# Accuracies per trade size bucket must match the reference values within
# 0.5 percentage points.
np.testing.assert_allclose(ts_tick.iloc[:,0].tolist(), [62.29,62.92,63.54,58.64,55.42], atol=0.5)

In [72]:
# TODO: result is not consistent with p. 36; 
# Group result on p. 35 is not consistent with p. 31.
# Same predictions as above, now grouped by year bucket. This rebinds
# `ts_tick`, replacing the trade size result.
ts_tick = check_robustness("year_binned")
ts_tick

rule,tradesize + tick (all)
year_binned,Unnamed: 1_level_1
2005-2007,64.478519
2008-2010,61.405014
2011,58.806137
2012,58.731031
2013,59.786685
2014,62.254549
2015,58.646937
2016,58.409921
2017,59.190965


In [73]:
# accuracies of the first (only) rule column as a plain list, one per year bucket
ts_tick.iloc[:,0].tolist()

[64.47851871041864,
 61.405014425342365,
 58.80613654112125,
 58.73103060277525,
 59.78668547770959,
 62.25454911896809,
 58.64693674631162,
 58.40992133767341,
 59.190964781584086]

In [77]:
# see p. 36
# Accuracies per year bucket must match the reference values within
# 1.0 percentage points.
np.testing.assert_allclose(ts_tick.iloc[:,0].tolist(), [65.44,62.26,59.18,58.94,59.74,61.87,58.49,58.48,59.36], atol=1.0)

In [None]:
# Trade size rule first, tick rule on the 'ex' subset for the remainder.
# NOTE(review): rebinds `ts_tick`, which held the tick (all) results before.
predict_rules(layers=[(trade_size, "ex"), (tick, "ex")], name="tradesize + tick (ISE)")
ts_tick = check_robustness("TRADE_SIZE_binned")
ts_tick


rule,tradesize + tick (ISE)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",59.208434
"(1,3]",59.7124
"(3,5]",60.101709
"(5,11]",55.577579
>11,51.635248


In [49]:
# Trade size rule first, reverse tick rule on the 'all' subset for the
# remainder; accuracy in percent per trade size bucket.
predict_rules(
    layers=[(trade_size, "ex"), (rev_tick, "all")], name="tradesize + rev tick (all)"
)
ts_rev_tick = check_robustness("TRADE_SIZE_binned")
ts_rev_tick

rule,tradesize + rev tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",63.122621
"(1,3]",63.52602
"(3,5]",64.523123
"(5,11]",59.130781
>11,54.609329


In [50]:
# see p. 30
# Accuracies per trade size bucket must match the reference values within
# 0.5 percentage points.
np.testing.assert_allclose(ts_rev_tick.iloc[:,0].tolist(), [63.51,63.87,64.94,59.53,55.05], atol=0.5)

In [51]:
# Trade size rule first, then the quote rule on the 'best' quotes and finally
# the quote rule on the 'ex' quotes for trades that are still unclassified.
predict_rules(
    layers=[(trade_size, "ex"), (quote, "best"), (quote, "ex")],
    name="Tradesize + Quote (NBBO) + Quote (ISE)",
)
quote_quote = check_robustness("TRADE_SIZE_binned")
quote_quote


rule,Tradesize + Quote (NBBO) + Quote (ISE)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.946554
"(1,3]",78.279782
"(3,5]",79.981009
"(5,11]",72.585307
>11,69.58424


In [52]:
# see p. 30
# Accuracies per trade size bucket must match the reference values within
# 0.5 percentage points.
np.testing.assert_allclose(quote_quote.iloc[:,0].tolist(), [74.94,78.28,79.97,72.59,69.58], atol=0.5)

In [53]:
# Trade size rule first, Lee and Ready algorithm on the 'best' quotes for the
# remainder; accuracy in percent per trade size bucket.
predict_rules(layers=[(trade_size, "ex"), (lr, "best")], name="tradesize + LR (NBBO)")
ts_lr = check_robustness("TRADE_SIZE_binned")
ts_lr


rule,tradesize + LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.395626
"(1,3]",77.728691
"(3,5]",79.411614
"(5,11]",72.152698
>11,69.191561


In [57]:
# see p. 30
# Accuracies per trade size bucket must match the reference values within
# 0.8 percentage points (wider tolerance than for the other rules).
np.testing.assert_allclose(ts_lr.iloc[:,0].tolist(), [74.09,77.31,78.96,71.57,68.40], atol=0.8)

In [58]:
# Trade size rule first, reverse Lee and Ready algorithm on the 'best' quotes
# for the remainder; accuracy in percent per trade size bucket.
predict_rules(
    layers=[(trade_size, "ex"), (rev_lr, "best")], name="tradesize + reverse LR (NBBO)"
)
ts_rev_lr = check_robustness("TRADE_SIZE_binned")
ts_rev_lr


rule,tradesize + reverse LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.531573
"(1,3]",77.854921
"(3,5]",79.573399
"(5,11]",72.279363
>11,69.240023


In [59]:
# see p. 30
# Accuracies per trade size bucket must match the reference values within
# 0.5 percentage points.
np.testing.assert_allclose(ts_rev_lr.iloc[:,0].tolist(), [74.64,77.95,79.68,72.38,69.33], atol=0.5)

In [60]:
predict_rules(layers=[(rev_lr, "best")], name="reverse LR (NBBO)")
rev_lr = check_robustness("TRADE_SIZE_binned")
rev_lr


rule,reverse LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",59.622476
"(1,3]",62.576818
"(3,5]",63.012173
"(5,11]",64.53751
>11,69.879766


In [61]:
# see p. 30
np.testing.assert_allclose(rev_lr.iloc[:,0].tolist(), [59.48,62.43,62.83,64.43,69.91], atol=0.5)

In [62]:
combine_results(ts_rev_lr, rev_lr)

TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
reverse LR (NBBO),74.53 (14.91),77.85 (15.28),79.57 (16.56),72.28 (7.74),69.24 (-0.64)


In [63]:
# Trade size rule first, EMO algorithm on the 'best' quotes for the
# remainder; accuracy in percent per trade size bucket.
predict_rules(layers=[(trade_size, "ex"), (emo, "best")], name="tradesize + EMO (NBBO)")
ts_emo = check_robustness("TRADE_SIZE_binned")
ts_emo


rule,tradesize + EMO (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",66.831252
"(1,3]",69.821885
"(3,5]",71.699478
"(5,11]",64.965293
>11,61.842831


In [None]:
# Trade size rule first, reverse EMO algorithm on the 'best' quotes for the
# remainder; accuracy in percent per trade size bucket.
predict_rules(
    layers=[(trade_size, "ex"), (rev_emo, "best")],
    name="tradesize + reverse EMO (NBBO)",
)
ts_rev_emo = check_robustness("TRADE_SIZE_binned")
ts_rev_emo


rule,tradesize + reverse EMO (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",67.21739
"(1,3]",70.043514
"(3,5]",72.016237
"(5,11]",65.061046
>11,61.453471


In [66]:
# Full stack: trade size rule, reverse Lee and Ready algorithm on the 'best'
# and then on the 'ex' quotes, and the depth rule for what is still left.
# Requires `rev_lr` to refer to the rule function at this point.
predict_rules(
    layers=[(trade_size, "ex"), (rev_lr, "best"), (rev_lr, "ex"), (depth, "ex")],
    name="tradesize + reverse LR (NBBO, ISE) + depth",
)
ts_depth_rev_lr = check_robustness("TRADE_SIZE_binned")
ts_depth_rev_lr

rule,"tradesize + reverse LR (NBBO, ISE) + depth"
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.616508
"(1,3]",77.940045
"(3,5]",79.661481
"(5,11]",72.359737
>11,69.319072


In [69]:
# see p. 30
# Accuracies per trade size bucket must match the reference values within
# 1.2 percentage points (widest tolerance used in this notebook).
np.testing.assert_allclose(ts_depth_rev_lr.iloc[:,0].tolist(), [75.67,79.04,80.72,73.30,70.24], atol=1.2)

In [70]:
# Predictions of the most recent predict_rules call, now grouped by year
# bucket. This rebinds `ts_depth_rev_lr`, replacing the trade size result.
ts_depth_rev_lr = check_robustness("year_binned")
ts_depth_rev_lr

rule,"tradesize + reverse LR (NBBO, ISE) + depth"
year_binned,Unnamed: 1_level_1
2005-2007,80.87105
2008-2010,79.77494
2011,78.630053
2012,77.422361
2013,75.988921
2014,71.081818
2015,64.022345
2016,66.500689
2017,68.115142
