<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/fix-classical-clf/notebooks/4.0a-mb-classical_rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run `pip install .` first to install all dependencies.

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

from typing_extensions import Literal
from typing import Callable, List, Tuple

import sys

sys.path.append("..")
from otc.models.objective import set_seed
from data.fs import fs
from otc.models.classical_classifier import ClassicalClassifier


In [None]:
print(fs.buckets)


In [None]:
# set seed for reproducibility
seed = set_seed()


In [None]:
# Only the columns needed by the classical rules and the robustness checks
# are read from the preprocessed parquet files.
columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_PRICE",
    "price_ex_lag",
    "price_all_lag",
    "price_ex_lead",
    "price_all_lead",
    "TRADE_SIZE",
    "ask_size_ex",
    "bid_size_ex",
    "BEST_BID",
    "BEST_ASK",
    "bid_ex",
    "ask_ex",
]

# All three splits live in the same bucket folder and are read identically,
# so the path is built once instead of repeating the call three times.
data_dir = "gs://thesis-bucket-option-trade-classification/data/preprocessed"
train, val, test = (
    pd.read_parquet(
        f"{data_dir}/{split}.parquet",
        engine="fastparquet",
        columns=columns,
    )
    for split in ("train_set_60", "val_set_20", "test_set_20")
)

# NOTE(review): the splits keep their original index (no ignore_index) and
# later cells slice with `.loc[0:1]` — confirm the labels are unique.
X_print = pd.concat([train, val, test])


### Robustness Checks

In [None]:
# Baseline: every trade gets the constant prediction 0.
X_print["rule"] = "Baseline"
X_print["buy_sell_predicted"] = 0

# # prepare columns for printing
# self.X_["ttm"] = (
#     self.X_["EXPIRATION"].dt.to_period("M")
#     - self.X_["QUOTE_DATETIME"].dt.to_period("M")
# ).apply(lambda x: x.n)

# Bin edges and display labels for each robustness criterion.
bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]

bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007", "2008-2010",
    "2011", "2012", "2013", "2014", "2015", "2016", "2017",
]

# Time-to-maturity bins are defined already, but not applied yet (see below).
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]

# Derived columns used as grouping criteria in the robustness checks.
X_print["year"] = X_print["QUOTE_DATETIME"].dt.year
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins=bins_tradesize, labels=trade_size_labels
)
X_print["year_binned"] = pd.cut(
    X_print["year"], bins=bins_years, labels=year_labels
)
# X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?


In [None]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Check robustness of rules by calculating the accuracy for a given
    criterion and rules.

    Works on the notebook-global `X_print`. Missing predictions in
    `buy_sell_predicted` are replaced in place by a random -1 or 1 (equal
    weights) before the accuracy is computed per (rule, criterion) group.

    Example:
    rule               tradesize + tick (all)
    TRADE_SIZE_binned
    (0,1]              62.342988
    (1,3]              63.010958
    (3,5]              63.611660
    (5,11]             58.698850
    >11                55.438648

    Args:
        criterion (str, optional): column of `X_print` to group by.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: DataFrame with accuracy of rules in percent. Rule in
        columns and criterion values in rows.
    """

    # fill others randomly with equal weights; all missing values are drawn
    # in a single call instead of one Python-level call per row
    predicted = X_print["buy_sell_predicted"]
    missing = predicted.isna().to_numpy()
    if missing.any():
        filled = predicted.to_numpy(copy=True)
        filled[missing] = np.random.choice([-1, 1], size=int(missing.sum()))
        X_print["buy_sell_predicted"] = filled

    # calculate accuracy per (rule, criterion) group; rules become columns
    results = (
        X_print.groupby(["rule", criterion])[["buy_sell", "buy_sell_predicted"]]
        .apply(lambda x: accuracy_score(x["buy_sell"], x["buy_sell_predicted"]))
        .unstack(level=0)
        # .assign(avg=lambda x: x.mean(axis=1))
        .mul(100)
    )
    return results


In [None]:
def combine_results(revised: pd.DataFrame, base: pd.DataFrame) -> pd.DataFrame:
    """
    Generate print layout like in Grauer et al.

    Every cell reads "<revised accuracy> (<revised - base>)", both rounded
    to two decimals. Columns of `revised` and `base` are paired by position;
    the result keeps the column names of `base` and is transposed, so rules
    end up in rows. `base` itself is not modified.

    Example:
    TRADE_SIZE_binned  (0,1]          (1,3]          (3,5]
    rule
    tick rule (all)    62.29 (10.38)  62.91 (10.27)  63.54 (11.08)

    Args:
        revised (pd.DataFrame): accuracies of the revised rules.
        base (pd.DataFrame): accuracies of the rules to compare against.

    Returns:
        pd.DataFrame: formatted strings, rules in rows and criterion values
        in columns.
    """
    layout = base.copy()
    for position, base_name in enumerate(layout.columns):
        revised_values = revised[revised.columns[position]]
        accuracy_text = revised_values.round(2).astype(str)
        change_text = (revised_values - base[base_name]).round(2).astype(str)
        layout[base_name] = accuracy_text + " (" + change_text + ")"
    return layout.transpose()


## Classical Rules

### Tick Rule

In [None]:
# Layers: trade size rule ("ex") and tick rule ("all").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("tick", "all"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "tradesize + tick (all)"

# Bind the table to the name the p. 30 check below reads; the bare
# expression on the last line keeps it displayed as cell output.
ts_tick = check_robustness("TRADE_SIZE_binned")
ts_tick

rule,tradesize + tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",62.342988
"(1,3]",63.010958
"(3,5]",63.61166
"(5,11]",58.69885
>11,55.438648


In [None]:
# see p. 30
np.testing.assert_allclose(
    ts_tick.iloc[:, 0].tolist(), [62.29, 62.92, 63.54, 58.64, 55.42], atol=0.5
)


In [None]:
# TODO: result is not consistent with p. 36;
# Group result on p. 35 is not consistent with p. 31.
# Rebind `ts_tick` to the per-year table: the p. 36 check below compares
# `ts_tick` against nine per-year values.
ts_tick = check_robustness("year_binned")
ts_tick


rule,tradesize + tick (all)
year_binned,Unnamed: 1_level_1
2005-2007,64.478519
2008-2010,61.405014
2011,58.806137
2012,58.731031
2013,59.786685
2014,62.254549
2015,58.646937
2016,58.409921
2017,59.190965


In [None]:
# see p. 36
np.testing.assert_allclose(
    ts_tick.iloc[:, 0].tolist(),
    [65.44, 62.26, 59.18, 58.94, 59.74, 61.87, 58.49, 58.48, 59.36],
    atol=1.0,
)

In [None]:
# Layers: trade size rule ("ex") and tick rule ("ex").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("tick", "ex"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
# Label matches the ("tick", "ex") layer and the recorded output; the
# previous label was copied from the tick (all) cell.
X_print["rule"] = "tradesize + tick (ISE)"

check_robustness("TRADE_SIZE_binned")

rule,tradesize + tick (ISE)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",59.208434
"(1,3]",59.7124
"(3,5]",60.101709
"(5,11]",55.577579
>11,51.635248


### Reverse Tick Rule

In [None]:
# Layers: trade size rule ("ex") and reverse tick rule ("all").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("rev_tick", "all"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
# Label matches the ("rev_tick", "all") layer and the recorded output; the
# previous label was copied from the tick (all) cell.
X_print["rule"] = "tradesize + rev tick (all)"

# Bind the table to the name the p. 30 check below reads.
ts_rev_tick = check_robustness("TRADE_SIZE_binned")
ts_rev_tick


rule,tradesize + rev tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",63.122621
"(1,3]",63.52602
"(3,5]",64.523123
"(5,11]",59.130781
>11,54.609329


In [None]:
# see p. 30
np.testing.assert_allclose(
    ts_rev_tick.iloc[:, 0].tolist(), [63.51, 63.87, 64.94, 59.53, 55.05], atol=0.5
)


### Quote Rule

In [None]:
# Layers: trade size rule ("ex"), quote rule ("best") and quote rule ("ex").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("quote", "best"),
        ("quote", "ex"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + Quote (NBBO) + Quote (ISE)"

# Bind the table to the name the p. 30 check below reads.
quote_quote = check_robustness("TRADE_SIZE_binned")
quote_quote

rule,Tradesize + Quote (NBBO) + Quote (ISE)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.946554
"(1,3]",78.279782
"(3,5]",79.981009
"(5,11]",72.585307
>11,69.58424


In [None]:
# see p. 30
np.testing.assert_allclose(
    quote_quote.iloc[:, 0].tolist(), [74.94, 78.28, 79.97, 72.59, 69.58], atol=0.5
)


### LR Algorithm

In [None]:
# Layers: trade size rule ("ex") and LR algorithm ("best").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("lr", "best"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + LR (NBBO)"

# Bind the table to the name the p. 30 check below reads.
ts_lr = check_robustness("TRADE_SIZE_binned")
ts_lr

rule,tradesize + LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.395626
"(1,3]",77.728691
"(3,5]",79.411614
"(5,11]",72.152698
>11,69.191561


In [None]:
# see p. 30
np.testing.assert_allclose(
    ts_lr.iloc[:, 0].tolist(), [74.09, 77.31, 78.96, 71.57, 68.40], atol=0.8
)


### Reverse LR Algorithm

In [None]:
# Layers: trade size rule ("ex") and reverse LR algorithm ("best").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("rev_lr", "best"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + rev. LR (NBBO)"

# Bind the table to the name read by the p. 30 check and by the
# `combine_results` call further down.
ts_rev_lr = check_robustness("TRADE_SIZE_binned")
ts_rev_lr

rule,tradesize + reverse LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.531573
"(1,3]",77.854921
"(3,5]",79.573399
"(5,11]",72.279363
>11,69.240023


In [None]:
# see p. 30
np.testing.assert_allclose(
    ts_rev_lr.iloc[:, 0].tolist(), [74.64, 77.95, 79.68, 72.38, 69.33], atol=0.5
)


In [None]:
# Reverse LR algorithm ("best") on its own, without the trade size rule.
clf = ClassicalClassifier(
    layers=[
        ("rev_lr", "best"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "reverse LR (NBBO)"

# Bind the table to the name the p. 30 check below reads.
rev_lr = check_robustness("TRADE_SIZE_binned")
rev_lr

rule,reverse LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",59.622476
"(1,3]",62.576818
"(3,5]",63.012173
"(5,11]",64.53751
>11,69.879766


In [None]:
# see p. 30
np.testing.assert_allclose(
    rev_lr.iloc[:, 0].tolist(), [59.48, 62.43, 62.83, 64.43, 69.91], atol=0.5
)


In [None]:
# Compare reverse LR with the trade size rule against plain reverse LR.
# The base table is `rev_lr`, the name the plain reverse LR check reads;
# `rev_lr_best` is not defined anywhere in the notebook.
combine_results(ts_rev_lr, rev_lr)


TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
reverse LR (NBBO),74.53 (14.91),77.85 (15.28),79.57 (16.56),72.28 (7.74),69.24 (-0.64)


### EMO Algorithm

In [None]:
# Layers: trade size rule ("ex") and EMO algorithm ("best").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("emo", "best"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(
    X=X_print.loc[0:1, :],
    y=X_print["buy_sell"].loc[0:1],
)
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + EMO (NBBO)"

# Accuracy per trade size bin, shown as cell output.
check_robustness("TRADE_SIZE_binned")

rule,tradesize + EMO (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",66.831252
"(1,3]",69.821885
"(3,5]",71.699478
"(5,11]",64.965293
>11,61.842831


### Reverse EMO Algorithm

In [None]:
# Layers: trade size rule ("ex") and reverse EMO algorithm ("best").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("rev_emo", "best"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(
    X=X_print.loc[0:1, :],
    y=X_print["buy_sell"].loc[0:1],
)
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + rev. EMO (NBBO)"

# Accuracy per trade size bin, shown as cell output.
check_robustness("TRADE_SIZE_binned")

rule,tradesize + reverse EMO (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",67.21739
"(1,3]",70.043514
"(3,5]",72.016237
"(5,11]",65.061046
>11,61.453471


In [None]:
# Layers: trade size rule ("ex"), reverse LR ("best"), reverse LR ("ex")
# and depth rule ("ex").
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("rev_lr", "best"),
        ("rev_lr", "ex"),
        ("depth", "ex"),
    ],
    random_state=seed,
)
# NOTE(review): fit only sees the rows labelled 0..1 — presumably just to
# initialise the estimator before predicting; confirm.
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "tradesize + reverse LR (NBBO, ISE) + depth"

# Bind the table to the name the p. 30 check below reads.
ts_depth_rev_lr = check_robustness("TRADE_SIZE_binned")
ts_depth_rev_lr

rule,"tradesize + reverse LR (NBBO, ISE) + depth"
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.616576
"(1,3]",77.940045
"(3,5]",79.661022
"(5,11]",72.359943
>11,69.319176


In [None]:
# see p. 30
np.testing.assert_allclose(
    ts_depth_rev_lr.iloc[:, 0].tolist(), [75.67, 79.04, 80.72, 73.30, 70.24], atol=1.2
)


In [None]:
check_robustness("year_binned")


rule,"tradesize + reverse LR (NBBO, ISE) + depth"
year_binned,Unnamed: 1_level_1
2005-2007,80.87105
2008-2010,79.77494
2011,78.630053
2012,77.422361
2013,75.988921
2014,71.081818
2015,64.022345
2016,66.500689
2017,68.115142
