Run `pip install .` first to install all dependencies.

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

import sys

sys.path.append("..")
from otc.models.classical_classifier import ClassicalClassifier


In [3]:
columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_PRICE",
    "price_ex_lag",
    "price_all_lag",
    "price_ex_lead",
    "price_all_lead",
    "TRADE_SIZE",
    "ask_size_ex",
    "bid_size_ex",
    "BEST_BID",
    "BEST_ASK",
    "bid_ex",
    "ask_ex",
]

train = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet",
    engine="fastparquet",
    columns=columns,
)
val = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
    engine="fastparquet",
    columns=columns,
)
test = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet",
    engine="fastparquet",
    columns=columns,
)

X_print = pd.concat([train, val, test])




In [4]:
test_index = test.index
del train, val, test

### Robustness Checks

In [5]:
# add baseline results
X_print["rule"] = "Baseline"
X_print["buy_sell_predicted"] = 0

# # prepare columns for printing
# self.X_["ttm"] = (
#     self.X_["EXPIRATION"].dt.to_period("M")
#     - self.X_["QUOTE_DATETIME"].dt.to_period("M")
# ).apply(lambda x: x.n)

X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
# X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?


In [6]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Check robustness of rules by calculating the accuracy for a given
    criterion and rules.

    Example:
    rule		Baseline
    TRADE_SIZE_binned
    (0,1]	  0.710966
    (1,3]	  0.717664
    (3,5]	  0.715195
    (5,11]	0.699428
    >11	  	0.688348

    Args:
        criterion (str, optional): criterion to check robustness for.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: DataFrame with accuracy of rules. Rule in columns and
        criterion values in rows.
    """

    # fill others randomly with equal weights
    X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(
        lambda l: l if not np.isnan(l) else np.random.choice([-1, 1])
    )

    # cuculate average over columns if multiple subsets are combined
    results = (
        X_print.groupby(["rule", criterion])[["buy_sell", "buy_sell_predicted"]]
        .apply(lambda x: accuracy_score(x["buy_sell"], x["buy_sell_predicted"]))
        .unstack(level=0)
        # .assign(avg=lambda x: x.mean(axis=1))
        .mul(100)
    )
    return results


In [7]:
def combine_results(revised: pd.DataFrame, base: pd.DataFrame) -> pd.DataFrame:
    """
    Generate print layout like in Grauer et al.

    Example:
    TRADE_SIZE_binned	(0,1]	(1,3]	(3,5]	(5,11]	>11
    rule
    tick rule (all)	62.29 (10.38)	62.91 (10.27)	63.54 (11.08)	58.64 (5.62)	55.41 (-0.94)
    """
    combo = base.copy()
    for i, col in enumerate(combo.columns):
        col_other = revised.columns[i]
        combo[col] = (
            revised[col_other].round(2).astype(str)
            + " ("
            + (revised[col_other] - base[col]).round(2).astype(str)
            + ")"
        )
    return combo.T


## Classical Rules

In [8]:
seed = 42

### Tick Rule

In [9]:
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("tick", "all"),
    ],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "tradesize + tick (all)"

result = check_robustness("TRADE_SIZE_binned")
result


rule,tradesize + tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",62.344979
"(1,3]",63.012207
"(3,5]",63.612356
"(5,11]",58.701591
>11,55.441527


In [9]:
# see p. 30
np.testing.assert_allclose(
    result.iloc[:, 0].tolist(), [62.29, 62.92, 63.54, 58.64, 55.42], atol=0.5
)


In [10]:
# TODO: result is not consistent with p. 36;
# Group result on p. 35 is not consistent with p. 31.
result = check_robustness("year_binned")
result


rule,tradesize + tick (all)
year_binned,Unnamed: 1_level_1
2005-2007,64.478336
2008-2010,61.403698
2011,58.807349
2012,58.727782
2013,59.791531
2014,62.262026
2015,58.650873
2016,58.414971
2017,59.195714


In [11]:
# see p. 36
np.testing.assert_allclose(
    result.iloc[:, 0].tolist(),
    [65.44, 62.26, 59.18, 58.94, 59.74, 61.87, 58.49, 58.48, 59.36],
    atol=1.0,
)


In [12]:
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("tick", "ex"),
    ],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "tradesize + tick (all)"

result = check_robustness("TRADE_SIZE_binned")
result


rule,tradesize + tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",59.203776
"(1,3]",59.717659
"(3,5]",60.103457
"(5,11]",55.58381
>11,51.631437


### Reverse Tick Rule

In [13]:
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("rev_tick", "all"),
    ],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "tradesize + tick (all)"

result = check_robustness("TRADE_SIZE_binned")
result


rule,tradesize + tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",63.129104
"(1,3]",63.532943
"(3,5]",64.520294
"(5,11]",59.128625
>11,54.608586


In [14]:
# see p. 30
np.testing.assert_allclose(
    result.iloc[:, 0].tolist(), [63.51, 63.87, 64.94, 59.53, 55.05], atol=0.5
)


### Quote Rule

In [15]:
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("quote", "best"),
        ("quote", "ex"),
    ],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + Quote (NBBO) + Quote (ISE)"

result = check_robustness("TRADE_SIZE_binned")
result


rule,Tradesize + Quote (NBBO) + Quote (ISE)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.944263
"(1,3]",78.279335
"(3,5]",79.981987
"(5,11]",72.59275
>11,69.5869


In [16]:
# see p. 30
np.testing.assert_allclose(
    result.iloc[:, 0].tolist(), [74.94, 78.28, 79.97, 72.59, 69.58], atol=0.5
)


### LR Algorithm

In [17]:
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("lr", "best"),
    ],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + LR (NBBO)"

result = check_robustness("TRADE_SIZE_binned")
result


rule,Tradesize + LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.434941
"(1,3]",77.766009
"(3,5]",79.458024
"(5,11]",72.20869
>11,69.256442


In [18]:
# see p. 30
np.testing.assert_allclose(
    result.iloc[:, 0].tolist(), [74.09, 77.31, 78.96, 71.57, 68.40], atol=1
)


### Reverse LR Algorithm

In [19]:
clf = ClassicalClassifier(
    layers=[
        ("trade_size", "ex"),
        ("rev_lr", "best"),
    ],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + rev. LR (NBBO)"

result_ts = check_robustness("TRADE_SIZE_binned")
result_ts


rule,Tradesize + rev. LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.592932
"(1,3]",77.927223
"(3,5]",79.656313
"(5,11]",72.351442
>11,69.32703


In [20]:
# see p. 30
np.testing.assert_allclose(
    result_ts.iloc[:, 0].tolist(), [74.64, 77.95, 79.68, 72.38, 69.33], atol=0.5
)


In [21]:
clf = ClassicalClassifier(
    layers=[
        ("rev_lr", "best"),
    ],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "reverse LR (NBBO)"

result = check_robustness("TRADE_SIZE_binned")
result


rule,reverse LR (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",59.769889
"(1,3]",62.717707
"(3,5]",63.160616
"(5,11]",64.665735
>11,70.000512


In [22]:
# see p. 30
np.testing.assert_allclose(
    result.iloc[:, 0].tolist(), [59.48, 62.43, 62.83, 64.43, 69.91], atol=0.5
)


In [23]:
combine_results(result_ts, result)


TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
reverse LR (NBBO),74.59 (14.82),77.93 (15.21),79.66 (16.5),72.35 (7.69),69.33 (-0.67)


### EMO Algorithm

In [24]:
clf = ClassicalClassifier(
    layers=[("trade_size", "ex"), ("emo", "best")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + EMO (NBBO)"

check_robustness("TRADE_SIZE_binned")


rule,Tradesize + EMO (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",67.82061
"(1,3]",70.778904
"(3,5]",72.68337
"(5,11]",65.987162
>11,63.062609


## CLNV Method

In [25]:
clf = ClassicalClassifier(
    layers=[("clnv", "ex")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "CLNV (ex)"

check_robustness("year_binned")

rule,CLNV (ex)
year_binned,Unnamed: 1_level_1
2005-2007,57.188115
2008-2010,57.980047
2011,64.178287
2012,66.067017
2013,62.792854
2014,55.800243
2015,53.646344
2016,55.219678
2017,54.149427


In [26]:
clf = ClassicalClassifier(
    layers=[("trade_size", "ex"), ("clnv", "ex")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + CLNV (ex)"

check_robustness("year_binned")

rule,Tradesize + CLNV (ex)
year_binned,Unnamed: 1_level_1
2005-2007,76.818694
2008-2010,72.686674
2011,71.257758
2012,70.889375
2013,68.800751
2014,67.014615
2015,61.507266
2016,62.270877
2017,62.561512


In [27]:
clf = ClassicalClassifier(
    layers=[("trade_size", "ex"), ("depth","ex"), ("clnv", "ex")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + depth + CLNV (ex)"

check_robustness("year_binned")

rule,Tradesize + depth + CLNV (ex)
year_binned,Unnamed: 1_level_1
2005-2007,77.897405
2008-2010,73.9212
2011,72.729964
2012,71.927874
2013,69.768904
2014,68.07692
2015,62.56306
2016,62.944881
2017,63.067855


# Reverse CLNV method

In [10]:
clf = ClassicalClassifier(
    layers=[("rev_clnv", "ex")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Rev. CLNV (ex)"

check_robustness("year_binned")

rule,Rev. CLNV (ex)
year_binned,Unnamed: 1_level_1
2005-2007,57.883226
2008-2010,58.97571
2011,64.954371
2012,66.348056
2013,63.072901
2014,55.992405
2015,53.750526
2016,55.534062
2017,54.542281


In [11]:
clf = ClassicalClassifier(
    layers=[("trade_size", "ex"), ("rev_clnv", "ex")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + Rev. CLNV (ex)"

check_robustness("year_binned")

rule,Tradesize + Rev. CLNV (ex)
year_binned,Unnamed: 1_level_1
2005-2007,77.502189
2008-2010,73.666484
2011,72.015556
2012,71.022424
2013,68.690161
2014,66.726046
2015,61.269082
2016,62.200235
2017,62.650107


In [12]:
clf = ClassicalClassifier(
    layers=[("trade_size", "ex"), ("depth","ex"), ("rev_clnv", "ex")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + depth + Rev. CLNV (ex)"

check_robustness("year_binned")

rule,Tradesize + depth + Rev. CLNV (ex)
year_binned,Unnamed: 1_level_1
2005-2007,78.279793
2008-2010,74.606757
2011,73.238227
2012,72.070993
2013,69.706701
2014,67.827731
2015,62.354422
2016,62.887485
2017,63.179526


### Reverse LR Algorithm

In [13]:
clf = ClassicalClassifier(
    layers=[("trade_size", "ex"), ("depth", "ex"), ("rev_lr", "best")],
    random_state=seed,
)
clf.fit(X=X_print.loc[0:1, :], y=X_print["buy_sell"].loc[0:1])
X_print["buy_sell_predicted"] = clf.predict(X_print)
X_print["rule"] = "Tradesize + depth + rev. lr (NBBO)"

result = check_robustness("TRADE_SIZE_binned")
result

rule,Tradesize + depth + rev. lr (NBBO)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",75.165731
"(1,3]",78.555216
"(3,5]",80.229544
"(5,11]",72.853984
>11,69.68307


In [14]:
# see p. 30
np.testing.assert_allclose(
    result.iloc[:, 0].tolist(), [75.43, 78.83, 80.52, 73.23, 70.22], atol=0.6
)


In [15]:
check_robustness("year_binned")


rule,Tradesize + depth + rev. lr (NBBO)
year_binned,Unnamed: 1_level_1
2005-2007,81.285217
2008-2010,80.457831
2011,79.535931
2012,77.863734
2013,76.12962
2014,71.616233
2015,64.875378
2016,66.76859
2017,68.192941


In [16]:
# accuracy on test set only
X_print = X_print.iloc[test_index]
check_robustness("year_binned")

rule,Tradesize + depth + rev. lr (NBBO)
year_binned,Unnamed: 1_level_1
2015,63.369714
2016,66.76859
2017,68.192941


In [18]:
# overall accuracy score on test set
print(accuracy_score(X_print["buy_sell"], X_print["buy_sell_predicted"]))

0.6684498502064984
