<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/3.0-mb-classical_rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gcsfs==2022.10.0
!pip install numpy==1.23.4
!pip install pandas==1.5.1
!pip install fastparquet
!pip install scikit-learn==1.1.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement numpy==1.23.4 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0rc1, 1.13.0rc2, 1.13.0, 1.13.1, 1.13.3, 1.14.0rc1, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0rc1, 1.15.0rc2, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0rc1, 1.16.0rc2, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0rc1, 1.17.0rc2, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0rc1, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0rc1, 1.19.0rc2, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 1.19.5, 1.20.0rc1, 1.20.0rc2, 1.20.0, 1.20.1, 1.20.2, 1

In [2]:
import os
import random

import gcsfs
import google.auth
from google.colab import auth


import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)
fs_prefix = "gs://"

In [4]:
# set fixed seed
def seed_everything(seed):
    """
    Seeds basic parameters for reproducibility of results
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)


seed = 42
seed_everything(seed)


In [91]:
# test = pd.read_parquet(
#     f"gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_2017.parquet",
#     engine="fastparquet",
# )

columns = ['buy_sell',"EXPIRATION", "QUOTE_DATETIME", "TRADE_PRICE", 
           "price_ex_lag", "price_all_lag","price_ex_lead", "price_all_lead", 
           "TRADE_SIZE","ask_size_ex","bid_size_ex"]

train = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet",
    engine="fastparquet",columns=columns
)
val = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
    engine="fastparquet",columns=columns
)
test = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet",
    engine="fastparquet",columns=columns
)

X_print = pd.concat([train, val, test])

### Robustness

In [92]:
# Copy unscaled columns
#X_print = test.copy()
# X_print = pd.concat([train, val, test])

# add baseline results
X_print["rule"] = "Baseline"
X_print["buy_sell_predicted"] = 0

# # prepare columns for printing
# X_print["ttm"] = (
#     X_print["EXPIRATION"].dt.to_period("M")
#     - X_print["QUOTE_DATETIME"].dt.to_period("M")
# ).apply(lambda x: x.n)
X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [0, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

bins_years = [2005, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

bins_ttm = [0, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
#X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?


In [149]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Check robustness of rules by calculating the accuracy for a given
    criterion and rules.

    Example:
    rule		Baseline
    TRADE_SIZE_binned
    (0,1]	  0.710966
    (1,3]	  0.717664
    (3,5]	  0.715195
    (5,11]	0.699428
    >11	  	0.688348

    Args:
        criterion (str, optional): criterion to check robustness for.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: DataFrame with accuracy of rules. Rule in columns and
        criterion values in rows.
    """

    # fill others with
    # X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(
    #     lambda l: l if not np.isnan(l) else 0
    # )

    # # fill others randomly
    X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(
        lambda l: l if not np.isnan(l) else np.random.choice([-1, 1])
    )
    # filter = X_print["buy_sell_predicted"].notna()

    # cuculate average over columns if multiple subsets are combined
    results = (
        X_print.groupby(["rule", criterion])[["buy_sell", "buy_sell_predicted"]]
        .apply(lambda x: accuracy_score(x["buy_sell"], x["buy_sell_predicted"]))
        .unstack(level=0)
        #.assign(avg=lambda x: x.mean(axis=1))
        .mul(100)
    )
    return results


In [None]:
def combine_results(revised:pd.DataFrame, base:pd.DataFrame)->pd.DataFrame:
  """
  Generate print layout like in Grauer et al.

  Example:
  TRADE_SIZE_binned	(0,1]	(1,3]	(3,5]	(5,11]	>11
  rule					
  tick rule (all)	62.29 (10.38)	62.91 (10.27)	63.54 (11.08)	58.64 (5.62)	55.41 (-0.94)
  """
  combo = base.copy()
  for i, col in enumerate(combo.columns):
    col_other = revised.columns[i]
    combo[col] = revised[col_other].round(2).astype(str) + ' (' + (revised[col_other] - base[col]).round(2).astype(str) + ')'
  return combo.T

## Classical rules

In [150]:
# tick rule
# FIXME: Discuss with Grauer et al what is used in table 9 ISE at '=='? How is their accuracy defined?

subset = "all"

# print(X_print[f"price_{subset}_lag"].isna().sum())
# X_print[f"price_{subset}_lag"].fillna(0, inplace=True)
filter = X_print[f"price_{subset}_lag"].isna()
print(filter.sum())

# FIXME: adjustment to classical algorithm if no price_all_lag is found.
# np.where(X_print[f"price_{subset}_lag"].notna(), 1, np.nan)
tt = np.where(X_print["TRADE_PRICE"] < X_print[f"price_{subset}_lag"], -1, 1)

X_print["buy_sell_predicted"] = tt
# X_print["buy_sell_predicted"][filter] = np.nan

X_print["rule"] = f"tick test ({subset})"

# 2017	0.535319	0.535319
# soll 53,22 (table 3) 54,46 (table 9)

# 2005-2007	0.508497	0.508497
# 2008-2010	0.520588	0.520588
# 2011	0.547742	0.547742
# 2012	0.558078	0.558078
# 2013	0.552743	0.552743
# 2014	0.552471	0.552471
# 2015	0.538636	0.538636
# 2016	0.535457	0.535457
# 2017	0.538028	0.538028

1055754


In [154]:
accuracy_score(X_print["buy_sell"], X_print["buy_sell_predicted"])

0.5321308761302265

In [155]:
check_robustness("year_binned")


rule,tick test (all)
year_binned,Unnamed: 1_level_1
2005-2007,50.906782
2008-2010,51.977173
2011,54.622561
2012,55.54079
2013,55.073888
2014,54.991957
2015,53.515099
2016,53.020268
2017,53.531859


In [156]:
ts_tick = check_robustness("TRADE_SIZE_binned")
ts_tick

rule,tick test (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",51.914686
"(1,3]",52.638455
"(3,5]",52.458272
"(5,11]",53.021104
>11,56.345046


In [157]:
# trade size rule

subset = "ex"

bid_eq_ask = X_print[f"ask_size_{subset}"] == X_print[f"bid_size_{subset}"]

# "matches either"
ts_eq_bid = (X_print["TRADE_SIZE"] == X_print[f"bid_size_{subset}"]) & -bid_eq_ask
ts_eq_ask = (X_print["TRADE_SIZE"] == X_print[f"ask_size_{subset}"]) & -bid_eq_ask

# trade size + tick rule
X_print["buy_sell_predicted"] = np.where(ts_eq_bid, 1.0, np.where(ts_eq_ask, -1.0, tt))


X_print["rule"] = "trade size + tick test"


In [158]:
accuracy_score(X_print["buy_sell"], X_print["buy_sell_predicted"])

0.6052803661477245

In [159]:
check_robustness("year_binned")

rule,trade size + tick test
year_binned,Unnamed: 1_level_1
2005-2007,65.047677
2008-2010,61.389279
2011,58.762266
2012,58.645439
2013,59.729238
2014,62.203745
2015,58.5238
2016,58.2238
2017,59.0938


In [160]:
ts_ts_tick = check_robustness("TRADE_SIZE_binned")

In [161]:
combine_results(ts_ts_tick, ts_tick)

TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tick test (all),62.29 (10.38),62.91 (10.28),63.54 (11.08),58.64 (5.61),55.41 (-0.93)


In [162]:
# FIXME: look at edge case where price_all_lag is null / not there

isna_sum_ts_binned = X_print.groupby(["TRADE_SIZE_binned"]).agg({'price_all_lag': lambda x: x.isnull().sum()})
isna_sum_ts_binned

Unnamed: 0_level_0,price_all_lag
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",300619
"(1,3]",206402
"(3,5]",141241
"(5,11]",220487
>11,187005


## Reverse tick test

In [163]:
# reverse tick rule
subset = "all"

rev_tt = np.where(
     X_print[f"price_{subset}_lead"] < X_print["TRADE_PRICE"],
    1,
    np.where(X_print[f"price_{subset}_lead"] > X_print["TRADE_PRICE"], -1, np.NaN),
)

X_print["buy_sell_predicted"] = rev_tt 
filter = X_print[f"price_{subset}_lead"].isna()
print(filter.sum())

X_print["rule"] = "reverse tick rule"


1622715


In [164]:
check_robustness("year_binned")

rule,reverse tick rule
year_binned,Unnamed: 1_level_1
2005-2007,56.548399
2008-2010,55.839899
2011,55.81106
2012,55.886015
2013,55.118399
2014,55.053
2015,53.533763
2016,53.862434
2017,54.649611


In [165]:
ts_rev_tick = check_robustness("TRADE_SIZE_binned")
ts_rev_tick

rule,reverse tick rule
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",55.275953
"(1,3]",55.659477
"(3,5]",56.106639
"(5,11]",54.882484
>11,54.55759


In [167]:
# trade size + rev tick rule

X_print["buy_sell_predicted"] = np.where(ts_eq_bid, 1.0, np.where(ts_eq_ask, -1.0, rev_tt))
X_print["rule"] = "trade size + tick test"


In [168]:
ts_ts_rev_tick = check_robustness("TRADE_SIZE_binned")
ts_ts_rev_tick

rule,trade size + tick test
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",63.126941
"(1,3]",63.52869
"(3,5]",64.523715
"(5,11]",59.128081
>11,54.606471


In [169]:
combine_results(ts_ts_rev_tick, ts_rev_tick)

TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
reverse tick rule,63.13 (7.85),63.53 (7.87),64.52 (8.42),59.13 (4.25),54.61 (0.05)


In [170]:
isna_sum_ts_binned = X_print.groupby(["TRADE_SIZE_binned"]).agg({'price_all_lead': lambda x: x.isnull().sum()})
isna_sum_ts_binned

Unnamed: 0_level_0,price_all_lead
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",431123
"(1,3]",307379
"(3,5]",221001
"(5,11]",352634
>11,310578


## Quote rule

In [None]:
# quote rule
# TODO: Variants QR (NBBO), QUOTE_RULE (ISE) etc.
subset = 'ex'

mid = 0.5 * (X_print[f"ask_{subset}"] + X_print[f"bid_{subset}"])
qr = np.where(
    X_print["TRADE_PRICE"] > mid, 1, np.where(X_print["TRADE_PRICE"] < mid, -1, np.nan)
)
X_print["buy_sell_predicted"] = qr

X_print["rule"] = "quote rule"


In [None]:
# check_robustness("year_binned")

In [None]:
# X_print["buy_sell_predicted"] = np.where(ts_eq_bid, 1, np.where(ts_eq_ask, -1, qr))

# X_print["rule"] = "trade size + quote rule"


In [None]:
# check_robustness("year_binned")


In [None]:
# # depth rule p. 14
# dr = np.where(
#     X_print["ask_size_ex"] > X_print["bid_size_ex"],
#     1,
#     np.where(X_print["ask_size_ex"] < X_print["bid_size_ex"], -1, np.nan),
# )

# X_print["buy_sell_predicted"] = dr
# X_print["rule"] = "depth rule"


In [None]:
# check_robustness("year_binned")


In [None]:
# TODO: Depth rule + reverse LR (NBBO), Depth rule + reverse LR (NBBO, ISE), ...

# X_print["buy_sell_predicted"] = np.where(ts_eq_bid, 1.0, np.where(ts_eq_ask, -1.0, dr))

# X_print["rule"] = "trade size + depth rule"


In [None]:
# check_robustness("year_binned")
