<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/3.0-mb-classical_rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Dependency setup for the Colab runtime.
# NOTE(review): the recorded output of this cell reports that no distribution
# matching numpy==1.23.4 could be found, i.e. that install failed in the
# environment this notebook was last run in -- confirm the pins still resolve.
# NOTE(review): fastparquet is the only package installed here without a version pin.
!pip install gcsfs==2022.10.0
!pip install numpy==1.23.4
!pip install pandas==1.5.1
!pip install fastparquet
!pip install scikit-learn==1.1.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement numpy==1.23.4 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0rc1, 1.13.0rc2, 1.13.0, 1.13.1, 1.13.3, 1.14.0rc1, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0rc1, 1.15.0rc2, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0rc1, 1.16.0rc2, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0rc1, 1.17.0rc2, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0rc1, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0rc1, 1.19.0rc2, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 1.19.5, 1.20.0rc1, 1.20.0rc2, 1.20.0, 1.20.1, 1.20.2, 1

In [2]:
import os
import random

import gcsfs
import google.auth
from google.colab import auth


import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
# Authenticate the Colab user, fetch the default Google credentials and build
# a GCS file system handle for project "thesis" from them.
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)
# NOTE(review): neither `fs` nor `fs_prefix` is referenced by the cells visible
# in this part of the notebook (the parquet reads pass full "gs://" URLs
# directly) -- confirm whether they are still needed.
fs_prefix = "gs://"

In [4]:
# set fixed seed
def seed_everything(seed):
    """
    Seed the random number generators used in this notebook.

    Seeds Python's `random` module and NumPy's legacy global generator, and
    exports the seed as `PYTHONHASHSEED`.

    Args:
        seed (int): seed value. Must be accepted by `np.random.seed`
        (an integer in [0, 2**32 - 1]).
    """
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running kernel is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    # `check_robustness` fills unclassified trades via `np.random.choice`,
    # which draws from NumPy's global generator -- without this line the
    # reported accuracies are not reproducible.
    np.random.seed(seed)


# Fixed seed for this notebook run; applied immediately so that later cells
# start from a seeded state.
seed = 42
seed_everything(seed)


In [71]:
# Columns to read from each split.
columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_PRICE",
    "price_ex_lag",
    "price_all_lag",
    "price_ex_lead",
    "price_all_lead",
    "TRADE_SIZE",
    "ask_size_ex",
    "bid_size_ex",
    "BEST_BID",
    "BEST_ASK",
    "bid_ex",
    "ask_ex",
]

# Read the train / validation / test splits (in that order) from the bucket.
train, val, test = (
    pd.read_parquet(parquet_path, engine="fastparquet", columns=columns)
    for parquet_path in (
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet",
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet",
    )
)

# The rules below are evaluated on all three splits stacked together.
X_print = pd.concat([train, val, test])

### Robustness

In [83]:
# Baseline "rule": every trade gets the constant prediction 0.
X_print["rule"] = "Baseline"
X_print["buy_sell_predicted"] = 0

# Time to maturity in months is currently disabled:
# X_print["ttm"] = (
#     X_print["EXPIRATION"].dt.to_period("M")
#     - X_print["QUOTE_DATETIME"].dt.to_period("M")
# ).apply(lambda x: x.n)

# Calendar year of the quote, bucketed further below.
X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

# Trade size buckets; pd.cut intervals are right-inclusive.
bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins=bins_tradesize, labels=trade_size_labels
)

# Year buckets: two multi-year ranges, then one bucket per year 2011..2017.
bins_years = [2004, 2007] + list(range(2010, 2018))
year_labels = ["2005-2007", "2008-2010"] + [
    str(single_year) for single_year in range(2011, 2018)
]
X_print["year_binned"] = pd.cut(
    X_print["year"], bins=bins_years, labels=year_labels
)

# Time-to-maturity buckets; defined already, applied once "ttm" is available.
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
# X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?


In [7]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Accuracy (in percent) of the current predictions, split by a criterion.

    Reads the module-level frame `X_print`. Trades without a prediction
    (NaN in `buy_sell_predicted`) are assigned -1 or 1 at random first, and
    the filled column is written back to `X_print`.

    Example:
        rule               Baseline
        TRADE_SIZE_binned
        (0,1]              71.0966
        (1,3]              71.7664
        (3,5]              71.5195
        (5,11]             69.9428
        >11                68.8348

    Args:
        criterion (str, optional): column of `X_print` to group by.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: accuracy per group; rules in columns and criterion
        values in rows.
    """

    def _fill_missing(label):
        # Unclassified trades get a random side.
        return np.random.choice([-1, 1]) if np.isnan(label) else label

    def _accuracy(group: pd.DataFrame) -> float:
        return accuracy_score(group["buy_sell"], group["buy_sell_predicted"])

    X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(_fill_missing)

    # Calculate the accuracy per (rule, criterion value), then move the rule
    # level into the columns and scale to percent.
    per_group = X_print.groupby(["rule", criterion])[["buy_sell", "buy_sell_predicted"]]
    accuracy = per_group.apply(_accuracy)
    return accuracy.unstack(level=0).mul(100)


In [8]:
def combine_results(revised: pd.DataFrame, base: pd.DataFrame) -> pd.DataFrame:
    """
    Build a print layout like in Grauer et al.: "accuracy (change vs. base)".

    Columns of `revised` are paired with the columns of `base` by position.
    Each cell holds the revised value rounded to two decimals, followed in
    parentheses by the rounded difference `revised - base`.

    Example:
        TRADE_SIZE_binned  (0,1]          (1,3]          >11
        rule
        tick rule (all)    62.29 (10.38)  62.91 (10.27)  55.41 (-0.94)

    Args:
        revised (pd.DataFrame): results of the revised rule(s).
        base (pd.DataFrame): results of the base rule(s); provides the labels.

    Returns:
        pd.DataFrame: transposed frame of strings, one row per column of `base`.
    """
    combined = base.copy()
    for position, base_col in enumerate(combined.columns):
        revised_values = revised[revised.columns[position]]
        accuracy = revised_values.round(2).astype(str)
        difference = (revised_values - base[base_col]).round(2).astype(str)
        combined[base_col] = accuracy + " (" + difference + ")"
    return combined.T

## Classical rules

In [51]:
# NOTE(review): the recorded output lacks BEST_BID / BEST_ASK / bid_ex / ask_ex
# although the load cell requests them -- it appears to stem from an earlier run.
list(X_print.columns)

['buy_sell',
 'EXPIRATION',
 'QUOTE_DATETIME',
 'TRADE_PRICE',
 'price_ex_lag',
 'price_all_lag',
 'price_ex_lead',
 'price_all_lead',
 'TRADE_SIZE',
 'ask_size_ex',
 'bid_size_ex',
 'rule',
 'buy_sell_predicted',
 'year',
 'TRADE_SIZE_binned',
 'year_binned']

In [76]:
# quote() looks up the columns f"ask_{subset}" and f"bid_{subset}"; renaming
# the BEST_* columns makes them addressable as subset "best".
mapping = {'BEST_ASK':'ask_best','BEST_BID':'bid_best'}
X_print.rename(columns=mapping, inplace=True)

In [52]:
def tick(subset: str):
    """
    Tick rule on the module-level frame `X_print`.

    Args:
        subset (str): selects the lagged price column f"price_{subset}_lag".

    Returns:
        np.ndarray: 1 where the trade price is above the lagged price, -1
        where it is below, NaN otherwise.
    """
    trade_price = X_print["TRADE_PRICE"]
    lagged_price = X_print[f"price_{subset}_lag"]
    return np.select(
        [trade_price > lagged_price, trade_price < lagged_price],
        [1, -1],
        default=np.nan,
    )

In [81]:
def rev_tick(subset: str):
    """
    Reverse tick rule on the module-level frame `X_print`.

    Args:
        subset (str): selects the lead price column f"price_{subset}_lead".

    Returns:
        np.ndarray: -1 where the lead price is above the trade price, 1
        where it is below, NaN otherwise.
    """
    lead_price = X_print[f"price_{subset}_lead"]
    trade_price = X_print["TRADE_PRICE"]
    return np.select(
        [lead_price > trade_price, lead_price < trade_price],
        [-1, 1],
        default=np.nan,
    )


In [74]:
def quote(subset: str):
    """
    Quote rule on the module-level frame `X_print`.

    Compares the trade price with the midpoint of f"ask_{subset}" and
    f"bid_{subset}".

    Args:
        subset (str): selects the quote columns to use.

    Returns:
        np.ndarray: 1 where the trade price is above the midpoint, -1 where
        it is below, NaN otherwise.
    """
    trade_price = X_print["TRADE_PRICE"]
    midpoint = 0.5 * (X_print[f"ask_{subset}"] + X_print[f"bid_{subset}"])
    return np.select(
        [trade_price > midpoint, trade_price < midpoint],
        [1, -1],
        default=np.nan,
    )

In [None]:
def lr(subset: str):
    """
    Placeholder, not implemented yet (presumably the Lee-Ready rule -- TODO confirm).

    Args:
        subset (str): currently unused.

    Returns:
        None
    """
    return None

In [None]:
def rev_lr(subset: str):
    """
    Placeholder, not implemented yet (presumably the reverse Lee-Ready rule
    -- TODO confirm).

    Args:
        subset (str): currently unused.

    Returns:
        None
    """
    return None

In [None]:
def emo(subset: str):
    """
    Placeholder, not implemented yet (presumably the EMO rule -- TODO confirm).

    Args:
        subset (str): currently unused.

    Returns:
        None
    """
    return None

In [None]:
def rev_emo(subset: str):
    """
    Placeholder, not implemented yet (presumably the reverse EMO rule
    -- TODO confirm).

    Args:
        subset (str): currently unused.

    Returns:
        None
    """
    return None

In [65]:
def trade_size(subset: str):
    """
    Trade size rule on the module-level frame `X_print`.

    A trade is classified only when its size matches exactly one of the two
    quoted sizes f"bid_size_{subset}" / f"ask_size_{subset}".

    Args:
        subset (str): selects the quoted size columns to use.

    Returns:
        np.ndarray: 1 where the trade size equals the bid size, -1 where it
        equals the ask size, NaN otherwise (including rows where bid size and
        ask size are equal).
    """
    bid_size = X_print[f"bid_size_{subset}"]
    ask_size = X_print[f"ask_size_{subset}"]
    trade_sizes = X_print["TRADE_SIZE"]

    # Use `~` for the boolean NOT: unary `-` is arithmetic negation and is
    # rejected by NumPy for boolean arrays.
    sizes_differ = ~(ask_size == bid_size)

    ts_eq_bid = (trade_sizes == bid_size) & sizes_differ
    ts_eq_ask = (trade_sizes == ask_size) & sizes_differ

    return np.where(ts_eq_bid, 1, np.where(ts_eq_ask, -1, np.nan))

In [None]:
def depth(subset: str):
    """
    Depth rule on the module-level frame `X_print`.

    Args:
        subset (str): selects f"ask_size_{subset}" and f"bid_size_{subset}".

    Returns:
        np.ndarray: 1 where the ask size exceeds the bid size, -1 where it is
        smaller, NaN otherwise.
    """
    ask_size = X_print[f"ask_size_{subset}"]
    bid_size = X_print[f"bid_size_{subset}"]
    return np.select(
        [ask_size > bid_size, ask_size < bid_size],
        [1, -1],
        default=np.nan,
    )

In [63]:
def predict_rules(layers: list, name: str = "default") -> None:
    """
    Stack several rules and store the result in `X_print`.

    Resets `X_print["buy_sell_predicted"]` to NaN and labels all rows with
    `name` in `X_print["rule"]`. The layers are then applied in order; each
    layer only fills rows that are still NaN, so earlier layers take
    precedence.

    Args:
        layers (list): (function, subset) pairs; each function is called as
        `function(subset)` and must return one value per row of `X_print`.
        name (str, optional): label written to the "rule" column.
        Defaults to "default".
    """
    X_print["rule"] = name
    X_print["buy_sell_predicted"] = np.nan
    for rule_func, rule_subset in layers:
        current = X_print["buy_sell_predicted"]
        unclassified = current.isna()
        X_print["buy_sell_predicted"] = np.where(
            unclassified, rule_func(rule_subset), current
        )

In [67]:
# Trade size rule first; trades it leaves open fall back to the tick rule ("all").
predict_rules(
    layers=[(trade_size, "ex"), (tick, "all")],
    name="tradesize + tick (all)",
)
ts_tick = check_robustness(criterion="TRADE_SIZE_binned")
ts_tick

rule,tradesize + tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",62.344115
"(1,3]",63.014328
"(3,5]",63.609735
"(5,11]",58.695144
>11,55.441475


In [68]:
# Trade size rule first; trades it leaves open fall back to the tick rule ("ex").
# NOTE(review): this reuses the name `ts_tick`, replacing the result of the
# "tradesize + tick (all)" cell.
predict_rules(
    layers=[(trade_size, "ex"), (tick, "ex")],
    name="tradesize + tick (ISE)",
)
ts_tick = check_robustness(criterion="TRADE_SIZE_binned")
ts_tick

rule,tradesize + tick (ISE)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",59.208434
"(1,3]",59.7124
"(3,5]",60.101709
"(5,11]",55.577579
>11,51.635248


In [84]:
# Trade size rule first; trades it leaves open fall back to the reverse tick rule ("all").
predict_rules(
    layers=[(trade_size, "ex"), (rev_tick, "all")],
    name="tradesize + rev tick (all)",
)
ts_rev_tick = check_robustness(criterion="TRADE_SIZE_binned")
ts_rev_tick

rule,tradesize + rev tick (all)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",63.128594
"(1,3]",63.52601
"(3,5]",64.523197
"(5,11]",59.130022
>11,54.614042


In [79]:
# Trade size rule, then the quote rule on the "best" quotes, then on the "ex" quotes.
predict_rules(
    layers=[(trade_size, "ex"), (quote, "best"), (quote, "ex")],
    name="Tradesize + Quote (NBBO) + Quote (ISE)",
)
quote_quote = check_robustness(criterion="TRADE_SIZE_binned")
quote_quote

rule,Tradesize + Quote (NBBO) + Quote (ISE)
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",74.948808
"(1,3]",78.273528
"(3,5]",79.979232
"(5,11]",72.595542
>11,69.592555


In [169]:
# NOTE(review): `ts_ts_rev_tick` is not assigned in any cell visible here, and
# the recorded row label ("reverse tick rule") is not a rule name set by the
# cells above -- this output appears to come from an earlier kernel state;
# confirm both inputs before relying on it.
combine_results(ts_ts_rev_tick, ts_rev_tick)

TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
reverse tick rule,63.13 (7.85),63.53 (7.87),64.52 (8.42),59.13 (4.25),54.61 (0.05)


In [170]:
# Number of missing `price_all_lead` values per trade size bucket.
isna_sum_ts_binned = (
    X_print
    .groupby(["TRADE_SIZE_binned"])
    .agg({"price_all_lead": lambda prices: prices.isnull().sum()})
)
isna_sum_ts_binned

Unnamed: 0_level_0,price_all_lead
TRADE_SIZE_binned,Unnamed: 1_level_1
"(0,1]",431123
"(1,3]",307379
"(3,5]",221001
"(5,11]",352634
>11,310578
