In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

sys.path.append("..")


import wandb

from tqdm.notebook import tqdm


In [2]:
# set here globally
exchange = "ise"
models = "classical"
subset = "all" # "test"
strategy = "supervised"

key = f"{exchange}_{models}_{strategy}_{subset}"

In [3]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

run = wandb.init(project="thesis", entity="fbv")

dataset = f"fbv/thesis/{exchange}_{strategy}_unscaled:latest"
results = f"fbv/thesis/{key}:latest"

# load unscaled data
artifact = run.use_artifact(dataset) # type: ignore
data_dir = artifact.download()

# load results
artifact = run.use_artifact(results) # type: ignore
results_dir = artifact.download()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact ise_supervised_unscaled:latest, 3391.53MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact ise_classical_supervised_all:latest, 316.18MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0


In [4]:
# p. 35-38
columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "ask_ex",
    "bid_ex",
    "myn",
    "OPTION_TYPE",
    "issue_type"
]


if subset == "all":
    train = pd.read_parquet(
        Path(data_dir, "train_set_extended_60"), engine="fastparquet", columns=columns
    )
    val = pd.read_parquet(
        Path(data_dir, "val_set_extended_20"), engine="fastparquet", columns=columns
    )
    test = pd.read_parquet(
        Path(data_dir, "test_set_extended_20"), engine="fastparquet", columns=columns
    )
    eval_data = pd.concat([train,val,test])
    del train, val, test
    
elif subset == "test":
    eval_data = pd.read_parquet(
        Path(data_dir, "test_set_extended_20"), engine="fastparquet", columns=columns
    )

results_data = pd.read_parquet(
    Path(results_dir, "results"), engine="fastparquet"
)


assert len(eval_data) == len(results_data)

X_print = eval_data


### Robustness Checks

In [5]:
X_print.head()

Unnamed: 0,buy_sell,EXPIRATION,QUOTE_DATETIME,TRADE_SIZE,TRADE_PRICE,ask_ex,bid_ex,myn,OPTION_TYPE,issue_type
0,1,2006-01-21,2005-05-02 09:30:02,10,2.05,2.1,1.9,1.742,C,0
1,1,2005-06-18,2005-05-02 09:30:03,10,3.9,,,1.235,C,0
2,-1,2005-05-21,2005-05-02 09:30:03,50,11.2,11.4,11.1,1.105381,C,%
3,1,2005-06-18,2005-05-02 09:30:03,10,0.2,0.25,0.0,0.799,C,0
4,-1,2005-12-17,2005-05-02 09:30:03,15,0.25,0.45,0.25,0.826429,C,0


In [6]:

# prepare columns for printing
X_print["ttm"] = (
    X_print["EXPIRATION"].dt.to_period("M")
    - X_print["QUOTE_DATETIME"].dt.to_period("M")
).apply(lambda x: x.n)

X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

# p. 38
bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

# p. 37
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# Security type
# see 3.0a-mb-explanatory-data-analysis.ipynb
X_print["issue_type"] = X_print["issue_type"].map({'0': 'Stock options', 'A': 'Index option', '7': 'Others', 'F':'Others', '%': 'Others', ' ': 'Others'})

# Moneyness p. 38
bins_myn = [-1, 0.7, 0.9, 1.1, 1.3, np.inf]
myn_labels = [
    "myn <= 0.7",
    "myn (0.7-0.9]",
    "myn (0.9-1.1]",
    "myn (1.1-1.3]",
    "myn > 1.3",
]
X_print["myn_binned"] = pd.cut(X_print["myn"], bins_myn, labels=myn_labels)

# proximity to quotes
mid_ex = 0.5 * (X_print["ask_ex"] + X_print["bid_ex"])
spread_ex = X_print["ask_ex"] - X_print["bid_ex"]
X_print["prox_q"] = (X_print["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)
X_print["mid"] = mid_ex

# p. 31
def map_quotes(x):
    abs_x = np.abs(x)
    if 1 < x < np.inf:
        return "outside"
    elif x == 0:
        return "at mid"
    elif x == 1:
        return "at quote"
    return "inside"
    
X_print["prox_q_binned"] = X_print["prox_q"].apply(map_quotes)

# clean up empty buckets, as it causes empty grouping in result set generatio
X_print["year_binned"] = X_print["year_binned"].cat.remove_unused_categories()
X_print["myn_binned"] = X_print["myn_binned"].cat.remove_unused_categories()
X_print["ttm_binned"] = X_print["ttm_binned"].cat.remove_unused_categories()


X_print.drop(columns=["EXPIRATION","QUOTE_DATETIME", "TRADE_SIZE", "ttm", "myn", "prox_q", "ask_ex", "bid_ex", "year"], inplace=True)


In [7]:
X_print.head(20)

Unnamed: 0,buy_sell,TRADE_PRICE,OPTION_TYPE,issue_type,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,mid,prox_q_binned
0,1,2.05,C,Stock options,"(5,11]",2005-2007,ttm (6-12] month,myn > 1.3,2.0,inside
1,1,3.9,C,Stock options,"(5,11]",2005-2007,ttm <= 1 month,myn (1.1-1.3],,inside
2,-1,11.2,C,Others,>11,2005-2007,ttm <= 1 month,myn (1.1-1.3],11.25,inside
3,1,0.2,C,Stock options,"(5,11]",2005-2007,ttm <= 1 month,myn (0.7-0.9],0.125,inside
4,-1,0.25,C,Stock options,>11,2005-2007,ttm (6-12] month,myn (0.7-0.9],0.35,inside
5,-1,3.0,P,Stock options,>11,2005-2007,ttm (3-6] month,myn (0.9-1.1],3.05,inside
6,1,0.55,C,Stock options,"(1,3]",2005-2007,ttm (3-6] month,myn <= 0.7,0.45,inside
7,-1,2.75,P,Stock options,"(5,11]",2005-2007,ttm (3-6] month,myn > 1.3,2.9,inside
8,1,0.6,C,Stock options,"(0,1]",2005-2007,ttm (3-6] month,myn (0.7-0.9],0.525,outside
9,1,11.0,C,Stock options,"(5,11]",2005-2007,ttm (2-3] month,,,inside


In [8]:
X_print = pd.concat([X_print, results_data], axis=1)

## Results Set Generation

In [9]:
LUT = {"(ex)": " (ex)", 
       "(best)": " (best)",
       "rev_": "Rev. ",
       "tick": "Tick",
       "quote": "Quote",
       "trade_size": "Trade Size",
       "depth": "Depth",
       "->": " $\\to$ ",
       "lr": "\gls{LR}",
       "emo": "\gls{EMO}", 
       "clnv": "\gls{CLNV}",
      "OPTION_TYPE": "Option Type",
      "(":"$(", # put interval start in math env
      "]":"]$", # put interval end in math env
      "_":"$\_"
      }
       
def cell_str(x):
    for orig, sub in LUT.items():
        x = x.replace(orig,sub)
    return x

In [10]:
def set_tex_style(styler, caption, label):
    res = (
    styler.set_caption(caption)
    # .hide(axis="index")
    .format_index(cell_str, axis = 0)
    .format_index(cell_str, axis = 1)
    .format(precision=4, decimal='.', thousands=",", escape=False, hyperlinks=None)
    .to_latex(f"{label}.tex", siunitx=True, position_float="centering", hrules=True, clines="skip-last;data",
                      label="tab:"+label, caption=caption)
             )
    return res

In [11]:
classifiers = results_data.columns.tolist()
criterions = ["OPTION_TYPE", "issue_type", "TRADE_SIZE_binned", "year_binned","ttm_binned","myn_binned","prox_q_binned"]

In [12]:
result_dfs = []

for criterion in tqdm(criterions):
    results = []
    for classifier in tqdm(classifiers):
        res = (X_print.groupby([criterion])[["buy_sell", classifier]]
            .apply(lambda x: accuracy_score(x["buy_sell"], x[classifier]))
            .mul(100)
            .rename(classifier))
        results.append(res)

    # save aggregated results
    result_df = pd.concat(results, axis=1).T
    result_df.style.pipe(set_tex_style, caption=(f"long-tbd","short-tbd"), label=f"{key.lower()}-{criterion.lower()}")

    # store all result sets for later use
    result_dfs.append(result_df)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [13]:
result_dfs[0]

OPTION_TYPE,C,P
tick(all),53.32064,53.305616
tick(ex),49.406551,50.014733
quote(best),63.642245,63.813381
quote(ex),62.68174,62.634438
lr(ex),62.613399,62.585816
lr(best),63.551623,63.729395
rev_lr(ex),62.809223,62.756181
rev_lr(best),63.6818,63.852476
emo(ex),55.653509,55.126481
emo(best),57.300032,56.884877


In [14]:
result_dfs[1]

issue_type,Index option,Others,Stock options
tick(all),51.096852,52.710833,53.555241
tick(ex),49.946253,49.032852,49.873137
quote(best),61.392291,65.353648,63.235512
quote(ex),60.56298,64.425438,62.135025
lr(ex),60.57225,64.319831,62.088789
lr(best),61.358054,65.247848,63.152578
rev_lr(ex),60.57509,64.555439,62.260435
rev_lr(best),61.425182,65.394896,63.274319
emo(ex),49.472916,56.375943,55.233955
emo(best),49.643503,58.145891,56.933079


In [15]:
result_dfs[2]

TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
tick(all),52.009861,52.780268,52.576993,53.132934,56.388859
tick(ex),48.92774,49.606935,49.202498,49.559466,51.207259
quote(best),59.648388,62.631963,63.057502,64.617288,70.050168
quote(ex),58.613592,61.605109,62.041579,63.588797,68.884684
lr(ex),58.588543,61.561729,61.954874,63.496833,68.811164
lr(best),59.566481,62.529499,62.941639,64.523219,69.995727
rev_lr(ex),58.773731,61.758749,62.207363,63.701634,68.914769
rev_lr(best),59.724322,62.683891,63.12389,64.642502,70.02089
emo(ex),51.933581,54.122951,54.514419,56.190442,61.50342
emo(best),53.14727,55.661223,56.240599,58.143622,63.744405


In [16]:
result_dfs[3]

year_binned,2005-2007,2008-2010,2011,2012,2013,2014,2015,2016,2017
tick(all),51.026264,52.01541,54.698608,55.67673,55.18076,55.118223,53.690436,53.281379,53.672802
tick(ex),48.375169,48.691797,49.767564,50.405821,50.769938,50.625365,50.411381,50.187025,50.325442
quote(best),62.001982,65.179725,71.655296,72.864671,70.490409,59.696598,55.766747,59.737221,60.454299
quote(ex),60.240114,65.087735,72.667807,72.698523,68.546419,58.201278,55.357144,57.587809,56.36783
lr(ex),60.139402,64.920529,72.443605,72.626347,68.529786,58.224009,55.404737,57.639501,56.461656
lr(best),61.770175,65.02595,71.437559,72.738184,70.414053,59.724949,55.818385,59.76715,60.508873
rev_lr(ex),60.434483,65.243915,72.641338,72.753905,68.635869,58.340338,55.472789,57.717813,56.518642
rev_lr(best),62.105975,65.208141,71.598873,72.747441,70.471861,59.781302,55.868354,59.810037,60.508836
emo(ex),55.451914,53.662965,59.537719,62.244082,59.519146,53.849221,52.863794,54.267434,52.998396
emo(best),57.888492,55.119742,60.784914,63.976246,61.428633,55.216209,53.559065,56.083064,56.334069


In [17]:
result_dfs[4]

ttm_binned,ttm <= 1 month,ttm (1-2] month,ttm (2-3] month,ttm (3-6] month,ttm (6-12] month,ttm > 12 month
tick(all),53.332727,53.550037,53.522261,53.160915,52.851334,53.432381
tick(ex),49.625548,49.689669,49.607039,49.798354,49.656956,49.74948
quote(best),63.882513,65.397475,65.422539,64.184127,62.445047,57.988497
quote(ex),62.522034,64.400407,64.481152,63.436256,61.81123,57.829256
lr(ex),62.461965,64.315673,64.432409,63.354902,61.752208,57.852176
lr(best),63.808305,65.261791,65.313392,64.059836,62.355293,57.993314
rev_lr(ex),62.644623,64.498348,64.597881,63.564615,61.940079,58.018289
rev_lr(best),63.932338,65.404437,65.423339,64.204783,62.474696,58.103616
emo(ex),55.898716,56.551768,56.416862,55.416624,53.717634,50.779603
emo(best),57.663548,58.413786,58.156386,57.011923,55.141651,52.145511


In [18]:
result_dfs[5]

myn_binned,myn <= 0.7,myn (0.7-0.9],myn (0.9-1.1],myn (1.1-1.3],myn > 1.3
tick(all),51.880044,54.378771,53.662302,50.984724,50.797233
tick(ex),46.853104,49.644873,50.278239,48.376995,48.589495
quote(best),61.899357,66.351431,63.918702,60.28981,58.90849
quote(ex),60.873206,65.155983,62.748206,59.924671,58.317712
lr(ex),60.70827,65.103747,62.692581,59.856365,58.25679
lr(best),61.761158,66.257299,63.835151,60.203556,58.850379
rev_lr(ex),61.016323,65.27268,62.853195,60.059347,58.444755
rev_lr(best),61.937422,66.37157,63.955206,60.373272,58.973843
emo(ex),54.233186,57.734358,55.589143,52.183608,51.006457
emo(best),55.921048,59.600376,57.392796,53.2675,51.992379


In [19]:
result_dfs[6]

prox_q_binned,at mid,at quote,inside,outside
tick(all),48.354813,51.226827,54.17597,52.816735
tick(ex),47.028635,45.364142,50.711177,47.588104
quote(best),55.778173,58.965688,65.95007,54.334807
quote(ex),49.844272,58.646559,65.272531,53.869558
lr(ex),48.893222,58.646559,65.295629,53.869558
lr(best),54.967496,58.960312,65.922361,54.328355
rev_lr(ex),50.152927,58.646559,65.404763,53.869558
rev_lr(best),55.975015,58.998775,65.973706,54.369691
emo(ex),48.354813,58.646559,55.866623,53.759153
emo(best),53.357972,58.67837,57.551381,53.961556


In [20]:
print_keys = ["Option Type", "Security Type", "Trade Size", "Year", "Time to Maturity", "Moneyness","Location to Quote"]

master = pd.concat(result_dfs, axis=1, keys=print_keys).T

In [25]:
master.iloc[:,0:10]

Unnamed: 0,Unnamed: 1,tick(all),tick(ex),quote(best),quote(ex),lr(ex),lr(best),rev_lr(ex),rev_lr(best),emo(ex),emo(best)
Option Type,C,53.32064,49.406551,63.642245,62.68174,62.613399,63.551623,62.809223,63.6818,55.653509,57.300032
Option Type,P,53.305616,50.014733,63.813381,62.634438,62.585816,63.729395,62.756181,63.852476,55.126481,56.884877
Security Type,Index option,51.096852,49.946253,61.392291,60.56298,60.57225,61.358054,60.57509,61.425182,49.472916,49.643503
Security Type,Others,52.710833,49.032852,65.353648,64.425438,64.319831,65.247848,64.555439,65.394896,56.375943,58.145891
Security Type,Stock options,53.555241,49.873137,63.235512,62.135025,62.088789,63.152578,62.260435,63.274319,55.233955,56.933079
Trade Size,"(0,1]",52.009861,48.92774,59.648388,58.613592,58.588543,59.566481,58.773731,59.724322,51.933581,53.14727
Trade Size,"(1,3]",52.780268,49.606935,62.631963,61.605109,61.561729,62.529499,61.758749,62.683891,54.122951,55.661223
Trade Size,"(3,5]",52.576993,49.202498,63.057502,62.041579,61.954874,62.941639,62.207363,63.12389,54.514419,56.240599
Trade Size,"(5,11]",53.132934,49.559466,64.617288,63.588797,63.496833,64.523219,63.701634,64.642502,56.190442,58.143622
Trade Size,>11,56.388859,51.207259,70.050168,68.884684,68.811164,69.995727,68.914769,70.02089,61.50342,63.744405


In [26]:
master.iloc[:,11:-1]

Unnamed: 0,Unnamed: 1,rev_emo(best),clnv(ex),clnv(best),rev_clnv(ex),rev_clnv(best),trade_size(ex)->tick(all),trade_size(ex)->quote(best),trade_size(ex)->quote(best)->quote(ex),quote(best)->quote(ex)
Option Type,C,57.710848,58.296361,59.806855,58.789804,60.320531,61.275329,75.387336,75.513196,63.637924
Option Type,P,57.392221,57.765026,59.349557,58.35589,59.951271,59.689749,73.374947,73.526428,63.848081
Security Type,Index option,49.754436,51.568842,51.727618,51.710275,51.932591,55.542565,68.344624,68.437018,61.45
Security Type,Others,59.007814,59.196001,60.728547,60.013766,61.576776,59.577073,75.079283,75.306078,65.503757
Security Type,Stock options,57.258313,57.826586,59.398428,58.277447,59.859935,61.00548,74.438767,74.547253,63.202341
Trade Size,"(0,1]",53.884281,54.145997,55.439992,54.878553,56.223569,62.344979,74.599709,74.805963,59.739453
Trade Size,"(1,3]",56.333438,56.767994,58.25042,57.492726,58.97842,63.012207,77.945121,78.128294,62.680825
Trade Size,"(3,5]",56.986174,57.205062,58.797001,58.041222,59.570653,63.612356,79.651604,79.833781,63.064774
Trade Size,"(5,11]",58.520701,58.978964,60.661547,59.523393,61.159819,58.701591,72.38299,72.459196,64.585781
Trade Size,>11,63.443194,64.551995,66.324624,64.395782,66.268601,55.441527,69.423252,69.445714,69.964302


In [22]:
master.style.pipe(set_tex_style, caption=("master-short","master-long"), label=f"{key}-master")

## Effective Spread