In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

sys.path.append("..")
from otc.metrics.metrics import effective_spread

import wandb

from tqdm.auto import tqdm

from scipy.stats import wilcoxon
import warnings


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# set here globally
exchange = "ise"
models = ["gbm", "classical"] # ["gbm", "classical"]
subset = "test" # "all"
strategy = "supervised"  # "transfer"


In [3]:
key = f"{exchange}_{strategy}_{subset}"
dataset = f"fbv/thesis/{exchange}_{strategy}_raw:latest"

os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

run = wandb.init(project="thesis", entity="fbv")

# load unscaled data
artifact = run.use_artifact(dataset)  # type: ignore
data_dir = artifact.download()

# load results
result_dirs = []
for model in models:
    results = f"fbv/thesis/{exchange}_{model}_{strategy}_{subset}:latest"
    artifact = run.use_artifact(results)  # type: ignore
    result_dir = artifact.download()
    result_dirs.append(result_dir)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact ise_supervised_raw:latest, 2589.45MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact ise_gbm_supervised_test:latest, 129.80MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact ise_classical_supervised_test:latest, 278.79MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0


In [4]:
# p. 35-38
columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "ask_ex",
    "bid_ex",
    "myn",
    "OPTION_TYPE",
    "issue_type",
]


if subset == "all":
    train = pd.read_parquet(
        Path(data_dir, "train_set"), engine="fastparquet", columns=columns
    )
    val = pd.read_parquet(
        Path(data_dir, "val_set"), engine="fastparquet", columns=columns
    )
    test = pd.read_parquet(
        Path(data_dir, "test_set"), engine="fastparquet", columns=columns
    )
    eval_data = pd.concat([train, val, test])
    del train, val, test

elif subset == "test":
    eval_data = pd.read_parquet(
        Path(data_dir, "test_set"), engine="fastparquet", columns=columns
    )


results = []
for i, model in tqdm(enumerate(models)):
    result = pd.read_parquet(Path(result_dirs[i], "results"), engine="fastparquet")
    result.columns = pd.MultiIndex.from_product([[model], result.columns])
    results.append(result)

results_data = pd.concat(results, axis=1, names=models)

assert len(eval_data) == len(results_data)

X_print = eval_data

del results


2it [00:02,  1.27s/it]


In [5]:
# FIXME: select a subset of results for testing.
results_data = results_data[
    [
        ("gbm", "gbm(classical)"),
        ("gbm",'gbm(classical-retraining)'),
        ("gbm",'gbm(classical-size-retraining)'), 
        ("gbm", "gbm(classical-size)"),
        ("gbm", "gbm(ml)"),
        ("gbm",'gbm(ml-retraining)'),
        ("gbm",'gbm(semi-classical)'), 
        # ("gbm",'gbm(semi-classical-retraining)'),
        ("classical", "tick(ex)"),
        ("classical", "quote(ex)"),
        ("classical", "lr(ex)"),
        ("classical", "emo(ex)"),
        ("classical", "clnv(ex)"),
        (
            "classical",
            "trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)",
        ),
    ]
]


In [6]:
results_data


Unnamed: 0_level_0,gbm,gbm,gbm,gbm,gbm,gbm,gbm,gbm,classical,classical,classical,classical,classical,classical
Unnamed: 0_level_1,gbm(classical),gbm(classical-retraining),gbm(classical-size-retraining),gbm(classical-size),gbm(ml),gbm(ml-retraining),gbm(semi-classical),gbm(semi-classical-retraining),tick(ex),quote(ex),lr(ex),emo(ex),clnv(ex),trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
39342191,-1,-1,-1,-1,-1,-1,-1,-1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342190,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342189,1,1,1,1,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0
39342188,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342187,-1,-1,-1,-1,-1,-1,-1,-1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49203742,-1,-1,1,1,-1,-1,-1,-1,-1.0,1.0,1.0,1.0,1.0,1.0
49203743,1,1,1,1,1,1,1,1,-1.0,1.0,1.0,-1.0,1.0,1.0
49203744,1,1,1,1,1,1,1,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
49203745,1,1,1,1,1,1,1,1,-1.0,1.0,1.0,1.0,1.0,1.0


### Robustness Checks

In [7]:
# prepare columns for printing
X_print["ttm"] = (
    X_print["EXPIRATION"].dt.to_period("M")
    - X_print["QUOTE_DATETIME"].dt.to_period("M")
).apply(lambda x: x.n)

X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

# p. 38
bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

# p. 37
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "<= 1",
    "(1-2]",
    "(2-3]",
    "(3-6]",
    "(6-12]",
    "> 12",
]
X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# Security type
# see 3.0a-mb-explanatory-data-analysis.ipynb
X_print["issue_type"] = X_print["issue_type"].map(
    {
        "0": "Stock option",
        "A": "Index option",
        "7": "Others",
        "F": "Others",
        "%": "Others",
        " ": "Others",
    }
)

# Moneyness p. 38
bins_myn = [-1, 0.7, 0.9, 1.1, 1.3, np.inf]
myn_labels = [
    "<= 0.7 c",
    "(0.7-0.9]",
    "(0.9-1.1]",
    "(1.1-1.3]",
    "> 1.3",
]
X_print["myn_binned"] = pd.cut(X_print["myn"], bins_myn, labels=myn_labels)

# mid p. 31 + extra category for unknowns
ask = X_print["ask_ex"]
bid = X_print["bid_ex"]
trade_price = X_print["TRADE_PRICE"]

mid = np.where(ask >= bid, (ask + bid) * 0.5, np.nan)
half_spread = np.where(ask >= bid, (ask - bid) * 0.5, np.nan)

prox_quotes = np.where(
    trade_price == mid,
    0,  # at mid
    np.where(
        ((mid - half_spread) < trade_price) & (trade_price < (mid + half_spread)),
        1,  # inside
        np.where(
            (trade_price == (mid - half_spread)) | ((mid + half_spread) == trade_price),
            2,  # at quotes
            np.where(
                (trade_price < (mid - half_spread))
                | ((mid + half_spread) > trade_price),
                3,
                4,
            ),
        ),
    ),
)  # outside + unclassifiable

bins_prox = [-np.inf, 0, 1, 2, 3, 4]
prox_labels = [
    "at mid",
    "inside",
    "at quotes",
    "outside",
    "unknown",
]

X_print["prox_q_binned"] = pd.cut(prox_quotes, bins_prox, labels=prox_labels)
X_print["mid"] = mid

# clean up empty buckets, as it causes empty grouping in result set generation
X_print["year_binned"] = X_print["year_binned"].cat.remove_unused_categories()
X_print["myn_binned"] = X_print["myn_binned"].cat.remove_unused_categories()
X_print["ttm_binned"] = X_print["ttm_binned"].cat.remove_unused_categories()
X_print["prox_q_binned"] = X_print["prox_q_binned"].cat.remove_unused_categories()

X_print["all"] = "all"

X_print.drop(
    columns=[
        "EXPIRATION",
        "QUOTE_DATETIME",
        "TRADE_SIZE",
        "ttm",
        "myn",
        "ask_ex",
        "bid_ex",
        "year",
    ],
    inplace=True,
)


In [8]:
X_print.head(20)


Unnamed: 0_level_0,buy_sell,TRADE_PRICE,OPTION_TYPE,issue_type,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,prox_q_binned,mid,all
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
39342191,-1,3.5,P,Stock option,"(3,5]",2015,<= 1,(0.9-1.1],at quotes,3.675,all
39342190,-1,6.38,C,Stock option,"(0,1]",2015,<= 1,(0.9-1.1],inside,6.575,all
39342189,1,0.13,P,Others,"(0,1]",2015,<= 1,(0.7-0.9],at quotes,0.1,all
39342188,-1,0.04,C,Others,"(1,3]",2015,<= 1,(0.7-0.9],at quotes,0.07,all
39342187,-1,0.4,C,Others,"(3,5]",2015,<= 1,(0.9-1.1],inside,0.43,all
39342186,1,1.65,C,Others,"(3,5]",2015,(3-6],(0.9-1.1],inside,1.605,all
39342185,1,0.35,C,Stock option,"(0,1]",2015,<= 1,(0.7-0.9],unknown,,all
39342184,1,10.34,C,Stock option,>11,2015,<= 1,(1.1-1.3],inside,10.275,all
39342183,-1,15.17,C,Stock option,>11,2015,<= 1,(1.1-1.3],inside,15.275,all
39342182,-1,2.0,P,Others,"(0,1]",2015,(1-2],(1.1-1.3],inside,2.015,all


In [9]:
X_print = pd.concat([X_print, results_data], axis=1)


In [10]:
X_print.head()


Unnamed: 0_level_0,buy_sell,TRADE_PRICE,OPTION_TYPE,issue_type,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,prox_q_binned,mid,...,"(gbm, gbm(ml))","(gbm, gbm(ml-retraining))","(gbm, gbm(semi-classical))","(gbm, gbm(semi-classical-retraining))","(classical, tick(ex))","(classical, quote(ex))","(classical, lr(ex))","(classical, emo(ex))","(classical, clnv(ex))","(classical, trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all))"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39342191,-1,3.5,P,Stock option,"(3,5]",2015,<= 1,(0.9-1.1],at quotes,3.675,...,-1,-1,-1,-1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342190,-1,6.38,C,Stock option,"(0,1]",2015,<= 1,(0.9-1.1],inside,6.575,...,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342189,1,0.13,P,Others,"(0,1]",2015,<= 1,(0.7-0.9],at quotes,0.1,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0
39342188,-1,0.04,C,Others,"(1,3]",2015,<= 1,(0.7-0.9],at quotes,0.07,...,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342187,-1,0.4,C,Others,"(3,5]",2015,<= 1,(0.9-1.1],inside,0.43,...,-1,-1,-1,-1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## Results Set Generation

In [11]:
LUT = {
    "Trade_Size(ex)->Quote(Best)->Depth(Best)->Quote(Ex)->Depth(Ex)->Rev_Tick(All)": "\gls{GBM}",
    "(Ex)": " (Ex)",
    "(Best)": " (Best)",
    "(Classical)": " (Classical)",
    "(Classical-Size)": " (Classical, Size)",
    "Rev_": "Rev. ",
    "Trade_Size": "Trade Size",
    "Depth": "Depth",
    "->": " $\\to$ ",
    "Lr": "\gls{LR}",
    "Emo": "\gls{EMO}",
    "Clnv": "\gls{CLNV}",
    "OPTION_TYPE": "Option Type",
    "_": "$\_",
    "Gbm": "\gls{GBM}",
}

LUT_INDEX = {
    "OPTION_TYPE": "Option Type",
    "issue_type": "Security Type",
    "TRADE_SIZE_binned": "Trade Size",
    "year_binned": "Year",
    "ttm_binned": "Time to Maturity",
    "myn_binned": "Moneyness",
    "prox_q_binned": "Location to Quote",
    "all":"All trades",
}


def cell_str(x):
    x = x.title()
    for orig, sub in LUT.items():
        x = x.replace(orig, sub)
    # title-case everything
    return x


def highlight_max(s, props=""):
    return np.where(s == np.nanmax(s.values), props, "")


In [12]:
def set_tex_style(styler, caption, label, bold_axis=1):
    res = styler.set_caption(caption)
        

    res = (res.apply(
            highlight_max, props="font-weight:bold;", axis=bold_axis
        )
        .format(precision=4, decimal=".", thousands=",", escape=False, hyperlinks=None)
        .format_index(cell_str, axis=0)
        .format_index(cell_str, axis=1)
        .to_latex(
            f"{label}.tex",
            siunitx=True,
            position_float="centering",
            hrules=True,
            clines="skip-last;data",
            label="tab:" + label,
            caption=caption,
            convert_css=True,
        )
    )
    return res


In [13]:
classifiers = results_data.columns.tolist()
criterions = list(LUT_INDEX)


In [14]:
results_data.columns.tolist()


[('gbm', 'gbm(classical)'),
 ('gbm', 'gbm(classical-retraining)'),
 ('gbm', 'gbm(classical-size-retraining)'),
 ('gbm', 'gbm(classical-size)'),
 ('gbm', 'gbm(ml)'),
 ('gbm', 'gbm(ml-retraining)'),
 ('gbm', 'gbm(semi-classical)'),
 ('gbm', 'gbm(semi-classical-retraining)'),
 ('classical', 'tick(ex)'),
 ('classical', 'quote(ex)'),
 ('classical', 'lr(ex)'),
 ('classical', 'emo(ex)'),
 ('classical', 'clnv(ex)'),
 ('classical',
  'trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)')]

In [15]:
X_print.head()

Unnamed: 0_level_0,buy_sell,TRADE_PRICE,OPTION_TYPE,issue_type,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,prox_q_binned,mid,...,"(gbm, gbm(ml))","(gbm, gbm(ml-retraining))","(gbm, gbm(semi-classical))","(gbm, gbm(semi-classical-retraining))","(classical, tick(ex))","(classical, quote(ex))","(classical, lr(ex))","(classical, emo(ex))","(classical, clnv(ex))","(classical, trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all))"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39342191,-1,3.5,P,Stock option,"(3,5]",2015,<= 1,(0.9-1.1],at quotes,3.675,...,-1,-1,-1,-1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342190,-1,6.38,C,Stock option,"(0,1]",2015,<= 1,(0.9-1.1],inside,6.575,...,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342189,1,0.13,P,Others,"(0,1]",2015,<= 1,(0.7-0.9],at quotes,0.1,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0
39342188,-1,0.04,C,Others,"(1,3]",2015,<= 1,(0.7-0.9],at quotes,0.07,...,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342187,-1,0.4,C,Others,"(3,5]",2015,<= 1,(0.9-1.1],inside,0.43,...,-1,-1,-1,-1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [16]:
X_print[X_print["prox_q_binned"]=="unknown"]

Unnamed: 0_level_0,buy_sell,TRADE_PRICE,OPTION_TYPE,issue_type,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,prox_q_binned,mid,...,"(gbm, gbm(ml))","(gbm, gbm(ml-retraining))","(gbm, gbm(semi-classical))","(gbm, gbm(semi-classical-retraining))","(classical, tick(ex))","(classical, quote(ex))","(classical, lr(ex))","(classical, emo(ex))","(classical, clnv(ex))","(classical, trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all))"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39342185,1,0.35,C,Stock option,"(0,1]",2015,<= 1,(0.7-0.9],unknown,,...,1,1,1,1,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
39342179,1,11.30,C,Stock option,"(5,11]",2015,> 12,> 1.3,unknown,,...,1,1,1,1,1.0,-1.0,1.0,1.0,1.0,1.0
39342234,1,0.41,C,Others,"(5,11]",2015,<= 1,(0.9-1.1],unknown,,...,1,1,1,1,1.0,-1.0,-1.0,-1.0,-1.0,1.0
39342231,-1,0.68,P,Stock option,"(3,5]",2015,<= 1,(0.7-0.9],unknown,,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0
39342225,-1,0.03,C,Others,"(5,11]",2015,<= 1,(0.9-1.1],unknown,,...,-1,-1,-1,-1,-1.0,-1.0,1.0,1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49202809,-1,1.35,C,Stock option,"(1,3]",2017,(3-6],(0.9-1.1],unknown,1.300,...,-1,-1,1,-1,-1.0,1.0,1.0,1.0,1.0,-1.0
49202810,-1,1.35,C,Stock option,"(0,1]",2017,(3-6],(0.9-1.1],unknown,1.300,...,-1,-1,1,-1,-1.0,1.0,1.0,1.0,1.0,-1.0
49203153,1,0.21,C,Others,"(1,3]",2017,(1-2],(0.9-1.1],unknown,0.200,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0
49203246,1,0.21,C,Others,"(1,3]",2017,(1-2],(0.9-1.1],unknown,0.200,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
X_print.head().T

index,39342191,39342190,39342189,39342188,39342187
buy_sell,-1,-1,1,-1,-1
TRADE_PRICE,3.5,6.38,0.13,0.04,0.4
OPTION_TYPE,P,C,P,C,C
issue_type,Stock option,Stock option,Others,Others,Others
TRADE_SIZE_binned,"(3,5]","(0,1]","(0,1]","(1,3]","(3,5]"
year_binned,2015,2015,2015,2015,2015
ttm_binned,<= 1,<= 1,<= 1,<= 1,<= 1
myn_binned,(0.9-1.1],(0.9-1.1],(0.7-0.9],(0.7-0.9],(0.9-1.1]
prox_q_binned,at quotes,inside,at quotes,at quotes,inside
mid,3.675,6.575,0.1,0.07,0.43


## Accuracy Calculation

In [18]:
# FIXME: Find better approach
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

result_dfs = []

for criterion in tqdm(criterions):
    results = []
    for classifier in tqdm(classifiers):
        res = (
            X_print.groupby([criterion])[["buy_sell", classifier]]
            .apply(
                lambda x: accuracy_score(x["buy_sell"].astype("int8"), x[classifier])
            )
            .mul(100)
            .rename(classifier)
        )
#         acc_tot = accuracy_score(
#             X_print["buy_sell"].astype("int8"), X_print[classifier]
#         )

#         res.loc["all"] = acc_tot * 100

        res.index.name = LUT_INDEX.get(criterion)
        results.append(res)

    # save aggregated results
    result_df = pd.concat(results, axis=1).T
    result_df.style.pipe(
        set_tex_style,
        caption=(f"long-tbd", "short-tbd"),
        label=f"{key.lower()}-{criterion.lower()}",
    )

    # store all result sets for later use
    result_dfs.append(result_df)


  0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/14 [00:00<?, ?it/s][A
  7%|▋         | 1/14 [00:01<00:14,  1.14s/it][A
 14%|█▍        | 2/14 [00:02<00:13,  1.14s/it][A
 21%|██▏       | 3/14 [00:03<00:12,  1.14s/it][A
 29%|██▊       | 4/14 [00:04<00:11,  1.14s/it][A
 36%|███▌      | 5/14 [00:05<00:10,  1.14s/it][A
 43%|████▎     | 6/14 [00:06<00:09,  1.14s/it][A
 50%|█████     | 7/14 [00:07<00:07,  1.14s/it][A
 57%|█████▋    | 8/14 [00:09<00:06,  1.14s/it][A
 64%|██████▍   | 9/14 [00:10<00:06,  1.25s/it][A
 71%|███████▏  | 10/14 [00:12<00:05,  1.32s/it][A
 79%|███████▊  | 11/14 [00:13<00:04,  1.36s/it][A
 86%|████████▌ | 12/14 [00:14<00:02,  1.39s/it][A
 93%|█████████▎| 13/14 [00:16<00:01,  1.42s/it][A
100%|██████████| 14/14 [00:17<00:00,  1.28s/it][A
 12%|█▎        | 1/8 [00:21<02:32, 21.82s/it]
  0%|          | 0/14 [00:00<?, ?it/s][A
  7%|▋         | 1/14 [00:01<00:20,  1.59s/it][A
 14%|█▍        | 2/14 [00:03<00:18,  1.58s/it][A
 21%|██▏       | 3/14 [00:0

In [19]:
master = pd.concat(result_dfs, axis=1, keys=list(LUT_INDEX.values())).T


In [20]:
master


Unnamed: 0_level_0,Unnamed: 1_level_0,gbm,gbm,gbm,gbm,gbm,gbm,gbm,gbm,classical,classical,classical,classical,classical,classical
Unnamed: 0_level_1,Unnamed: 1_level_1,gbm(classical),gbm(classical-retraining),gbm(classical-size-retraining),gbm(classical-size),gbm(ml),gbm(ml-retraining),gbm(semi-classical),gbm(semi-classical-retraining),tick(ex),quote(ex),lr(ex),emo(ex),clnv(ex),trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)
Option Type,C,62.889592,65.781794,73.577088,71.895259,73.62473,75.738969,62.672208,65.810322,50.367984,56.342879,56.41846,53.50064,54.337599,66.851998
Option Type,P,64.544991,67.112672,74.375973,72.892594,74.593715,76.574748,64.306178,67.110782,50.113769,57.734551,57.799131,54.118041,55.226736,67.266573
Security Type,Index option,56.308184,56.614402,58.684208,57.640799,58.100126,60.379748,56.096478,56.683395,50.895498,53.731795,53.785666,51.27071,51.730037,58.515977
Security Type,Others,68.40553,70.200752,77.549609,76.385942,77.551217,79.554515,68.13018,70.226882,49.87573,62.398603,62.49744,57.504691,59.062633,69.879862
Security Type,Stock option,61.866828,65.027478,72.731024,70.962294,72.919301,74.987897,61.657881,65.036218,50.390021,54.86553,54.924781,52.33034,53.062516,66.029836
Trade Size,"(0,1]",61.886324,64.923233,73.782177,72.589837,74.242395,75.832485,61.677012,64.921102,50.303092,55.284462,55.422436,51.937941,52.965069,68.580024
Trade Size,"(1,3]",62.455088,65.420139,74.408553,73.006333,74.813673,76.620021,62.244914,65.436644,50.221643,55.309816,55.395938,51.945084,52.858331,68.657697
Trade Size,"(3,5]",62.683294,65.649639,74.374525,72.76689,74.557255,76.622135,62.461951,65.681205,49.652967,55.737893,55.769153,52.809933,53.602373,68.82461
Trade Size,"(5,11]",65.941513,68.272577,73.69041,71.771961,73.383853,75.96589,65.679505,68.288259,50.19077,59.868862,59.901621,57.160265,57.974111,63.200912
Trade Size,>11,67.305363,69.386769,73.596321,71.307368,73.084679,75.832389,67.050294,69.416185,50.750074,60.713971,60.697759,57.238314,58.454957,64.426


In [21]:
master.style.pipe(
    set_tex_style, caption=("master-long", "master-short"), label=f"{key}-master", bold_axis=0
)


## Effective Spread 🚧

In [22]:
classifiers


[('gbm', 'gbm(classical)'),
 ('gbm', 'gbm(classical-retraining)'),
 ('gbm', 'gbm(classical-size-retraining)'),
 ('gbm', 'gbm(classical-size)'),
 ('gbm', 'gbm(ml)'),
 ('gbm', 'gbm(ml-retraining)'),
 ('gbm', 'gbm(semi-classical)'),
 ('gbm', 'gbm(semi-classical-retraining)'),
 ('classical', 'tick(ex)'),
 ('classical', 'quote(ex)'),
 ('classical', 'lr(ex)'),
 ('classical', 'emo(ex)'),
 ('classical', 'clnv(ex)'),
 ('classical',
  'trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)')]

In [23]:
eff_dfs = []


def stats(x, classifier):

    nom = effective_spread(x[classifier], x["TRADE_PRICE"], x["mid"], mode="nominal")
    rel = (
        effective_spread(x[classifier], x["TRADE_PRICE"], x["mid"], mode="relative")
        * 100
    )

    # eff_spread_pred = effective_spread(x[classifier], x["TRADE_PRICE"], x["mid"], mode="none")
    # eff_spread_true = effective_spread(x["buy_sell"], x["TRADE_PRICE"], x["mid"], mode="none")
    # wilcoxon_res  = wilcoxon(eff_spread_pred, eff_spread_true, nan_policy="omit", zero_method="zsplit")

    return pd.Series(
        {
            "nominal": nom,
            "rel": rel,
            # 'statistic':wilcoxon_res.statistic,
            # 'pvalue':wilcoxon_res.pvalue
        }
    )


for criterion in tqdm(criterions):
    results = []

    for classifier in tqdm(classifiers):
        res = X_print.groupby([criterion])[
            ["TRADE_PRICE", "mid", classifier, "buy_sell"]
        ].apply(stats, classifier)
        results.append(res)

    # save aggregated results
    result_df = pd.concat(results, axis=1, keys=classifiers).T
    result_df.style.pipe(
        set_tex_style,
        caption=(f"long-tbd", "short-tbd"),
        label=f"{key.lower()}-{criterion.lower()}-eff-spread",
    )

    # store all result sets for later use
    eff_dfs.append(result_df)


  0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/14 [00:00<?, ?it/s][A
  7%|▋         | 1/14 [00:00<00:09,  1.41it/s][A
 14%|█▍        | 2/14 [00:01<00:08,  1.42it/s][A
 21%|██▏       | 3/14 [00:02<00:07,  1.42it/s][A
 29%|██▊       | 4/14 [00:02<00:07,  1.42it/s][A
 36%|███▌      | 5/14 [00:03<00:06,  1.42it/s][A
 43%|████▎     | 6/14 [00:04<00:05,  1.43it/s][A
 50%|█████     | 7/14 [00:04<00:04,  1.43it/s][A
 57%|█████▋    | 8/14 [00:05<00:04,  1.43it/s][A
 64%|██████▍   | 9/14 [00:06<00:03,  1.44it/s][A
 71%|███████▏  | 10/14 [00:06<00:02,  1.45it/s][A
 79%|███████▊  | 11/14 [00:07<00:02,  1.45it/s][A
 86%|████████▌ | 12/14 [00:08<00:01,  1.45it/s][A
 93%|█████████▎| 13/14 [00:09<00:00,  1.45it/s][A
100%|██████████| 14/14 [00:09<00:00,  1.44it/s][A
 12%|█▎        | 1/8 [00:09<01:08,  9.75s/it]
  0%|          | 0/14 [00:00<?, ?it/s][A
  7%|▋         | 1/14 [00:01<00:14,  1.15s/it][A
 14%|█▍        | 2/14 [00:02<00:13,  1.14s/it][A
 21%|██▏       | 3/14 [00:0

In [24]:
master_eff_spread = pd.concat(eff_dfs, axis=1, keys=LUT_INDEX.values()).T


In [25]:
master_eff_spread


Unnamed: 0_level_0,Unnamed: 1_level_0,gbm,gbm,gbm,gbm,gbm,gbm,gbm,gbm,gbm,gbm,...,classical,classical,classical,classical,classical,classical,classical,classical,classical,classical
Unnamed: 0_level_1,Unnamed: 1_level_1,gbm(classical),gbm(classical),gbm(classical-retraining),gbm(classical-retraining),gbm(classical-size-retraining),gbm(classical-size-retraining),gbm(classical-size),gbm(classical-size),gbm(ml),gbm(ml),...,quote(ex),quote(ex),lr(ex),lr(ex),emo(ex),emo(ex),clnv(ex),clnv(ex),trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all),trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)
Unnamed: 0_level_2,Unnamed: 1_level_2,nominal,rel,nominal,rel,nominal,rel,nominal,rel,nominal,rel,...,nominal,rel,nominal,rel,nominal,rel,nominal,rel,nominal,rel
Option Type,C,0.07902,8.840429,0.037355,6.206095,0.028276,5.316936,0.045046,6.70231,0.041781,6.126333,...,0.171287,15.89253,0.171287,15.89253,0.050367,8.205606,0.122153,13.0326,0.011159,3.941527
Option Type,P,0.068563,9.776834e+302,0.031439,9.776834e+302,0.023823,9.776834e+302,0.039538,9.776834e+302,0.032412,9.776834e+302,...,0.154249,9.776834e+302,0.154249,9.776834e+302,0.042093,9.776834e+302,0.109489,9.776834e+302,0.010753,9.776834e+302
Security Type,Index option,0.242608,2.844856,0.188044,1.865985,0.10272,0.311617,0.111968,0.562651,-0.030354,-1.031508,...,0.56371,7.273357,0.56371,7.273357,0.061348,1.705859,0.1821,4.122164,0.078758,1.617452
Security Type,Others,0.049453,12.65646,0.022307,10.77755,0.018159,9.881508,0.030849,10.92779,0.026782,10.18603,...,0.104831,17.44588,0.104831,17.44588,0.03104,11.80383,0.077537,15.14082,0.008131,6.266844
Security Type,Stock option,0.081498,3.234859e+302,0.037189,3.234859e+302,0.028246,3.234859e+302,0.046076,3.234859e+302,0.042696,3.234859e+302,...,0.180747,3.234859e+302,0.180747,3.234859e+302,0.052471,3.234859e+302,0.130781,3.234859e+302,0.011057,3.234859e+302
Trade Size,"(0,1]",0.087384,7.953118,0.041896,5.401829,0.0207,3.781112,0.038181,4.880296,0.03097,4.579393,...,0.187245,14.92213,0.187245,14.92213,0.051391,7.014344,0.128567,11.9349,0.013314,4.254477
Trade Size,"(1,3]",0.077321,8.345212,0.035934,5.696724,0.024812,4.230836,0.041989,5.753455,0.036798,5.330154,...,0.165866,15.43733,0.165866,15.43733,0.047579,7.523678,0.116841,12.3804,0.014517,4.537996
Trade Size,"(3,5]",0.0716,8.613399,0.033431,5.913642,0.026536,4.4907,0.04108,5.912364,0.036031,5.545709,...,0.158946,15.85658,0.158946,15.85658,0.043093,7.814296,0.113159,12.77355,0.008509,4.49794
Trade Size,"(5,11]",0.062259,11.36177,0.027265,8.636587,0.030305,8.724218,0.044524,9.965791,0.041767,9.298397,...,0.148602,18.73145,0.148602,18.73145,0.044677,10.96493,0.110004,15.78269,0.008261,3.032197
Trade Size,>11,0.057662,10.96564,0.026759,8.445091,0.034656,8.758865,0.050978,10.62914,0.047933,9.870423,...,0.130306,17.54531,0.130306,17.54531,0.039798,10.39769,0.099363,14.94917,0.006325,2.690303


In [26]:
master_eff_spread.style.pipe(
    set_tex_style,
    caption=("master-short", "master-long"),
    label=f"{key}-master-eff-spread",
    bold_axis=0,
)


## Change in Parenthesis 🅾️

In [27]:
master

Unnamed: 0_level_0,Unnamed: 1_level_0,gbm,gbm,gbm,gbm,gbm,gbm,gbm,gbm,classical,classical,classical,classical,classical,classical
Unnamed: 0_level_1,Unnamed: 1_level_1,gbm(classical),gbm(classical-retraining),gbm(classical-size-retraining),gbm(classical-size),gbm(ml),gbm(ml-retraining),gbm(semi-classical),gbm(semi-classical-retraining),tick(ex),quote(ex),lr(ex),emo(ex),clnv(ex),trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)
Option Type,C,62.889592,65.781794,73.577088,71.895259,73.62473,75.738969,62.672208,65.810322,50.367984,56.342879,56.41846,53.50064,54.337599,66.851998
Option Type,P,64.544991,67.112672,74.375973,72.892594,74.593715,76.574748,64.306178,67.110782,50.113769,57.734551,57.799131,54.118041,55.226736,67.266573
Security Type,Index option,56.308184,56.614402,58.684208,57.640799,58.100126,60.379748,56.096478,56.683395,50.895498,53.731795,53.785666,51.27071,51.730037,58.515977
Security Type,Others,68.40553,70.200752,77.549609,76.385942,77.551217,79.554515,68.13018,70.226882,49.87573,62.398603,62.49744,57.504691,59.062633,69.879862
Security Type,Stock option,61.866828,65.027478,72.731024,70.962294,72.919301,74.987897,61.657881,65.036218,50.390021,54.86553,54.924781,52.33034,53.062516,66.029836
Trade Size,"(0,1]",61.886324,64.923233,73.782177,72.589837,74.242395,75.832485,61.677012,64.921102,50.303092,55.284462,55.422436,51.937941,52.965069,68.580024
Trade Size,"(1,3]",62.455088,65.420139,74.408553,73.006333,74.813673,76.620021,62.244914,65.436644,50.221643,55.309816,55.395938,51.945084,52.858331,68.657697
Trade Size,"(3,5]",62.683294,65.649639,74.374525,72.76689,74.557255,76.622135,62.461951,65.681205,49.652967,55.737893,55.769153,52.809933,53.602373,68.82461
Trade Size,"(5,11]",65.941513,68.272577,73.69041,71.771961,73.383853,75.96589,65.679505,68.288259,50.19077,59.868862,59.901621,57.160265,57.974111,63.200912
Trade Size,>11,67.305363,69.386769,73.596321,71.307368,73.084679,75.832389,67.050294,69.416185,50.750074,60.713971,60.697759,57.238314,58.454957,64.426


In [28]:
base = master[[("classical","quote(ex)"),("classical", "trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)"), ("classical", "trade_size(ex)->quote(best)->depth(best)->quote(ex)->depth(ex)->rev_tick(all)")]]
revised = master[[("gbm","gbm(classical)"),("gbm","gbm(classical-size)"), ("gbm", "gbm(ml)")]]

In [34]:

def combine_results(revised: pd.DataFrame, base: pd.DataFrame) -> pd.DataFrame:
    """
    Generate print layout like in Grauer et al.

    https://tex.stackexchange.com/questions/430283/table-with-numbers-in-parentheses-in-siunitx/430290#430290

    # see p. https://texdoc.org/serve/siunitx/0
    """
    # first, second layer of colum index
    c_1 = revised.columns.get_level_values(1)
    c_2 = ["nom"]
    midx = pd.MultiIndex.from_product([c_1, c_2])

    # copy data from revised add as (column, "nom")
    combo = pd.DataFrame(revised.values, index=revised.index, columns=midx)

    
    
    for i, mul_col in enumerate(combo.columns):

        combo[(mul_col[0], "pm")] = (combo[mul_col] - base.iloc[:,i]).round(2)
        combo.sort_index(axis=1, inplace=True)
        
    return combo


In [35]:
diff = combine_results(revised, base)

diff.style.to_latex(
            f"diff-{key}.tex",
            siunitx=True,
            position_float="centering",
            hrules=True,
            clines="skip-last;data",
            label=f"tab:diff-{key}",
            caption=(f"long-diff-{key}", f"short-diff-{key}"),
            convert_css=True,
        )

In [36]:
diff


Unnamed: 0_level_0,Unnamed: 1_level_0,gbm(classical),gbm(classical),gbm(classical-size),gbm(classical-size),gbm(ml),gbm(ml)
Unnamed: 0_level_1,Unnamed: 1_level_1,nom,pm,nom,pm,nom,pm
Option Type,C,62.889592,6.55,71.895259,5.04,73.62473,6.77
Option Type,P,64.544991,6.81,72.892594,5.63,74.593715,7.33
Security Type,Index option,56.308184,2.58,57.640799,-0.88,58.100126,-0.42
Security Type,Others,68.40553,6.01,76.385942,6.51,77.551217,7.67
Security Type,Stock option,61.866828,7.0,70.962294,4.93,72.919301,6.89
Trade Size,"(0,1]",61.886324,6.6,72.589837,4.01,74.242395,5.66
Trade Size,"(1,3]",62.455088,7.15,73.006333,4.35,74.813673,6.16
Trade Size,"(3,5]",62.683294,6.95,72.76689,3.94,74.557255,5.73
Trade Size,"(5,11]",65.941513,6.07,71.771961,8.57,73.383853,10.18
Trade Size,>11,67.305363,6.59,71.307368,6.88,73.084679,8.66
