In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

sys.path.append("..")
from otc.metrics.metrics import effective_spread

import wandb

from tqdm.notebook import tqdm


In [2]:
# set here globally
exchange = "ise"
models = "gbm"
subset = "test" # "all"
strategy = "supervised"

key = f"{exchange}_{models}_{strategy}_{subset}"

In [3]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

run = wandb.init(project="thesis", entity="fbv")

dataset = f"fbv/thesis/{exchange}_{strategy}_unscaled:latest"
results = f"fbv/thesis/{key}:latest"

# load unscaled data
artifact = run.use_artifact(dataset) # type: ignore
data_dir = artifact.download()

# load results
artifact = run.use_artifact(results) # type: ignore
results_dir = artifact.download()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact ise_supervised_unscaled:latest, 3391.53MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [4]:
# p. 35-38
columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "ask_ex",
    "bid_ex",
    "myn",
    "OPTION_TYPE",
    "issue_type"
]


if subset == "all":
    train = pd.read_parquet(
        Path(data_dir, "train_set_extended_60"), engine="fastparquet", columns=columns
    )
    val = pd.read_parquet(
        Path(data_dir, "val_set_extended_20"), engine="fastparquet", columns=columns
    )
    test = pd.read_parquet(
        Path(data_dir, "test_set_extended_20"), engine="fastparquet", columns=columns
    )
    eval_data = pd.concat([train,val,test])
    del train, val, test
    
elif subset == "test":
    eval_data = pd.read_parquet(
        Path(data_dir, "test_set_extended_20"), engine="fastparquet", columns=columns
    )

results_data = pd.read_parquet(
    Path(results_dir, "results"), engine="fastparquet"
)


assert len(eval_data) == len(results_data)

X_print = eval_data


### Robustness Checks

In [5]:
X_print.head()

Unnamed: 0,buy_sell,EXPIRATION,QUOTE_DATETIME,TRADE_SIZE,TRADE_PRICE,ask_ex,bid_ex,myn,OPTION_TYPE,issue_type
39342171,-1,2015-12-18,2015-11-06 09:30:00,2,0.52,0.6,0.52,0.921659,P,%
39342172,-1,2015-11-27,2015-11-06 09:30:00,1,7.82,8.15,7.6,1.001696,C,0
39342173,-1,2017-01-20,2015-11-06 09:30:00,1,28.889999,32.049999,28.799999,0.886115,C,0
39342174,1,2015-11-20,2015-11-06 09:30:00,1,2.25,2.25,1.85,1.009261,C,%
39342175,-1,2015-12-18,2015-11-06 09:30:00,1,1.7,1.95,1.7,0.99188,C,%


In [6]:

# prepare columns for printing
X_print["ttm"] = (
    X_print["EXPIRATION"].dt.to_period("M")
    - X_print["QUOTE_DATETIME"].dt.to_period("M")
).apply(lambda x: x.n)

X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

# p. 38
bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

# p. 37
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# Security type
# see 3.0a-mb-explanatory-data-analysis.ipynb
X_print["issue_type"] = X_print["issue_type"].map({'0': 'Stock options', 'A': 'Index option', '7': 'Others', 'F':'Others', '%': 'Others', ' ': 'Others'})

# Moneyness p. 38
bins_myn = [-1, 0.7, 0.9, 1.1, 1.3, np.inf]
myn_labels = [
    "myn <= 0.7",
    "myn (0.7-0.9]",
    "myn (0.9-1.1]",
    "myn (1.1-1.3]",
    "myn > 1.3",
]
X_print["myn_binned"] = pd.cut(X_print["myn"], bins_myn, labels=myn_labels)

# proximity to quotes
mid_ex = 0.5 * (X_print["ask_ex"] + X_print["bid_ex"])
spread_ex = X_print["ask_ex"] - X_print["bid_ex"]
X_print["prox_q"] = (X_print["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)
X_print["mid"] = mid_ex

# p. 31
def map_quotes(x):
    abs_x = np.abs(x)
    if 1 < x < np.inf:
        return "outside"
    elif x == 0:
        return "at mid"
    elif x == 1:
        return "at quote"
    return "inside"
    
X_print["prox_q_binned"] = X_print["prox_q"].apply(map_quotes)

# clean up empty buckets, as it causes empty grouping in result set generatio
X_print["year_binned"] = X_print["year_binned"].cat.remove_unused_categories()
X_print["myn_binned"] = X_print["myn_binned"].cat.remove_unused_categories()
X_print["ttm_binned"] = X_print["ttm_binned"].cat.remove_unused_categories()


X_print.drop(columns=["EXPIRATION","QUOTE_DATETIME", "TRADE_SIZE", "ttm", "myn", "prox_q", "ask_ex", "bid_ex", "year"], inplace=True)


In [7]:
X_print.head(20)

Unnamed: 0,buy_sell,TRADE_PRICE,OPTION_TYPE,issue_type,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,mid,prox_q_binned
39342171,-1,0.52,P,Others,"(1,3]",2015,ttm <= 1 month,myn (0.9-1.1],0.56,inside
39342172,-1,7.82,C,Stock options,"(0,1]",2015,ttm <= 1 month,myn (0.9-1.1],7.875,inside
39342173,-1,28.889999,C,Stock options,"(0,1]",2015,ttm > 12 month,myn (0.7-0.9],30.424999,inside
39342174,1,2.25,C,Others,"(0,1]",2015,ttm <= 1 month,myn (0.9-1.1],2.05,outside
39342175,-1,1.7,C,Others,"(0,1]",2015,ttm <= 1 month,myn (0.9-1.1],1.825,inside
39342176,1,0.35,P,Others,"(3,5]",2015,ttm <= 1 month,myn (0.9-1.1],0.28,inside
39342177,1,0.44,P,Others,"(3,5]",2015,ttm <= 1 month,myn (0.9-1.1],0.36,inside
39342178,-1,0.72,P,Others,"(3,5]",2015,ttm <= 1 month,myn (0.9-1.1],0.81,inside
39342179,1,11.3,C,Stock options,"(5,11]",2015,ttm > 12 month,myn > 1.3,,inside
39342180,1,2.19,P,Stock options,"(1,3]",2015,ttm > 12 month,myn (0.7-0.9],1.98,inside


In [8]:
X_print = pd.concat([X_print, results_data], axis=1)

## Results Set Generation

In [9]:
LUT = {"(ex)": " (ex)", 
       "(best)": " (best)",
       "rev_": "Rev. ",
       "tick": "Tick",
       "quote": "Quote",
       "trade_size": "Trade Size",
       "depth": "Depth",
       "->": " $\\to$ ",
       "lr": "\gls{LR}",
       "emo": "\gls{EMO}", 
       "clnv": "\gls{CLNV}",
      "OPTION_TYPE": "Option Type",
      "(":"$(", # put interval start in math env
      "]":"]$", # put interval end in math env
      "_":"$\_"
      }
       
def cell_str(x):
    for orig, sub in LUT.items():
        x = x.replace(orig,sub)
    return x

In [10]:
def set_tex_style(styler, caption, label):
    res = (
    styler.set_caption(caption)
    # .hide(axis="index")
    .format_index(cell_str, axis = 0)
    .format_index(cell_str, axis = 1)
    .format(precision=4, decimal='.', thousands=",", escape=False, hyperlinks=None)
    .to_latex(f"{label}.tex", siunitx=True, position_float="centering", hrules=True, clines="skip-last;data",
                      label="tab:"+label, caption=caption)
             )
    return res

In [11]:
classifiers = results_data.columns.tolist()
criterions = ["OPTION_TYPE", "issue_type", "TRADE_SIZE_binned", "year_binned","ttm_binned","myn_binned","prox_q_binned"]

## Accurcay Calculation

In [12]:
result_dfs = []

for criterion in tqdm(criterions):
    results = []
    for classifier in tqdm(classifiers):
        acc_tot = accuracy_score(X_print["buy_sell"], X_print[classifier])
        res = (
            X_print.groupby([criterion])[["buy_sell", classifier]]
            .apply(lambda x: accuracy_score(x["buy_sell"], x[classifier]))
            .mul(100)
            .rename(classifier)
            )
        res.loc["all"] = acc_tot * 100
        results.append(res)

    # save aggregated results
    result_df = pd.concat(results, axis=1).T
    result_df.style.pipe(set_tex_style, caption=(f"long-tbd","short-tbd"), label=f"{key.lower()}-{criterion.lower()}")

    # store all result sets for later use
    result_dfs.append(result_df)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
result_dfs[0]

OPTION_TYPE,C,P,all
classical-size,71.871219,72.845956,72.326249


In [14]:
result_dfs[1]

issue_type,Index option,Others,Stock options,all
classical-size,57.421532,76.345621,70.932783,72.326249


In [15]:
result_dfs[2]

TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11,all
classical-size,72.534139,72.950857,72.748349,71.768243,71.297236,72.326249


In [16]:
result_dfs[3]

year_binned,2015,2016,2017,all
classical-size,69.011751,72.517449,72.998745,72.326249


In [17]:
result_dfs[4]

ttm_binned,ttm <= 1 month,ttm (1-2] month,ttm (2-3] month,ttm (3-6] month,ttm (6-12] month,ttm > 12 month,all
classical-size,72.732527,72.815921,71.803625,71.275752,71.226768,68.59547,72.326249


In [18]:
result_dfs[5]

myn_binned,myn <= 0.7,myn (0.7-0.9],myn (0.9-1.1],myn (1.1-1.3],myn > 1.3,all
classical-size,71.865108,74.258422,72.949282,66.248435,63.042252,72.326249


In [19]:
result_dfs[6]

prox_q_binned,at mid,at quote,inside,outside,all
classical-size,72.120396,86.092666,70.939826,87.596314,72.326249


In [20]:
print_keys = ["Option Type", "Security Type", "Trade Size", "Year", "Time to Maturity", "Moneyness","Location to Quote"]

master = pd.concat(result_dfs, axis=1, keys=print_keys).T

In [21]:
master.iloc[:,0:10]

Unnamed: 0,Unnamed: 1,classical-size
Option Type,C,71.871219
Option Type,P,72.845956
Option Type,all,72.326249
Security Type,Index option,57.421532
Security Type,Others,76.345621
Security Type,Stock options,70.932783
Security Type,all,72.326249
Trade Size,"(0,1]",72.534139
Trade Size,"(1,3]",72.950857
Trade Size,"(3,5]",72.748349


In [22]:
master.iloc[:,11:-1]

Unnamed: 0,Unnamed: 1
Option Type,C
Option Type,P
Option Type,all
Security Type,Index option
Security Type,Others
Security Type,Stock options
Security Type,all
Trade Size,"(0,1]"
Trade Size,"(1,3]"
Trade Size,"(3,5]"


In [23]:
master.style.pipe(set_tex_style, caption=("master-short","master-long"), label=f"{key}-master")

## Effective Spread

In [24]:
eff_dfs = []

classifiers.extend(["buy_sell"])

for criterion in tqdm(criterions):
    results = []
    for classifier in tqdm(classifiers):
        res = (X_print.groupby([criterion])[["TRADE_PRICE", "mid", classifier]]
            .apply(lambda x: pd.Series({'nominal':  effective_spread(x[classifier], x["TRADE_PRICE"], x["mid"], mode="nominal"), 
                                        'rel': effective_spread(x[classifier], x["TRADE_PRICE"], x["mid"], mode="relative") * 100}) 
            ))
        results.append(res)

    # save aggregated results
    result_df = pd.concat(results, axis=1, keys=classifiers).T
    result_df.style.pipe(set_tex_style, caption=(f"long-tbd","short-tbd"), label=f"{key.lower()}-{criterion.lower()}-eff-spread")

    # store all result sets for later use
    eff_dfs.append(result_df)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
eff_dfs[0]

Unnamed: 0,OPTION_TYPE,C,P
classical-size,nominal,0.045034,0.03936
classical-size,rel,6.663335,7.277869
buy_sell,nominal,0.004681,0.005291
buy_sell,rel,3.207661,4.310417


In [26]:
eff_dfs[1]

Unnamed: 0,issue_type,Index option,Others,Stock options
classical-size,nominal,0.128587,0.03054,0.045819
classical-size,rel,0.61176,10.930197,5.449726
buy_sell,nominal,0.033419,0.002169,0.005633
buy_sell,rel,0.407809,7.701407,2.175841


In [27]:
eff_dfs[2]

Unnamed: 0,TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
classical-size,nominal,0.038047,0.04227,0.040207,0.043894,0.051746
classical-size,rel,4.898021,5.758031,5.903699,9.915985,10.658864
buy_sell,nominal,0.002483,0.00445,0.00255,0.008393,0.009054
buy_sell,rel,2.509151,2.655425,2.759644,6.324218,5.848047


In [28]:
eff_dfs[3]

Unnamed: 0,year_binned,2015,2016,2017
classical-size,nominal,0.049976,0.043096,0.038179
classical-size,rel,6.17053,6.977945,7.15211
buy_sell,nominal,0.004728,0.007661,-0.001208
buy_sell,rel,2.566138,3.989698,3.495558


In [29]:
eff_dfs[4]

Unnamed: 0,ttm_binned,ttm <= 1 month,ttm (1-2] month,ttm (2-3] month,ttm (3-6] month,ttm (6-12] month,ttm > 12 month
classical-size,nominal,0.032901,0.044063,0.051212,0.063586,0.067209,0.108283
classical-size,rel,8.813692,4.097784,3.194841,3.071064,2.486236,2.139164
buy_sell,nominal,0.000629,0.008578,0.007093,0.015821,0.015205,0.029327
buy_sell,rel,4.957269,2.03894,1.25125,0.972358,0.829343,0.405703


In [30]:
eff_dfs[5]

Unnamed: 0,myn_binned,myn <= 0.7,myn (0.7-0.9],myn (0.9-1.1],myn (1.1-1.3],myn > 1.3
classical-size,nominal,0.019004,0.02725,0.041359,0.075467,0.098973
classical-size,rel,13.287595,13.433276,5.810759,0.96234,1.017483
buy_sell,nominal,-0.006607,0.004152,0.009502,-0.014094,-0.027938
buy_sell,rel,6.751363,8.50754,2.912377,-0.312249,-0.205555


In [31]:
eff_dfs[6]

Unnamed: 0,prox_q_binned,at mid,at quote,inside,outside
classical-size,nominal,0.0,-0.031389,0.054409,-0.014778
classical-size,rel,0.0,42.180633,5.327692,9.675288
buy_sell,nominal,0.0,-0.029704,0.008802,-0.022766
buy_sell,rel,0.0,28.324667,2.35371,8.698831


In [32]:
master = pd.concat(eff_dfs, axis=1, keys=print_keys).T

In [33]:
master

Unnamed: 0_level_0,Unnamed: 1_level_0,classical-size,classical-size,buy_sell,buy_sell
Unnamed: 0_level_1,Unnamed: 1_level_1,nominal,rel,nominal,rel
Option Type,C,0.045034,6.663335,0.004681,3.207661
Option Type,P,0.03936,7.277869,0.005291,4.310417
Security Type,Index option,0.128587,0.61176,0.033419,0.407809
Security Type,Others,0.03054,10.930197,0.002169,7.701407
Security Type,Stock options,0.045819,5.449726,0.005633,2.175841
Trade Size,"(0,1]",0.038047,4.898021,0.002483,2.509151
Trade Size,"(1,3]",0.04227,5.758031,0.00445,2.655425
Trade Size,"(3,5]",0.040207,5.903699,0.00255,2.759644
Trade Size,"(5,11]",0.043894,9.915985,0.008393,6.324218
Trade Size,>11,0.051746,10.658864,0.009054,5.848047


In [34]:
master.iloc[:,0:20]

Unnamed: 0_level_0,Unnamed: 1_level_0,classical-size,classical-size,buy_sell,buy_sell
Unnamed: 0_level_1,Unnamed: 1_level_1,nominal,rel,nominal,rel
Option Type,C,0.045034,6.663335,0.004681,3.207661
Option Type,P,0.03936,7.277869,0.005291,4.310417
Security Type,Index option,0.128587,0.61176,0.033419,0.407809
Security Type,Others,0.03054,10.930197,0.002169,7.701407
Security Type,Stock options,0.045819,5.449726,0.005633,2.175841
Trade Size,"(0,1]",0.038047,4.898021,0.002483,2.509151
Trade Size,"(1,3]",0.04227,5.758031,0.00445,2.655425
Trade Size,"(3,5]",0.040207,5.903699,0.00255,2.759644
Trade Size,"(5,11]",0.043894,9.915985,0.008393,6.324218
Trade Size,>11,0.051746,10.658864,0.009054,5.848047


In [35]:
master.iloc[:,21:40]

Option Type,C
Option Type,P
Security Type,Index option
Security Type,Others
Security Type,Stock options
Trade Size,"(0,1]"
Trade Size,"(1,3]"
Trade Size,"(3,5]"
Trade Size,"(5,11]"
Trade Size,>11
Year,2015
Year,2016
Year,2017
Time to Maturity,ttm <= 1 month
Time to Maturity,ttm (1-2] month
Time to Maturity,ttm (2-3] month
Time to Maturity,ttm (3-6] month
Time to Maturity,ttm (6-12] month
Time to Maturity,ttm > 12 month
Moneyness,myn <= 0.7
Moneyness,myn (0.7-0.9]
Moneyness,myn (0.9-1.1]
Moneyness,myn (1.1-1.3]
Moneyness,myn > 1.3
Location to Quote,at mid
Location to Quote,at quote
Location to Quote,inside
Location to Quote,outside


In [36]:
master.iloc[:,41:-1]

Option Type,C
Option Type,P
Security Type,Index option
Security Type,Others
Security Type,Stock options
Trade Size,"(0,1]"
Trade Size,"(1,3]"
Trade Size,"(3,5]"
Trade Size,"(5,11]"
Trade Size,>11
Year,2015
Year,2016
Year,2017
Time to Maturity,ttm <= 1 month
Time to Maturity,ttm (1-2] month
Time to Maturity,ttm (2-3] month
Time to Maturity,ttm (3-6] month
Time to Maturity,ttm (6-12] month
Time to Maturity,ttm > 12 month
Moneyness,myn <= 0.7
Moneyness,myn (0.7-0.9]
Moneyness,myn (0.9-1.1]
Moneyness,myn (1.1-1.3]
Moneyness,myn > 1.3
Location to Quote,at mid
Location to Quote,at quote
Location to Quote,inside
Location to Quote,outside


In [37]:
master.style.pipe(set_tex_style, caption=("master-short","master-long"), label=f"{key}-master-eff-spread")

## Change in Parenthesis

```latex
# https://tex.stackexchange.com/questions/430283/table-with-numbers-in-parentheses-in-siunitx/430290#430290
\begin{table}
    \centering
    \caption{test of combination with change}
    \label{tab:combo}
    \begin{tabular}{lSSSSSSSS}
        \toprule
        {} & \multicolumn{2}{l}{Index option} & \multicolumn{2}{l}{Others} & \multicolumn{2}{l}{Stock options} & \multicolumn{2}{l}{all} \\
        \midrule
        classical-size & 1.0 & \parl-56.42\parr & 2.0 & \parl-74.35\parr & -73.5 & \parl-143.93\parr & 5.0 & \parl-67.33\parr \\
        \bottomrule
        \end{tabular}
\end{table}
```

In [123]:
foo = result_dfs[1].copy()
bar = pd.DataFrame([[1.1,2 ,73, 5]], columns=foo.columns, index=foo.index)

In [124]:
foo

issue_type,Index option,Others,Stock options,all
classical-size,57.421532,76.345621,70.932783,72.326249


In [125]:
bar

issue_type,Index option,Others,Stock options,all
classical-size,1.1,2,73,5


In [135]:
def combine_results(revised: pd.DataFrame, base: pd.DataFrame) -> pd.DataFrame:
    """
    Generate print layout like in Grauer et al.
    
    https://tex.stackexchange.com/questions/430283/table-with-numbers-in-parentheses-in-siunitx/430290#430290
    
    # see p. https://texdoc.org/serve/siunitx/0
    """
    # first, second layer of colum index
    c_1 = revised.columns
    c_2 = ["nom"]
    midx = pd.MultiIndex.from_product([c_1, c_2])
    
    # copy data from revised add as (column, "nom")
    combo = pd.DataFrame(revised.values, index=revised.index, columns=midx)
    
    for i, mul_col in enumerate(combo.columns):
        
        # define custom brackets that are not parsed by sunitx
        combo[[(mul_col[0], "pm")]] = (
            "\parl" + (combo[mul_col] - base[mul_col[0]]).round(2).astype(str) + "\parr"
            )
        # sort to group together columns
        combo.sort_index(axis=1, inplace=True)
    return combo

In [136]:
combo = combine_results(bar, foo)

# manually replace S with S[table-format=1.4(5)] if needed
combo.style.to_latex(f"combo.tex", siunitx=True, position_float="centering", hrules=True, clines="skip-last;data", label="tab:combo", caption="test of combination with change", multicol_align="l")
             

In [137]:
combo

issue_type,Index option,Index option,Others,Others,Stock options,Stock options,all,all
Unnamed: 0_level_1,nom,pm,nom,pm,nom,pm,nom,pm
classical-size,1.1,\parl-56.32\parr,2.0,\parl-74.35\parr,73.0,\parl2.07\parr,5.0,\parl-67.33\parr
