In [44]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

sys.path.append("..")


import wandb

from tqdm.notebook import tqdm


In [2]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

run = wandb.init(project="thesis", entity="fbv")

dataset = "fbv/thesis/train_val_test_ultra:v0"
results = "fbv/thesis/results_classical_clf:v0"

fname_dataset = "test_set_extended_20"
fname_results = "results_classical_clf_ise"


# load unscaled data
artifact = run.use_artifact(dataset) # type: ignore
data_dir = artifact.download()

# load results
artifact = run.use_artifact(results) # type: ignore
results_dir = artifact.download()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact train_val_test_ultra:v0, 3391.53MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact results_classical_clf:v0, 63.63MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0


In [36]:
# p. 35-38

columns = [
    "buy_sell",
    "EXPIRATION",
    "QUOTE_DATETIME",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "ask_ex",
    "bid_ex",
    "myn",
    "OPTION_TYPE",
    "issue_type"
]

eval_data = pd.read_parquet(
    Path(data_dir, fname_dataset), engine="fastparquet", columns=columns
)
results_data = pd.read_parquet(
    Path(results_dir, fname_results), engine="fastparquet"
)

assert len(eval_data) == len(results_data)


### Robustness Checks

In [37]:
X_print.head()

Unnamed: 0,buy_sell,EXPIRATION,QUOTE_DATETIME,TRADE_SIZE,TRADE_PRICE,ask_ex,bid_ex,myn,OPTION_TYPE,issue_type
39342171,-1,2015-12-18,2015-11-06 09:30:00,2,0.52,0.6,0.52,0.921659,P,%
39342172,-1,2015-11-27,2015-11-06 09:30:00,1,7.82,8.15,7.6,1.001696,C,0
39342173,-1,2017-01-20,2015-11-06 09:30:00,1,28.889999,32.049999,28.799999,0.886115,C,0
39342174,1,2015-11-20,2015-11-06 09:30:00,1,2.25,2.25,1.85,1.009261,C,%
39342175,-1,2015-12-18,2015-11-06 09:30:00,1,1.7,1.95,1.7,0.99188,C,%


In [38]:

# prepare columns for printing
X_print["ttm"] = (
    X_print["EXPIRATION"].dt.to_period("M")
    - X_print["QUOTE_DATETIME"].dt.to_period("M")
).apply(lambda x: x.n)

X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

# p. 38
bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

# p. 37
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# Security type
# see 3.0a-mb-explanatory-data-analysis.ipynb
X_print["issue_type"] = X_print["issue_type"].map({'0': 'Stock options', 'A': 'Index option', '7': 'Others', 'F':'Others', '%': 'Others', ' ': 'Others'})

# Moneyness p. 38
bins_myn = [-1, 0.7, 0.9, 1.1, 1.3, np.inf]
myn_labels = [
    "myn <= 0.7",
    "myn (0.7-0.9]",
    "myn (0.9-1.1]",
    "myn (1.1-1.3]",
    "myn > 1.3",
]
X_print["myn_binned"] = pd.cut(X_print["myn"], bins_myn, labels=myn_labels)

# proximity to quotes
mid_ex = 0.5 * (X_print["ask_ex"] + X_print["bid_ex"])
spread_ex = X_print["ask_ex"] - X_print["bid_ex"]
X_print["prox_q"] = (X_print["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)

# p. 31
def map_quotes(x):
    abs_x = np.abs(x)
    if 1 < x < np.inf:
        return "outside"
    elif x == 0:
        return "at mid"
    elif x == 1:
        return "at quote"
    else:
        return "inside"
    
X_print["prox_q_binned"] = X_print["prox_q"].apply(map_quotes)

# clean up empty buckets, as it causes empty grouping in result set generatio
X_print["year_binned"] = X_print["year_binned"].cat.remove_unused_categories()
X_print["myn_binned"] = X_print["myn_binned"].cat.remove_unused_categories()
X_print["ttm_binned"] = X_print["ttm_binned"].cat.remove_unused_categories()


X_print.drop(columns=["EXPIRATION","QUOTE_DATETIME", "TRADE_SIZE", "ttm", "myn", "TRADE_PRICE", "prox_q", "ask_ex", "bid_ex", "year"], inplace=True)


In [112]:
X_print.head(20)

Unnamed: 0,buy_sell,OPTION_TYPE,issue_type,year,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,prox_q_binned,tick_all,...,rev_emo_best,clnv_ex,clnv_best,rev_clnv_ex,rev_clnv_best,trade_size_ex->tick_all,trade_size_ex->quote_best,trade_size_ex->quote_best->quote_ex,quote_best->quote_ex,trade_size_ex->depth_ex->quote_best->rev_lr_ex
39342171,-1,P,Others,2015,"(1,3]",2015,ttm <= 1 month,myn (0.9-1.1],inside,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0
39342172,-1,C,Stock options,2015,"(0,1]",2015,ttm <= 1 month,myn (0.9-1.1],inside,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342173,-1,C,Stock options,2015,"(0,1]",2015,ttm > 12 month,myn (0.7-0.9],inside,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342174,1,C,Others,2015,"(0,1]",2015,ttm <= 1 month,myn (0.9-1.1],outside,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
39342175,-1,C,Others,2015,"(0,1]",2015,ttm <= 1 month,myn (0.9-1.1],inside,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342176,1,P,Others,2015,"(3,5]",2015,ttm <= 1 month,myn (0.9-1.1],inside,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
39342177,1,P,Others,2015,"(3,5]",2015,ttm <= 1 month,myn (0.9-1.1],inside,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
39342178,-1,P,Others,2015,"(3,5]",2015,ttm <= 1 month,myn (0.9-1.1],inside,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
39342179,1,C,Stock options,2015,"(5,11]",2015,ttm > 12 month,myn > 1.3,inside,1.0,...,-1.0,1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0
39342180,1,P,Stock options,2015,"(1,3]",2015,ttm > 12 month,myn (0.7-0.9],inside,-1.0,...,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0,1.0


In [41]:
X_print = pd.concat([X_print, results_data], axis=1)

In [116]:
classifiers = results_data.columns.tolist()
criterions = ["OPTION_TYPE", "issue_type", "TRADE_SIZE_binned", "year_binned","ttm_binned","myn_binned","prox_q_binned"]

In [117]:
result_dfs = []

for criterion in tqdm(criterions):
    results = []
    for classifier in tqdm(classifiers):
        res = (X_print.groupby([criterion])[["buy_sell", classifier]]
            .apply(lambda x: accuracy_score(x["buy_sell"], x[classifier]))
            .mul(100)
            .rename(classifier))
        results.append(res)

    # save aggregated results
    result_df = pd.concat(results, axis=1).T
    result_dfs.append(result_df)

    # https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.format.html#pandas.io.formats.style.Styler.format
    # tex_df = results_df.style.format(decimal=',', thousands='.', precision=2)
    # tex_df.to_latex(f"results_classical_rules_{criterion}.tex", siunitx=True)
    
    result_df.style.to_latex(f"results_classical_rules_{criterion}.tex", siunitx=True)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [118]:
result_dfs[0]

OPTION_TYPE,C,P
tick_all,53.5739,53.078335
tick_ex,50.372016,50.097673
quote_best,58.711418,60.582925
quote_ex,56.344838,57.740199
lr_ex,56.418422,57.801998
lr_best,58.774523,60.594004
rev_lr_ex,56.486528,57.888561
rev_lr_best,58.792914,60.650285
emo_ex,53.505205,54.11172
emo_best,55.509078,56.409156


In [119]:
result_dfs[1]

issue_type,Index option,Others,Stock options
tick_all,51.554245,53.257312,53.40457
tick_ex,50.87187,49.870976,50.384689
quote_best,57.8147,64.962501,57.447408
quote_ex,53.720453,62.409791,54.866364
lr_ex,53.782831,62.501336,54.925098
lr_best,57.820371,64.982376,57.494341
rev_lr_ex,53.667527,62.607179,54.993031
rev_lr_best,57.820371,65.030275,57.526223
emo_ex,51.285832,57.505227,52.329176
emo_best,51.291502,60.096614,54.321831


In [120]:
result_dfs[2]

TRADE_SIZE_binned,"(0,1]","(1,3]","(3,5]","(5,11]",>11
tick_all,52.848691,52.830617,52.4113,53.630853,55.550232
tick_ex,50.28974,50.22292,49.648294,50.186182,50.750727
quote_best,58.064698,57.950477,58.330811,62.096424,63.258972
quote_ex,55.290135,55.30764,55.736591,59.89012,60.706061
lr_ex,55.423031,55.397594,55.774439,59.902085,60.697694
lr_best,58.131272,57.984292,58.363067,62.118031,63.271784
rev_lr_ex,55.480578,55.452455,55.932575,59.987698,60.765154
rev_lr_best,58.176407,58.033288,58.418537,62.14864,63.260737
emo_ex,51.933208,51.950476,52.81453,57.157071,57.237072
emo_best,54.044949,54.083813,54.966063,59.130466,59.628784


In [121]:
result_dfs[3]

year_binned,2015,2016,2017
tick_all,52.788691,53.281666,53.670502
tick_ex,50.381045,50.18311,50.339021
quote_best,56.015922,59.729647,60.451627
quote_ex,54.878241,57.578845,56.356885
lr_ex,54.887934,57.639197,56.462584
lr_best,56.014491,59.765951,60.509837
rev_lr_ex,54.995435,57.719699,56.520163
rev_lr_best,56.094015,59.811811,60.508576
emo_ex,52.835172,54.268153,52.995354
emo_best,53.673481,56.083783,56.330285


In [122]:
result_dfs[4]

ttm_binned,ttm <= 1 month,ttm (1-2] month,ttm (2-3] month,ttm (3-6] month,ttm (6-12] month,ttm > 12 month
tick_all,53.070524,53.404782,53.689628,53.882093,54.278666,54.903416
tick_ex,50.124143,50.234919,50.465958,50.823732,50.631747,50.270019
quote_best,60.417557,60.309049,59.459459,57.685187,57.463633,50.800908
quote_ex,57.307837,57.732111,57.093101,55.978766,56.419706,52.670582
lr_ex,57.366438,57.765969,57.177345,56.112714,56.47325,52.850764
lr_best,60.43845,60.311838,59.50221,57.790397,57.546824,51.028867
rev_lr_ex,57.469937,57.832292,57.172525,56.099023,56.494991,52.86652
rev_lr_best,60.505662,60.322195,59.447095,57.750558,57.490944,51.003962
emo_ex,54.413938,53.550872,52.892041,52.026183,51.948605,51.190243
emo_best,56.810467,56.021255,54.903822,53.498997,52.866938,51.452765


In [123]:
result_dfs[5]

myn_binned,myn <= 0.7,myn (0.7-0.9],myn (0.9-1.1],myn (1.1-1.3],myn > 1.3
tick_all,54.477316,55.496913,52.948661,51.638894,52.034053
tick_ex,49.538611,50.675156,50.186938,50.100022,50.042366
quote_best,61.25958,63.39487,60.04545,50.023348,48.772913
quote_ex,60.365344,60.374359,57.084807,49.931615,48.710672
lr_ex,60.428423,60.574909,57.115398,49.995855,48.819439
lr_best,61.388021,63.54531,60.040269,50.099607,48.903247
rev_lr_ex,60.572848,60.548998,57.214293,50.109692,48.858878
rev_lr_best,61.458235,63.472991,60.103392,50.166887,48.882295
emo_ex,58.223516,57.563791,53.144117,49.876078,50.041134
emo_best,59.487948,59.872819,55.577691,50.262765,50.184718


In [124]:
result_dfs[6]

prox_q_binned,at mid,at quote,inside,outside
tick_all,51.026223,53.438907,53.556692,55.386836
tick_ex,49.21332,45.837103,50.762235,47.280674
quote_best,55.901058,59.749921,60.019153,59.826754
quote_ex,49.98518,59.519436,57.60581,59.58508
lr_ex,50.625876,59.519436,57.610258,59.58508
lr_best,56.239301,59.893951,60.009859,59.998813
rev_lr_ex,51.205073,59.519436,57.632824,59.58508
rev_lr_best,56.682999,59.751566,60.013477,59.858002
emo_ex,51.026223,59.519436,53.558564,59.556997
emo_best,55.385577,59.260986,55.668726,59.168183
