In [None]:
import os
import random
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import wandb
from scipy.stats import wilcoxon
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

# make modules in the parent directory importable
sys.path.append("..")


In [None]:
# Global run configuration; the cells below read these constants.
EXCHANGE = "ise" # exchange tag used in the artifact names, e.g. "ise"
MODELS = ["classical"] # any of "classical", "fttransformer", "gbm"
SUBSET = "all"  # "all" (train + val + test) or "test" (test set only)
STRATEGY = "supervised" # strategy tag used in the artifact names, e.g. "supervised"

# if True, the "_retrain" results artifact is loaded for the "gbm" model
RETRAIN = False

In [None]:
# Identifier of this configuration; reused for report file names and labels.
KEY = f"{EXCHANGE}_{STRATEGY}_{SUBSET}"
DATASET = f"fbv/thesis/{EXCHANGE}_{STRATEGY}_none:latest"

os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

run = wandb.init(project="thesis", entity="fbv")

# fetch the unscaled data set
artifact = run.use_artifact(DATASET)  # type: ignore
data_dir = artifact.download()

# fetch one results artifact per model; directories are kept in the order of MODELS
result_dirs = []
for model in MODELS:
    # a retrained variant is only requested for gbm
    results = (
        f"fbv/thesis/{EXCHANGE}_{model}_{STRATEGY}_{SUBSET}_retrain:latest"
        if model == "gbm" and RETRAIN
        else f"fbv/thesis/{EXCHANGE}_{model}_{STRATEGY}_{SUBSET}:latest"
    )
    artifact = run.use_artifact(results)  # type: ignore
    result_dir = artifact.download()
    result_dirs.append(result_dir)


In [None]:
# Columns needed for the descriptive tables (p. 35-38).
COLUMNS = [
    "buy_sell",
    "ttm",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "ask_ex",
    "ask_size_ex",
    "bid_ex",
    "bid_size_ex",
    "myn",
    "option_type",
    "issue_type",
    "prox_ex",
    # "price_ex_lag",
    # "price_all_lag",
    # "BEST_BID",
    # "BEST_ASK",
]

# parquet files that make up each supported SUBSET
SUBSET_FILES = {
    "all": ["train_set.parquet", "val_set.parquet", "test_set.parquet"],
    "test": ["test_set.parquet"],
}
if SUBSET not in SUBSET_FILES:
    # fail here with a clear message instead of a NameError on `eval_data` below
    raise ValueError(
        f"Unsupported SUBSET {SUBSET!r}; expected one of {sorted(SUBSET_FILES)}."
    )

# read only the required columns and stack the sets in train/val/test order
eval_data = pd.concat(
    [
        pd.read_parquet(Path(data_dir, file_name), engine="fastparquet", columns=COLUMNS)
        for file_name in SUBSET_FILES[SUBSET]
    ]
)

# result_dirs is filled by the download cell, one directory per entry of MODELS;
# a mismatch means MODELS was edited without re-running that cell
if len(result_dirs) != len(MODELS):
    raise ValueError(
        f"Got {len(result_dirs)} result directories for {len(MODELS)} models; "
        "re-run the artifact download cell."
    )

results = []
for model, result_dir in tqdm(zip(MODELS, result_dirs), total=len(MODELS)):
    result = pd.read_parquet(Path(result_dir, "results"), engine="fastparquet")
    # prefix the columns with the model name so several models can sit side by side
    result.columns = pd.MultiIndex.from_product([[model], result.columns])
    results.append(result)

results_data = pd.concat(results, axis=1, names=MODELS)

assert len(eval_data) == len(results_data), (
    f"eval data has {len(eval_data)} rows but results have {len(results_data)}"
)

# NOTE: X_print is an alias, not a copy; later cells add columns to eval_data as well
X_print = eval_data

del results


In [None]:
# Distinct security-type codes with their frequencies (NaN included); this is
# more informative than dumping the raw array.
X_print["issue_type"].value_counts(dropna=False)

In [None]:
# Trade size buckets. pd.cut intervals are right-closed, so the first bin is (-1, 1].
bins_tradesize = [-1, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

# Time to maturity, p. 37
bins_ttm = [-1, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "<= 1",
    "(1-2]",
    "(2-3]",
    "(3-6]",
    "(6-12]",
    "> 12",
]
X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)

# Security type
# see 3.0a-mb-explanatory-data-analysis.ipynb
issue_type_codes = {
    "0": "Stock option",
    "A": "Index option",
    "7": "Others",
    "F": "Others",
    "%": "Others",
    " ": "Others",
}
# The column is overwritten in place. Mapping every label onto itself keeps the
# cell idempotent: without these entries a second run would turn all values into NaN.
issue_type_map = {
    **issue_type_codes,
    **{label: label for label in issue_type_codes.values()},
}
# codes that are not listed above still become NaN
X_print["issue_type"] = X_print["issue_type"].map(issue_type_map)

# Moneyness p. 38
bins_myn = [-1, 0.7, 0.9, 1.1, 1.3, np.inf]
myn_labels = [
    "<= 0.7",
    "(0.7-0.9]",
    "(0.9-1.1]",
    "(1.1-1.3]",
    "> 1.3",
]
X_print["myn_binned"] = pd.cut(X_print["myn"], bins_myn, labels=myn_labels)

# mid p. 31 + extra category for unknowns
ask = X_print["ask_ex"]
bid = X_print["bid_ex"]
trade_price = X_print["TRADE_PRICE"]

# require ask >= bid
mid = np.where(ask >= bid, (ask + bid) * 0.5, np.nan)

# np.select takes the first condition that holds, so the order below is the priority.
# Rows where no condition holds (e.g. comparisons against NaN quotes) get code 4.
prox_conditions = [
    trade_price == mid,  # 0: at mid
    (bid < trade_price) & (trade_price < ask),  # 1: inside
    (trade_price == bid) | (ask == trade_price),  # 2: at quotes
    (trade_price < bid) | (ask < trade_price),  # 3: outside
]
prox_quotes = np.select(prox_conditions, [0, 1, 2, 3], default=4)

# right-closed bins turn the integer codes 0..4 into the labels below
bins_prox = [-np.inf, 0, 1, 2, 3, 4]
prox_labels = [
    "at mid",
    "inside",
    "at quotes",
    "outside",
    "unknown",
]

X_print["prox_q_binned"] = pd.cut(prox_quotes, bins_prox, labels=prox_labels)
X_print["mid"] = mid

# clean up empty buckets, as they cause empty groups in the result tables;
# TRADE_SIZE_binned is included so that all binned columns are treated alike
for binned_col in ["TRADE_SIZE_binned", "myn_binned", "ttm_binned", "prox_q_binned"]:
    X_print[binned_col] = X_print[binned_col].cat.remove_unused_categories()

# constant helper column: summing it in a pivot table counts trades
X_print["values"] = 1

In [None]:
# Inspect the frame after binning; the bare expression is rendered as the cell output.
X_print

In [None]:
# First rows, transposed so that every column becomes a row and wide frames stay readable.
X_print.head().T

## Trade size

In [None]:
# Trade counts by trade-size bucket (rows) and quote-proximity bucket (columns).
# "values" is set to 1 for every row, so summing it counts trades.
pivot_table = pd.pivot_table(
    X_print,
    values="values",
    columns="prox_q_binned",
    index="TRADE_SIZE_binned",
    # string alias: passing the builtin `sum` is deprecated in recent pandas
    aggfunc="sum",
    fill_value=0,
    margins=True,
)
# the last column is the "All" margin, so this gives shares within each row
pivot_table.div(pivot_table.iloc[:, -1], axis=0)

## Moneyness

In [None]:
# Trade counts by trade-size bucket (rows) and moneyness bucket (columns).
# "values" is set to 1 for every row, so summing it counts trades.
pivot_table = pd.pivot_table(
    X_print,
    values="values",
    columns="myn_binned",
    index="TRADE_SIZE_binned",
    # string alias: passing the builtin `sum` is deprecated in recent pandas
    aggfunc="sum",
    fill_value=0,
    margins=True,
)
# the last column is the "All" margin, so this gives shares within each row
pivot_table.div(pivot_table.iloc[:, -1], axis=0)

## Time-to-maturity

In [None]:
# Savickas: trades with longer maturity tend to be smaller.
# Trade counts by time-to-maturity bucket (rows) and trade-size bucket (columns);
# "values" is set to 1 for every row, so summing it counts trades.
pivot_table = pd.pivot_table(
    X_print,
    values="values",
    index="ttm_binned",
    columns="TRADE_SIZE_binned",
    # string alias: passing the builtin `sum` is deprecated in recent pandas
    aggfunc="sum",
    fill_value=0,
    margins=True,
)
# the last column is the "All" margin, so this gives shares within each row
pivot_table.div(pivot_table.iloc[:, -1], axis=0)

## Index Options

In [None]:
# Trade counts per security type; "values" is set to 1 for every row,
# so summing it counts trades.
pivot_table = pd.pivot_table(
    X_print,
    values="values",
    index="issue_type",
    columns=None,
    # string alias: passing the builtin `sum` is deprecated in recent pandas
    aggfunc="sum",
    fill_value=0,
    margins=True,
)
# the last row is the "All" margin, so this gives each type's share of all trades
pivot_table.div(pivot_table.iloc[-1], axis=1)


## Distribution of Trades

In [None]:
# Trade counts per quote-proximity bucket; "values" is set to 1 for every row,
# so summing it counts trades.
pivot_table = pd.pivot_table(
    X_print,
    values="values",
    index="prox_q_binned",
    columns=None,
    # string alias: passing the builtin `sum` is deprecated in recent pandas
    aggfunc="sum",
    fill_value=0,
    margins=True,
)
# the last row is the "All" margin, so this gives each bucket's share of all trades
pivot_table.div(pivot_table.iloc[-1], axis=1)

In [None]:
# Effective spread under the true trade direction vs. each classifier's prediction.
#
# NOTE(review): `effective_spread` and `classifiers` are not defined in the visible
# part of this notebook, and X_print has no per-classifier columns after the loading
# cell (results_data is never joined). Confirm where they come from before relying
# on Restart & Run All.

# mid p. 31
ask = X_print["ask_ex"]
bid = X_print["bid_ex"]
trade_price = X_print["TRADE_PRICE"]

# require ask >= bid
mid = np.where(ask >= bid, (ask + bid) * 0.5, np.nan)

spread_summaries = []

# nominal effective spread from the true trade direction; mode="none" presumably
# returns one value per trade rather than an aggregate -- confirm against the helper
es_true = effective_spread(X_print["buy_sell"], X_print["TRADE_PRICE"], mid, mode="none")
nom_true = np.nanmean(es_true)

# relative spread = nominal spread / mid. The output is pre-filled with NaN because
# np.divide leaves entries with mid == 0 untouched; with np.empty those entries would
# hold uninitialised memory and distort the mean and the test below.
eps_true = np.full(es_true.shape, np.nan)
np.divide(es_true, mid, out=eps_true, where=mid != 0)
rel_true = np.nanmean(eps_true)


for classifier in tqdm(classifiers):

    # same quantities, based on the direction predicted by the classifier
    es_pred = effective_spread(X_print[classifier], X_print["TRADE_PRICE"], mid, mode="none")

    eps_pred = np.full(es_pred.shape, np.nan)
    np.divide(es_pred, mid, out=eps_pred, where=mid != 0)

    # paired test of predicted vs. true relative spreads; NaN pairs are omitted
    wilcoxon_res = wilcoxon(eps_pred, eps_true, nan_policy="omit", zero_method="zsplit")

    res = pd.Series(
        {
            "nom_pred": np.nanmean(es_pred),
            "rel_pred": np.nanmean(eps_pred),
            "statistic": wilcoxon_res.statistic,
            "pvalue": wilcoxon_res.pvalue,
        },
        name=classifier,
    )
    spread_summaries.append(res)

# reference column for the true spread; there is nothing to test it against.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
true_eff = pd.Series(
    {"nom_pred": nom_true, "rel_pred": rel_true, "statistic": np.nan, "pvalue": np.nan},
    name="true_eff",
)

spread_summaries.append(true_eff)

# one column per classifier plus "true_eff"
results = pd.concat(spread_summaries, axis=1)

In [None]:
# Preview the summary transposed (one row per column of `results`), formatted to three decimals.
results.T.style.format("{:.3f}")


In [None]:
# Write the transposed summary as a LaTeX table; file name and label are derived from KEY.
latex_path = f"../reports/Content/{KEY}-eff-spread.tex"
latex_options = dict(
    siunitx=True,
    position_float="centering",
    hrules=True,
    clines="skip-last;data",
    label=f"tab:eff-{KEY}",
    # (full caption, short caption)
    caption=(f"long-eff-{KEY}", f"short-eff-{KEY}"),
    convert_css=True,
)
results.T.style.to_latex(latex_path, **latex_options)
