In [None]:
import os
import sys
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score
import wandb
from tqdm.auto import tqdm

sys.path.append("..")
from otc.features.build_features import features_classical_size
from otc.models.classical_classifier import ClassicalClassifier


In [None]:
# Global run configuration -- every tunable of this notebook lives in this cell.
seed = 42

exchange = "ise"  # alternative: "cboe"
models = "classical"
subset = "val"  # alternatives: "test", "all"
strategy = "supervised"  # alternative: "transfer"


In [None]:
# key used for files and artefacts; with the defaults above this is
# "ise_classical_supervised_val"
key = f"{exchange}_{models}_{strategy}_{subset}"

# qualified name of the W&B artifact to load (latest version);
# the "_none" suffix presumably marks the unscaled data set -- TODO confirm
dataset = f"fbv/thesis/{exchange}_{strategy}_none:latest"


In [None]:
# Set the Google Cloud project for this kernel session
# (presumably read by the client that resolves gs:// paths -- TODO confirm).
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"
# Start a Weights & Biases run; `run` is used below to fetch and log artifacts.
run = wandb.init(project="thesis", entity="fbv")

# load unscaled data
artifact = run.use_artifact(dataset)
# `data_dir` is used as the base directory of the parquet files read below
data_dir = artifact.download()


In [None]:
# Columns read from the parquet files: the classical-rule features plus the
# "buy_sell" column, which is split off as the label after loading.
columns = [*features_classical_size, "buy_sell"]


In [None]:
# Parquet files that make up each supported value of `subset`.
subset_files = {
    "all": ["train_set.parquet", "val_set.parquet", "test_set.parquet"],
    "val": ["val_set.parquet"],
    "test": ["test_set.parquet"],
}

if subset not in subset_files:
    # Fail early with a clear message; otherwise `data` would stay undefined
    # and the cell would die with a NameError further down.
    raise ValueError(
        f"Unknown subset {subset!r}; expected one of {sorted(subset_files)}."
    )

frames = [
    pd.read_parquet(Path(data_dir, file_name), engine="fastparquet", columns=columns)
    for file_name in subset_files[subset]
]
# A single file is used as-is; several files are stacked in train/val/test order.
data = frames[0] if len(frames) == 1 else pd.concat(frames)
# drop the intermediate frames to free memory
del frames

# split label and features
y_test = data["buy_sell"].astype("int8")
X_test = data.drop(columns="buy_sell")

del data


In [None]:
# Rule stacks to evaluate. Each entry is a list of (rule, subset) tuples that is
# passed unchanged as `layers` to the classifier; the tuple order within an entry
# is the order in which the stack is written in the generated name.
rules = [
    # classical rules
    [("tick", "all")],
    [("tick", "ex")],
    [("tick", "all"), ("tick", "ex")],
    [("tick", "ex"), ("tick", "all")],
    [("rev_tick", "all")],
    [("rev_tick", "ex")],
    [("rev_tick", "all"), ("rev_tick", "ex")],
    [("rev_tick", "ex"), ("rev_tick", "all")],
    [("quote", "best")],
    [("quote", "ex")],
    [("quote", "best"), ("quote", "ex")],  # murjajev
    [("quote", "ex"), ("quote", "best")],
    [("lr", "ex")],
    [("lr", "best")],
    [("lr", "ex"), ("lr", "best")],
    [("lr", "best"), ("lr", "ex")],
    [("rev_lr", "ex")],
    [("rev_lr", "best")],
    [("rev_lr", "ex"), ("rev_lr", "best")],
    [("rev_lr", "best"), ("rev_lr", "ex")],
    [("emo", "ex")],
    [("emo", "best")],
    [("emo", "ex"), ("emo", "best")],
    [("emo", "best"), ("emo", "ex")],
    [("rev_emo", "ex")],
    [("rev_emo", "best")],
    [("rev_emo", "ex"), ("rev_emo", "best")],
    [("rev_emo", "best"), ("rev_emo", "ex")],
    [("clnv", "ex")],
    [("clnv", "best")],
    [("clnv", "ex"), ("clnv", "best")],
    [("clnv", "best"), ("clnv", "ex")],
    [("rev_clnv", "ex")],
    [("rev_clnv", "best")],
    [("rev_clnv", "ex"), ("rev_clnv", "best")],
    [("rev_clnv", "best"), ("rev_clnv", "ex")],
    # advanced rules
    [
        ("trade_size", "ex"),
        ("quote", "best"),
        ("quote", "ex"),
    ],
    [("trade_size", "ex"), ("rev_lr", "best")],
    [
        ("trade_size", "ex"),
        ("quote", "best"),
        ("quote", "ex"),
        ("depth", "best"),
        ("depth", "ex"),
        ("rev_tick", "all"),
    ],  # p. 13 grauer
]

# One readable name per rule stack, e.g. "tick(all)->tick(ex)"; `names[i]`
# belongs to `rules[i]`. No progress bar: this is a handful of string joins.
names = [
    "->".join(f"{rule}({source})" for rule, source in layers) for layers in rules
]


In [None]:
# One prediction vector per rule stack, collected in the order of `rules`.
results = []

for layers in tqdm(rules):
    classifier = ClassicalClassifier(
        layers=layers, random_state=seed, strategy="random"
    )
    # Author's note: fit only sets the sklearn attributes, so calling it on the
    # first five rows of the evaluation data does not leak information.
    classifier.fit(X=X_test.head(5), y=y_test.head(5))
    results.append(classifier.predict(X_test).astype(int))


In [None]:
# Turn the list of prediction vectors into a frame with one column per rule name,
# aligned to the index of the evaluated data. Note that this rebinds `results`
# from a list to a DataFrame.
results = pd.DataFrame(
    {rule_name: predictions for rule_name, predictions in zip(names, results)},
    index=X_test.index,
)


In [None]:
%%script false --no-raise-error
# NOTE: this cell is switched off by the `%%script false --no-raise-error` magic
# on its first line; remove the magic to actually write the predictions.
# Target location of the prediction frame in the results bucket.
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet"
)
results.to_parquet(output_path)


In [None]:
%%script false --no-raise-error
# NOTE: this cell is switched off by the `%%script false --no-raise-error` magic
# on its first line, so `wandb.finish()` below is not executed either.
# Log the artifact to save it as an output of this run
result_set = wandb.Artifact(name=key, type="results")
# `output_path` is defined in the upload cell; the artifact only stores a
# reference to that location.
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

wandb.finish()


## Selection of benchmark 🧮
Run with `subset = "val"`, `exchange = "ise"`, and the classifier's `strategy = "random"`.

In [None]:
# (rule name, accuracy) pairs, comparing each rule's predictions with the labels
scores = [
    (rule_name, accuracy_score(y_test, results[rule_name]))
    for rule_name in tqdm(names)
]
    

In [None]:
# Column 0 holds the rule name, column 1 its accuracy.
scores = pd.DataFrame(scores)
# Index by rule name and rank from best to worst accuracy.
scores_df = (
    scores
    .set_index(0, drop=True)
    .sort_values(by=1, ascending=False)
)

In [None]:
# Show the accuracy ranking, best-performing rule first.
scores_df

In [None]:
# Maps title-cased tokens of a rule name to their LaTeX representation.
# Order matters: replacements are applied in insertion order, so the "Rev_*"
# keys must come before their plain counterparts ("Rev_Tick" before "Tick").
# All backslashes are escaped explicitly; "\o" is not a valid escape sequence
# and triggers a warning on recent Python versions.
LUT = {
    "Rev_Tick": "\\operatorname{rtick}",
    "Rev_Lr": "\\operatorname{rlr}",
    "Rev_Emo": "\\operatorname{remo}",
    "Rev_Clnv": "\\operatorname{rclnv}",
    "Tick": "\\operatorname{tick}",
    "Quote": "\\operatorname{quote}",
    "(Ex)": "_{\\text{ex}}",
    "(Best)": "_{\\text{nbbo}}",
    "(All)": "_{\\text{all}}",
    "Depth": "\\operatorname{depth}",
    "Trade_Size": "\\operatorname{tsize}",
    "Lr": "\\operatorname{lr}",
    "Emo": "\\operatorname{emo}",
    "Clnv": "\\operatorname{clnv}",
    "->": " \\to ",
}


def cell_str(x):
    """Convert a rule name such as "tick(all)->tick(ex)" into inline LaTeX math.

    Args:
        x: rule name as generated for the result columns.

    Returns:
        The name with every token from ``LUT`` substituted, wrapped in ``$...$``.
    """
    # title-case everything first, because the keys of LUT are title-cased
    x = x.title()
    for orig, sub in LUT.items():
        x = x.replace(orig, sub)
    return "$" + x + "$"


In [None]:
def set_tex_style(styler, caption, label, bold_axis=1):
    """Export a styled frame as a LaTeX table below ``../reports/Content``.

    Args:
        styler: pandas ``Styler`` of the frame to export.
        caption: caption set on the styler and passed on to ``to_latex``.
        label: name of the table. It is used verbatim for the file name
            (``../reports/Content/{label}.tex``) and, prefixed with ``tab:``
            unless it already starts with it, as the LaTeX label.
        bold_axis: currently unused; kept so existing calls keep working.

    Returns:
        Whatever ``Styler.to_latex`` returns; as a file path is passed, this is
        expected to be ``None`` (see the pandas documentation).
    """
    # The call site in this notebook already passes a label starting with "tab:";
    # prefixing unconditionally would produce "tab:tab:...".
    tex_label = label if label.startswith("tab:") else "tab:" + label

    res = styler.set_caption(caption)

    res = (
        res
        .format(precision=4, decimal=".", thousands=",", escape=False, hyperlinks=None)
        # render the row labels (rule names) as LaTeX math
        .format_index(cell_str, axis=0)
        .to_latex(
            f"../reports/Content/{label}.tex",
            siunitx=True,
            position_float="centering",
            hrules=True,
            clines="skip-last;data",
            label=tex_label,
            caption=caption,
        )
    )
    return res

In [None]:
# (long, short) caption pair for the exported table
table_caption = (
    f"long-hyperparam-classical-{key}",
    f"short-hyperparam-classical-{key}",
)
# write the ranking to the report folder as a LaTeX table
set_tex_style(
    scores_df.style,
    caption=table_caption,
    label=f"tab:hyperparam-classical-{key}",
    bold_axis=0,
)
scores_df