In [59]:
import os
import sys
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score
import wandb
from tqdm.auto import tqdm

sys.path.append("..")
from otc.features.build_features import features_classical_size
from otc.models.classical_classifier import ClassicalClassifier


In [60]:
# Global experiment settings; adjust them here only.
seed = 42  # handed to the classifier as `random_state`

exchange = "ise"  # alternative: "cboe"
models = "classical"
subset = "test"  # alternatives handled by the loading cell: "all", "val"
strategy = "supervised"  # alternative: "transfer"


In [61]:
# identifier reused for the result file and the logged artefact
key = "{}_{}_{}_{}".format(exchange, models, strategy, subset)

# wandb artefact (latest version) that provides the input data
dataset = "fbv/thesis/{}_{}_none:latest".format(exchange, strategy)


In [62]:
# Set the Google Cloud project id for this process.
# NOTE(review): presumably required for the gs:// access further down -- confirm.
# The id is hard-coded; consider supplying it from outside the notebook.
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"
# start the wandb run that is used to fetch the input and log the output artefact
run = wandb.init(project="thesis", entity="fbv")

# load unscaled data: resolve the artefact named in `dataset` through the run
# and download it; the parquet files are later read relative to `data_dir`
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


[34m[1mwandb[0m: Downloading large artifact ise_supervised_none:latest, 4203.92MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0


In [63]:
# restrict the parquet reads to the classical-size features plus the label
columns = list(features_classical_size) + ["buy_sell"]


In [64]:
def _read_split(split: str) -> pd.DataFrame:
    """Load the selected columns of one split from the downloaded artefact.

    Args:
        split: Name of the split; the file read is `<split>_set.parquet`
            inside `data_dir`.

    Returns:
        Data frame restricted to `columns`.
    """
    return pd.read_parquet(
        Path(data_dir, f"{split}_set.parquet"), engine="fastparquet", columns=columns
    )


if subset == "all":
    # stack the three splits in the order train, val, test; the temporary
    # frames are released as soon as the concatenation is done
    data = pd.concat([_read_split(split) for split in ("train", "val", "test")])
elif subset in ("val", "test"):
    data = _read_split(subset)
else:
    # fail with a clear message instead of a NameError on `data` below
    raise ValueError(
        f"Unknown subset {subset!r}; expected one of 'all', 'val' or 'test'."
    )

# split into label (stored as int8) and feature matrix
y_test = data["buy_sell"].astype("int8")
X_test = data.drop(columns="buy_sell")

# the combined frame is no longer needed once label and features are separated
del data


In [65]:
def _rule_variants(rule, first, second):
    """Build the four layer stacks of one rule family.

    Order of the returned stacks: `first` alone, `second` alone,
    `first` followed by `second`, `second` followed by `first`.
    """
    head, tail = (rule, first), (rule, second)
    return [[head], [tail], [head, tail], [tail, head]]


# classical rule families as (rule, first variant, second variant)
_classical_families = [
    ("tick", "all", "ex"),
    ("rev_tick", "all", "ex"),
    ("quote", "best", "ex"),  # quote(best)->quote(ex) is the murjajev stack
    ("lr", "ex", "best"),
    ("rev_lr", "ex", "best"),
    ("emo", "ex", "best"),
    ("rev_emo", "ex", "best"),
    ("clnv", "ex", "best"),
    ("rev_clnv", "ex", "best"),
]

# classical
rules = [
    stack for family in _classical_families for stack in _rule_variants(*family)
]

# advanced rules
rules += [
    [("trade_size", "ex"), ("quote", "best"), ("quote", "ex")],
    [("trade_size", "ex"), ("rev_lr", "best")],
    [
        ("trade_size", "ex"),
        ("quote", "best"),
        ("quote", "ex"),
        ("depth", "best"),
        ("depth", "ex"),
        ("rev_tick", "all"),
    ],  # p. 13 grauer
]

# readable label per rule stack, e.g. "quote(best)->quote(ex)"
names = [
    "->".join(f"{layer}({variant})" for layer, variant in stack)
    for stack in tqdm(rules)
]


100%|██████████| 39/39 [00:00<00:00, 118363.14it/s]


In [66]:
def _predict_rule(rule):
    """Return the integer predictions of one rule stack for all of `X_test`."""
    clf = ClassicalClassifier(layers=rule, random_state=seed, strategy="const")
    # the fit on five rows only sets the sklearn attributes, so no leakage
    clf.fit(X=X_test.head(5), y=y_test.head(5))
    return clf.predict(X_test).astype(int)


# one prediction array per entry of `rules`, in the same order
results = [_predict_rule(rule) for rule in tqdm(rules)]


100%|██████████| 39/39 [00:45<00:00,  1.16s/it]


In [67]:
# One column per rule name, rows aligned with `X_test`.
# NOTE: this rebinds `results` from the list of predictions to a data frame,
# so the cell is only valid when run once right after the prediction cell.
results = pd.DataFrame(
    {label: preds for label, preds in zip(names, results)}, index=X_test.index
)


In [68]:
# write the per-rule predictions to the results folder of the bucket
results_dir = "gs://thesis-bucket-option-trade-classification/data/results"
output_path = f"{results_dir}/{key}.parquet"
results.to_parquet(output_path)


In [69]:
# Log the artifact to save it as an output of this run.
# The artifact is named after `key` and gets the parquet file written to
# `output_path` attached as a reference under the entry name "results".
result_set = wandb.Artifact(name=key, type="results")
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

# end the wandb run started earlier in the notebook
wandb.finish()




## Selection of benchmark🧮
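Run on `subset = "val"`, `exchange = "ise"`, and `strategy = random`. Note: `strategy = random` presumably refers to the `strategy` argument of `ClassicalClassifier` (the prediction cell currently passes `"const"`), not to the global `strategy` setting (TODO confirm).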

In [12]:
# accuracy of each rule's predictions against the true labels
scores = [
    (label, accuracy_score(y_test, results[label])) for label in tqdm(names)
]
    

100%|██████████| 39/39 [00:33<00:00,  1.16it/s]


In [17]:
# Name the columns explicitly; without names the frame is labelled 0 and 1,
# which then shows up as headers in the displayed ranking and the LaTeX export.
scores = pd.DataFrame(scores, columns=["rule", "accuracy"])
# rank the rules from most to least accurate and index the table by rule name
scores_df = scores.sort_values(by="accuracy", ascending=False).set_index("rule")

In [32]:
# show the accuracy ranking, most accurate rule first
scores_df

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all),0.688359
trade_size(ex)->quote(best)->quote(ex),0.683255
trade_size(ex)->rev_lr(best),0.681468
quote(best)->quote(ex),0.586736
rev_lr(best)->rev_lr(ex),0.586463
rev_lr(best),0.586426
lr(best)->lr(ex),0.585902
lr(best),0.585895
quote(best),0.585451
quote(ex)->quote(best),0.578577


In [20]:
# stem shared by the file name, the label and both caption placeholders
table_id = f"hyperparam-classical-{key}"

# export the ranking as a LaTeX table for the report
scores_df.style.to_latex(
    f"../reports/content/{table_id}.tex",
    label=f"tab:{table_id}",
    caption=(f"long-{table_id}", f"short-{table_id}"),
    position_float="centering",
    siunitx=True,
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
)