In [None]:
import os
import sys
from pathlib import Path

import pandas as pd
import wandb
from tqdm.auto import tqdm

sys.path.append("..")
from otc.features.build_features import features_classical_size
from otc.models.classical_classifier import ClassicalClassifier


In [None]:
# Global settings shared by all cells below.
seed = 42

# Experiment selection. Alternatives that appear in this notebook:
#   exchange: "ise" or "cboe"
#   subset:   "test", "val" or "all"
#   strategy: "supervised" or "transfer"
exchange, models = "ise", "classical"
subset, strategy = "test", "supervised"


In [None]:
# W&B artifact with the data for the chosen exchange / strategy
dataset = f"fbv/thesis/{exchange}_{strategy}_none:latest"

# identifier reused for the result file name and the logged artefact
key = f"{exchange}_{models}_{strategy}_{subset}"


In [None]:
# set the Google Cloud project id in the environment before any cloud access
# (presumably picked up by the GCS client used when results are written — TODO confirm)
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"
# start a W&B run in project "thesis" of entity "fbv"; `run` is reused by later cells
run = wandb.init(project="thesis", entity="fbv")

# load unscaled data
# declare the dataset artifact as an input of this run and fetch its files;
# `data_dir` is the value returned by `download()` and is used below as the
# directory that contains the parquet files
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


In [None]:
# columns to read from parquet: the classical-size features plus the "buy_sell" label
columns = list(features_classical_size) + ["buy_sell"]


In [None]:
# Parquet files that make up each supported subset. For "all" the order
# train -> val -> test is the order in which the frames are concatenated.
subset_files = {
    "all": ["train_set.parquet", "val_set.parquet", "test_set.parquet"],
    "val": ["val_set.parquet"],
    "test": ["test_set.parquet"],
}

# Fail early with a clear message; without this check an unsupported value
# would leave `data` undefined and surface as a NameError further down.
if subset not in subset_files:
    raise ValueError(
        f"Unknown subset {subset!r}; expected one of {sorted(subset_files)}."
    )

# read only the required columns of every file belonging to the subset
frames = [
    pd.read_parquet(Path(data_dir, file_name), engine="fastparquet", columns=columns)
    for file_name in subset_files[subset]
]
# a single file needs no concatenation (avoids copying a large frame)
data = frames[0] if len(frames) == 1 else pd.concat(frames)
del frames

# labels of the evaluation set
y_test = data["buy_sell"].astype("int8")

# update dtype Int64 - Float32
data["TRADE_SIZE"] = data["TRADE_SIZE"].astype("float32")
# features of the evaluation set: everything except the label
X_test = data.drop(columns="buy_sell")

# the combined frame is no longer needed once features and labels are split
del data


In [None]:
# classical rules that are applied on their own, as (rule, subset) pairs
single_layer_rules = [
    ("tick", "ex"),
    ("rev_tick", "ex"),
    ("tick", "all"),
    ("rev_tick", "all"),
    ("quote", "ex"),
    ("quote", "best"),
    ("lr", "ex"),
    ("rev_lr", "ex"),
    ("emo", "ex"),
    ("rev_emo", "ex"),
    ("clnv", "ex"),
    ("rev_clnv", "ex"),
    ("lr", "best"),
    ("rev_lr", "best"),
    ("emo", "best"),
    ("rev_emo", "best"),
    ("clnv", "best"),
    ("rev_clnv", "best"),
]

# grauer (benchmark 1)
grauer_benchmark_1 = [("quote", "best"), ("quote", "ex"), ("rev_tick", "all")]

# grauer (benchmark 2)
grauer_benchmark_2 = [
    ("trade_size", "ex"),
    ("quote", "best"),
    ("quote", "ex"),
    ("depth", "best"),
    ("depth", "ex"),
    ("rev_tick", "all"),
]

# every entry of `rules` is a list of layers: the single rules become
# one-element lists, followed by the two stacked benchmarks
rules = [[layer] for layer in single_layer_rules] + [
    grauer_benchmark_1,
    grauer_benchmark_2,
]

# one readable label per rule stack, e.g. "quote(best)->quote(ex)->rev_tick(all)"
names = [
    "->".join("%s(%s)" % layer for layer in stack)
    for stack in tqdm(rules)
]


In [None]:
def _classify(layers):
    """Run one stack of classical rules on the evaluation set and return integer predictions."""
    classifier = ClassicalClassifier(layers=layers, random_state=seed, strategy="none")
    # fit is only used to set sklearn attributes, no leakage
    classifier.fit(X=X_test.head(5), y=y_test.head(5))
    return classifier.predict(X_test).astype(int)


# predictions for every rule stack, in the same order as `rules`
results = [_classify(rule) for rule in tqdm(rules)]


In [None]:
# one column per rule stack, labelled by its name and aligned to the feature index
results = pd.DataFrame(
    {name: predictions for name, predictions in zip(names, results)},
    index=X_test.index,
)


In [None]:
# destination of the prediction table inside the thesis bucket; the file is named after `key`
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet"
)
# write straight to the gs:// URI
# (presumably needs a GCS-capable filesystem backend such as gcsfs — TODO confirm)
results.to_parquet(output_path)


In [None]:
# Log the artifact to save it as an output of this run
# the artifact is named after `key` and has type "results"
result_set = wandb.Artifact(name=key, type="results")
# attach the written parquet file by its URI under the entry name "results"
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

# close the W&B run started earlier in the notebook
wandb.finish()
