In [None]:
import os
import sys
from pathlib import Path

import pandas as pd
import wandb

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay

sys.path.append("..")
from otc.models.classical_classifier import ClassicalClassifier


In [None]:
# global run configuration; every later cell reads these values
seed = 42

# data selection
exchange = "ise"
subset = "all"  # alternative: "test"

# model family and training strategy
models = "classical"
strategy = "supervised"  # alternative: "transfer"


In [None]:
# identifier shared by the result file and the logged artefact
key = "_".join((exchange, models, strategy, subset))

# W&B artefact that holds the raw (unscaled) data of the chosen exchange
dataset = f"fbv/thesis/{exchange}_{strategy}_raw:latest"


In [None]:
os.environ.update(GCLOUD_PROJECT="flowing-mantis-239216")
run = wandb.init(entity="fbv", project="thesis")

# declare the unscaled data set as an input of this run and download it
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


In [None]:
# Feature definitions follow
# https://github.com/KarelZe/thesis/blob/main/notebooks/
# 3.0a-mb-explanatory_data_analysis.ipynb

# price and quote columns
features_classical = [
    "TRADE_PRICE",
    "bid_ex",
    "ask_ex",
    "BEST_ASK",
    "BEST_BID",
    "price_ex_lag",
    "price_ex_lead",
    "price_all_lag",
    "price_all_lead",
]

# trade size and quoted size columns
features_size = ["TRADE_SIZE", "bid_size_ex", "ask_size_ex"]

# columns read from disk: all features plus the label column
columns = features_classical + features_size + ["buy_sell"]


In [None]:
# parquet splits that make up each supported value of `subset`
splits_by_subset = {
    "all": ("train_set", "val_set", "test_set"),
    "test": ("test_set",),
}

# fail early with a clear message instead of a NameError on `data` further down
if subset not in splits_by_subset:
    raise ValueError(
        f"Unknown subset {subset!r}; expected one of {sorted(splits_by_subset)}."
    )

# read only the required columns of every split and stack them row-wise
data = pd.concat(
    [
        pd.read_parquet(
            Path(data_dir, split), engine="fastparquet", columns=columns
        )
        for split in splits_by_subset[subset]
    ]
)

# separate the label from the features
y_test = data["buy_sell"].astype("int8")
X_test = data.drop(columns="buy_sell")

# the combined frame is no longer needed once it has been split
del data


In [None]:
# rules
# every entry of `rules` is a stack of (rule, quote source) layers

# rules that are applied on their own
single_layers = [
    ("tick", "all"),
    ("tick", "ex"),
    ("quote", "best"),
    ("quote", "ex"),
]
# each of these rules is evaluated on exchange quotes first, then on best quotes
single_layers += [
    (rule_name, quote_source)
    for rule_name in ("lr", "rev_lr", "emo", "rev_emo", "clnv", "rev_clnv")
    for quote_source in ("ex", "best")
]

# stacked combinations of several layers
stacked_layers = [
    # classical + trade size
    [("trade_size", "ex"), ("tick", "all")],
    # classical + trade size
    [("trade_size", "ex"), ("quote", "best")],
    # classical + trade size
    [("trade_size", "ex"), ("quote", "best"), ("quote", "ex")],
    # murjajev
    [("quote", "best"), ("quote", "ex")],
    # p. 13 grauer
    [
        ("trade_size", "ex"),
        ("quote", "best"),
        ("depth", "best"),
        ("quote", "ex"),
        ("depth", "ex"),
        ("rev_tick", "all"),
    ],
]

rules = [[layer] for layer in single_layers] + stacked_layers

# one readable label per rule stack: layers rendered as "rule(source)"
# and chained with "->"
names = ["->".join("%s(%s)" % layer for layer in stack) for stack in tqdm(rules)]


In [None]:
# show the generated rule names as the cell output
names


In [None]:
# show the rule definitions as the cell output
rules


In [None]:
# one prediction vector per rule stack, in the same order as `rules`
results = []

for layers in tqdm(rules):
    classifier = ClassicalClassifier(layers=layers, random_state=seed)
    # fit is only used to set sklearn attributes, no leakage
    classifier.fit(X=X_test.head(5), y=y_test.head(5))
    results.append(classifier.predict(X_test))


In [None]:
# one column of predictions per rule name, aligned with the feature index
results = pd.DataFrame(
    {rule_name: predictions for rule_name, predictions in zip(names, results)},
    index=X_test.index,
)


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 9))

for name in tqdm(names):
    # scikit-learn expects the ground truth first and the predictions second;
    # both are passed positionally to stay independent of keyword renames
    RocCurveDisplay.from_predictions(
        y_test,
        results[name],
        pos_label=1,
        name=name,
        linewidth=1,
        ax=ax,
    )
# chance-level diagonal as a visual reference, drawn on the same axes
ax.plot([0, 1], [0, 1], "k--", label="random")
# rebuild the legend so that the diagonal added above is listed as well
ax.legend(loc="lower right")
plt.show()


In [None]:
# bucket location of the prediction frame, named after the run key
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet"
)
results.to_parquet(path=output_path)


In [None]:
# Register the stored predictions as an output artefact of this run.
# Only a reference to the file at `output_path` is attached.
results_artifact = wandb.Artifact(key, type="results")
results_artifact.add_reference(uri=output_path, name="results")
run.log_artifact(results_artifact)

# close the run
wandb.finish()
