In [None]:
import json
import os
import pickle
from pathlib import Path

import gcsfs
import numpy as np
import pandas as pd
import wandb

from tqdm.notebook import tqdm

# `sys` was used without being imported, which raises NameError on a fresh
# kernel (Restart & Run All). Import it explicitly before touching `sys.path`.
import sys

# Add the parent directory to the module search path before importing from
# the `otc` package (presumably located one level above this notebook).
sys.path.append("..")
from otc.models.classical_classifier import ClassicalClassifier

In [None]:
# seed handed as `random_state` to every ClassicalClassifier created below
seed = 42

In [None]:
# Start the W&B run; it is used to fetch the input data and, at the end of
# the notebook, to log the results artifact.
run = wandb.init(project="thesis", entity="fbv")

# load unscaled data
# The artifact identifier gets its own name so that `dataset` is not bound to
# a string first and to a `wandb.Artifact` a few lines later.
dataset_name = "fbv/thesis/train_val_test_ultra:v0"
artifact = run.use_artifact(dataset_name)
data_dir = artifact.download()

# create artefact for results
# NOTE(review): the artifact is named "results_classical_cf" while the
# reference added later uses "results_classical_clf_ise" -- confirm whether
# "_cf" is intended before renaming anything on the W&B side.
dataset = wandb.Artifact(name="results_classical_cf", type="results")

In [None]:
# Feature definitions, see:
# https://github.com/KarelZe/thesis/blob/main/notebooks/3.0a-mb-explanatory_data_analysis.ipynb

# (column name, integer) pairs -- presumably the number of categories per
# column; TODO confirm against the linked notebook.
features_categorical = [("bin_root", 8667), ("bin_option_type", 2), ("bin_issue_type", 6)]

# price / quote based columns
features_classical = (
    ["TRADE_PRICE", "bid_ex", "ask_ex", "BEST_ASK", "BEST_BID"]
    + ["price_ex_lag", "price_ex_lead", "price_all_lag", "price_all_lead"]
    + ["chg_ex_lead", "chg_ex_lag", "chg_all_lead", "chg_all_lag"]
    + ["prox_ex", "prox_best"]
)

# size based columns
features_size = (
    ["bid_ask_size_ratio_ex", "rel_bid_size_ex", "rel_ask_size_ex"]
    + ["TRADE_SIZE", "bid_size_ex", "ask_size_ex", "depth_ex"]
)

# all columns read from the test set; "buy_sell" comes last
features_classical_size = features_classical + features_size + ["buy_sell"]

In [None]:
# Read only the columns listed in `features_classical_size` from the test set
# inside the downloaded artifact directory.
X_test = pd.read_parquet(
    Path(data_dir, "test_set_20.parquet"),
    engine="fastparquet",
    columns=features_classical_size,
)

# Labels; "buy_sell" deliberately stays in `X_test` as well, because the
# evaluation loop further down reads `X_test["buy_sell"]`.
y_test = X_test["buy_sell"]

# Re-select the columns in the order given by `features_classical_size`.
X_test = X_test[features_classical_size]

In [None]:
# preview the first rows of the loaded test set
X_test.head()

In [20]:
# Layer stacks to evaluate. Every entry is a list of (rule, subset) tuples
# that is later passed as `layers=` to `ClassicalClassifier`.
rules = [
    # tick and quote rules
    [("tick", "all")],
    [("tick", "ex")],
    [("quote", "best")],
    [("quote", "ex")],
    # lr / emo / clnv and their `rev_` counterparts, each on "ex" and "best"
    *(
        [(algorithm, subset)]
        for algorithm in ("lr", "rev_lr", "emo", "rev_emo", "clnv", "rev_clnv")
        for subset in ("ex", "best")
    ),
    # classical + trade size
    [("trade_size", "ex"), ("tick", "all")],
    [("trade_size", "ex"), ("quote", "best")],
    [("trade_size", "ex"), ("quote", "best"), ("quote", "ex")],
    # murjajev
    [("quote", "best"), ("quote", "ex")],
    # p. 13 grauer
    [("trade_size", "ex"), ("depth", "ex"), ("quote", "best"), ("rev_lr", "ex")],
]

# One label per stack: "<rule>_<subset>" for every layer, joined by "->".
names = [
    "->".join(f"{algorithm}_{subset}" for algorithm, subset in stack)
    for stack in rules
]


In [21]:
# display the generated label of every rule stack
names

['tick_all',
 'tick_ex',
 'quote_best',
 'quote_ex',
 'lr_ex',
 'lr_best',
 'rev_lr_ex',
 'rev_lr_best',
 'emo_ex',
 'emo_best',
 'rev_emo_ex',
 'rev_emo_best',
 'clnv_ex',
 'clnv_best',
 'rev_clnv_ex',
 'rev_clnv_best',
 'trade_size_ex->tick_all',
 'trade_size_ex->quote_best',
 'trade_size_ex->quote_best->quote_ex',
 'quote_best->quote_ex',
 'trade_size_ex->depth_ex->quote_best->rev_lr_ex']

In [22]:
# display the rule stacks that will be evaluated
rules

[[('tick', 'all')],
 [('tick', 'ex')],
 [('quote', 'best')],
 [('quote', 'ex')],
 [('lr', 'ex')],
 [('lr', 'best')],
 [('rev_lr', 'ex')],
 [('rev_lr', 'best')],
 [('emo', 'ex')],
 [('emo', 'best')],
 [('rev_emo', 'ex')],
 [('rev_emo', 'best')],
 [('clnv', 'ex')],
 [('clnv', 'best')],
 [('rev_clnv', 'ex')],
 [('rev_clnv', 'best')],
 [('trade_size', 'ex'), ('tick', 'all')],
 [('trade_size', 'ex'), ('quote', 'best')],
 [('trade_size', 'ex'), ('quote', 'best'), ('quote', 'ex')],
 [('quote', 'best'), ('quote', 'ex')],
 [('trade_size', 'ex'), ('depth', 'ex'), ('quote', 'best'), ('rev_lr', 'ex')]]

In [None]:
# Evaluate every rule stack on the full test set and collect the predictions
# in the same order as `rules` / `names`.

# fit is only used to set sklearn attributes, no leakage
# The tiny fit sample is selected by position (`iloc`) rather than by label
# (`loc[0:1]`), so it no longer depends on the index containing the labels
# 0 and 1. It is loop-invariant and therefore built once up front.
X_fit = X_test.iloc[:2]
y_fit = X_test["buy_sell"].iloc[:2]

results = []

for rule in tqdm(rules):
    clf = ClassicalClassifier(
        layers=rule,
        random_state=seed,
    )
    clf.fit(X=X_fit, y=y_fit)
    result = clf.predict(X_test)
    results.append(result)


In [None]:
# One column per rule stack (labelled by its generated name), rows aligned
# with the index of the test set.
results_classical_clf = pd.DataFrame(
    {label: predictions for label, predictions in zip(names, results)},
    index=X_test.index,
)

# Destination of the parquet file holding all predictions.
output_path = (
    "gs://thesis-bucket-option-trade-classification"
    "/data/results/classical_clf_ise.parquet"
)
results_classical_clf.to_parquet(output_path)

In [None]:
# Log the artifact to save it as an output of this run
# `dataset` is the results artifact created right after `wandb.init`; the
# written parquet file is registered in it via its `output_path` URI.
dataset.add_reference(output_path, name="results_classical_clf_ise")
run.log_artifact(dataset)

# end the W&B run that was started with `wandb.init`
wandb.finish()