In [1]:
import os
from pathlib import Path

import pandas as pd
import wandb
from catboost import CatBoostClassifier

from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global run configuration; the later cells derive their inputs from these.
exchange = "cboe"  # one of: "cboe", "ise"
strategy = "supervised"  # one of: "supervised", "transfer"
subset = "test"  # one of: "test", "all"

# Model artifacts per (exchange, strategy) as (feature set, artifact) pairs.
# Keeping every known combination in one lookup ties the model choice to the
# settings above, so switching the exchange cannot silently score stale models.
models_by_config = {
    ("ise", "supervised"): [
        ("classical", "17malsep_CatBoostClassifier_default.cbm:v7"),
        ("classical-size", "3laathab_CatBoostClassifier_default.cbm:v7"),
        ("ml", "2a9iqsn0_CatBoostClassifier_default.cbm:v4"),
    ],
    ("cboe", "supervised"): [
        ("classical", "30sl6vqf_CatBoostClassifier_default.cbm:v5"),
        ("classical-size", "2w28suql_CatBoostClassifier_default.cbm:v7"),
        ("ml", "2qzvvdbw_CatBoostClassifier_default.cbm:v7"),
    ],
}

if (exchange, strategy) not in models_by_config:
    raise ValueError(
        f"No models registered for exchange={exchange!r}, strategy={strategy!r}; "
        "add an entry to `models_by_config`."
    )

models = models_by_config[(exchange, strategy)]

In [3]:
# Identifier reused for the result file name and the logged result artifact.
key = "_".join([exchange, "gbm", strategy, subset])
# Fully qualified name of the data artifact to download.
dataset = "fbv/thesis/" + exchange + "_" + strategy + "_log_standardized:latest"

In [4]:
# Option-specific columns; only the "ml" feature set includes them.
features_option = [
    "STRK_PRC",
    "ttm",
    "option_type",
    "issue_type",
    "root",
    "myn",
    "day_vol",
]

# Categorical columns paired with an integer, presumably the cardinality of
# each column (TODO confirm against the notebook linked below).
# https://github.com/KarelZe/thesis/blob/main/notebooks/3.0a-mb-explanatory_data_analysis.ipynb
features_categorical = [
    ("root", 8667),
    ("option_type", 2),
    ("issue_type", 6),
]

# Price- and quote-based columns shared by every feature set.
features_classical = [
    "TRADE_PRICE",
    "bid_ex",
    "ask_ex",
    "BEST_ASK",
    "BEST_BID",
    "price_ex_lag",
    "price_ex_lead",
    "price_all_lag",
    "price_all_lead",
    "chg_ex_lead",
    "chg_ex_lag",
    "chg_all_lead",
    "chg_all_lag",
    "prox_ex",
    "prox_best",
]

# Size-related columns.
features_size = [
    "bid_ask_size_ratio_ex",
    "rel_bid_size_ex",
    "rel_ask_size_ex",
    "TRADE_SIZE",
    "bid_size_ex",
    "ask_size_ex",
    "depth_ex",
]

# Feature sets nest: classical < classical + size < classical + size + option.
# ``+`` builds a new list each time, so the base lists stay untouched.
features_classical_size = features_classical + features_size

features_ml = features_classical_size + features_option


In [5]:
# Export the Google Cloud project id for this process; presumably read by the
# GCS client when results are written to the bucket (TODO confirm).
os.environ.update(GCLOUD_PROJECT="flowing-mantis-239216")


In [6]:
# Reference run: https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
# Open a W&B run so the data set and the models are recorded as its inputs.
run = wandb.init(entity="fbv", project="thesis")

# Declare the data artifact as a run input and download its files;
# `data_dir` is the local directory the parquet files are read from.
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact cboe_supervised_log_standardized:latest, 4205.81MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:7.6


In [7]:
# Load the requested subset of the downloaded data set into `data`.
if subset == "all":
    # Read the three splits and stack them in train -> val -> test order.
    splits = [
        pd.read_parquet(Path(data_dir, f"{split}_set.parquet"), engine="fastparquet")
        for split in ("train", "val", "test")
    ]
    data = pd.concat(splits)
    # Drop the per-split frames early; only the concatenated copy is needed.
    del splits

elif subset == "test":
    data = pd.read_parquet(Path(data_dir, "test_set.parquet"), engine="fastparquet")

else:
    # Without this branch an unsupported value only surfaces further down as a
    # confusing NameError on `data`.
    raise ValueError(f"Unknown subset {subset!r}; expected 'all' or 'test'.")

# Split the label column from the features.
y_test = data["buy_sell"]
X_test = data.drop(columns="buy_sell")


## CatBoost Baseline 🐈‍⬛

In [8]:
# Collects one prediction Series per model; reset here so the cell can be re-run.
results = []

# Columns each model variant is scored on, keyed by the feature-set label
# used in `models`.
feature_map = {
    "classical": features_classical,
    "classical-size": features_classical_size,
    "ml": features_ml,
}

for feature_str, artifact_name in tqdm(models):

    # Resolve the feature set first so an unknown label fails right away with
    # a KeyError, before anything is downloaded. `.get()` would return `None`
    # here and only break later inside `.loc`.
    fs = feature_map[feature_str]

    # Strip an optional "entity/project/" prefix and the ":version" suffix.
    # Assumes the artifact contains a file named like the artifact itself.
    model_name = artifact_name.split("/")[-1].split(":")[0]

    artifact = run.use_artifact(artifact_name)
    model_dir = artifact.download()

    # The loop variable has its own name, so `model` only ever refers to the
    # loaded classifier, never to the artifact string.
    model = CatBoostClassifier()
    model.load_model(fname=Path(model_dir, model_name))

    # Keep the index of `X_test` so the predictions of all models line up.
    result = pd.Series(
        data=model.predict(X_test.loc[:, fs]),
        index=X_test.index,
        name=f"gbm({feature_str})",
    )
    results.append(result)


  0%|          | 0/3 [00:00<?, ?it/s][34m[1mwandb[0m:   2 of 2 files downloaded.  
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
 33%|███▎      | 1/3 [00:49<01:39, 49.70s/it][34m[1mwandb[0m:   2 of 2 files downloaded.  
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
[34m[1mwandb[0m:   2 of 2 files downloaded.  
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
100%|██████████| 3/3 [02:40<00:00, 53.55s/it]


In [9]:
# Combine the per-model prediction Series into one frame, one column per model.
# The frame gets its own name instead of rebinding `results`: the list stays a
# list, and re-running this cell no longer fails in `pd.concat`.
results_df = pd.concat(results, axis=1)
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet"
)
results_df.to_parquet(output_path)

# Log the artifact to save it as an output of this run. Only a reference to
# the file in the bucket is added, under the name "results".
result_set = wandb.Artifact(name=key, type="results")
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

wandb.finish()


