In [1]:
import json
import os
import pickle
import warnings
from pathlib import Path

import gcsfs
import google.auth
import numpy as np
import optuna
import pandas as pd
import wandb
from catboost import CatBoostClassifier
from tqdm.notebook import tqdm


In [20]:
# Run configuration -- edit the values in this cell, then re-run everything below.
exchange = "ise"
strategy = "supervised"
subset = "test"  # the data-loading cell handles "all" and "test"
features = "classical"  # not referenced by any other cell shown in this notebook

# W&B model artefact; the part before the first "_" is used as the study id.
model = "xl3n4thc_CatBoostClassifier_default.cbm:v9"

# One (feature-set label, W&B model artefact) pair per model to evaluate.
# Labels must be keys of `feature_map` in the prediction cell:
# "classical", "classical-size" or "ml".
# NOTE(review): all three entries are currently identical. The results cell
# builds a dict keyed by label, so repeated labels end up as a single column.
# Confirm the intended label/artefact for each row.
models = [
    ("classical-size", "xl3n4thc_CatBoostClassifier_default.cbm:v9"),
    ("classical-size", "xl3n4thc_CatBoostClassifier_default.cbm:v9"),
    ("classical-size", "xl3n4thc_CatBoostClassifier_default.cbm:v9"),
]


In [9]:
# Identifiers derived from the run configuration.
# `dataset`: W&B artefact path of the dataset downloaded further below.
dataset = f"fbv/thesis/{exchange}_{strategy}_log_standardized:latest"
# `key`: name used for the results file and the results artefact.
key = f"{exchange}_gbm_{strategy}_{subset}"
# `study_id`: everything in the model artefact name before the first underscore.
study_id = model.partition("_")[0]

In [10]:
# URL scheme for Google Cloud Storage paths.
fs_prefix = "gs://"
# File-system handle for Google Cloud Storage.
# NOTE(review): the project is "thesis" here, while a later cell sets
# GCLOUD_PROJECT to a different id -- confirm which one is intended.
fs = gcsfs.GCSFileSystem(project="thesis")


In [11]:
# Column names grouped into feature sets. The composite sets at the bottom are
# built by concatenation, so order is: classical, size, date, option.

# sine/cosine encoded date and time columns
features_date = [
    "date_month_sin", "date_month_cos",
    "date_time_sin", "date_time_cos",
    "date_weekday_sin", "date_weekday_cos",
    "date_day_sin", "date_day_cos",
]

# option-level columns
features_option = [
    "STRK_PRC", "ttm",
    "bin_option_type", "bin_issue_type", "bin_root",
    "myn", "day_vol",
]

# Categorical columns paired with an integer.
# NOTE(review): presumably the number of categories per column -- confirm in
# https://github.com/KarelZe/thesis/blob/main/notebooks/
# 3.0a-mb-explanatory_data_analysis.ipynb
features_categorical = [
    ("bin_root", 8667),
    ("bin_option_type", 2),
    ("bin_issue_type", 6),
]

# price and quote based columns
features_classical = [
    "TRADE_PRICE",
    "bid_ex", "ask_ex",
    "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead",
    "price_all_lag", "price_all_lead",
    "chg_ex_lead", "chg_ex_lag",
    "chg_all_lead", "chg_all_lag",
    "prox_ex", "prox_best",
]

# trade and quote size based columns
features_size = [
    "bid_ask_size_ratio_ex",
    "rel_bid_size_ex", "rel_ask_size_ex",
    "TRADE_SIZE",
    "bid_size_ex", "ask_size_ex",
    "depth_ex",
]

# composite feature sets
features_classical_size = features_classical + features_size
features_ml = features_classical_size + features_date + features_option

# columns that are not part of any feature set above
features_unused = [
    "price_rel_nbb", "price_rel_nbo",
    "date_year",
    "mid_ex", "mid_best",
    "spread_ex", "spread_best",
]


In [12]:
# Export the Google Cloud project id through the process environment.
os.environ.update(GCLOUD_PROJECT="flowing-mantis-239216")


In [13]:
# Start a W&B run in project "thesis" (entity "fbv"); the same `run` object is
# used by later cells to fetch model artefacts and to log the results artefact.
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
run = wandb.init(project="thesis", entity="fbv")

# Declare the dataset artefact as an input of this run and download it;
# `data_dir` is the local directory the parquet files are read from below.
artifact = run.use_artifact(dataset)
data_dir = artifact.download()



VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670021430278816, max=1.0…

[34m[1mwandb[0m: Downloading large artifact ise_supervised_log_standardized:latest, 3813.29MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:4.5


In [17]:
# Load the evaluation data selected by `subset` from the downloaded artefact.
if subset == "all":
    # train, validation and test set, concatenated in that order
    train = pd.read_parquet(
        Path(data_dir, "train_set_60.parquet"), engine="fastparquet"
    )
    val = pd.read_parquet(
        Path(data_dir, "val_set_20.parquet"), engine="fastparquet"
    )
    test = pd.read_parquet(
        Path(data_dir, "test_set_20.parquet"), engine="fastparquet"
    )
    data = pd.concat([train, val, test])
    # the individual splits are no longer needed once concatenated
    del train, val, test
elif subset == "test":
    data = pd.read_parquet(
        Path(data_dir, "test_set_20.parquet"), engine="fastparquet"
    )
else:
    # Without this guard `data` is undefined on a fresh kernel, or a stale
    # frame from an earlier run would be reused silently.
    raise ValueError(f"unsupported subset {subset!r}; expected 'all' or 'test'")

# separate the target column from the feature columns
y_test = data["buy_sell"]
X_test = data.drop(columns="buy_sell")

## CatBoost Baseline 🐈‍⬛

In [28]:
# Predict on X_test once per (feature-set label, model artefact) pair in
# `models`; `results` collects one prediction array per pair, in order.
results = []

# feature-set label -> columns handed to the model
feature_map = {
    "classical": features_classical,
    "classical-size": features_classical_size,
    "ml": features_ml,
}

# The loop variables deliberately avoid the names `model` and `fs`: those hold
# the configured artefact string and the GCS file system from earlier cells,
# and overwriting them breaks re-running those cells.
for feature_str, model_artifact in tqdm(models):

    # fail fast with a readable message before downloading anything
    if feature_str not in feature_map:
        raise KeyError(
            f"unknown feature set {feature_str!r}; "
            f"expected one of {sorted(feature_map)}"
        )
    feature_cols = feature_map[feature_str]

    # file name inside the artefact: drop any leading path and the ":<version>" suffix
    model_name = model_artifact.split("/")[-1].split(":")[0]

    artifact = run.use_artifact(model_artifact)
    model_dir = artifact.download()

    clf = CatBoostClassifier()
    clf.load_model(fname=Path(model_dir, model_name))

    result = clf.predict(X_test.loc[:, feature_cols])
    results.append(result)

  0%|          | 0/3 [00:00<?, ?it/s]

[34m[1mwandb[0m:   2 of 2 files downloaded.  
[34m[1mwandb[0m:   2 of 2 files downloaded.  
[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [29]:
# Assemble one prediction column per feature-set label, aligned to X_test.
labels = [label for label, _ in models]
duplicate_labels = sorted({label for label in labels if labels.count(label) > 1})
if duplicate_labels:
    # dict() keeps only the last value per key, so repeated labels silently
    # drop earlier predictions; make that visible instead of hiding it.
    warnings.warn(
        f"duplicate feature-set labels in `models`: {duplicate_labels}; "
        "only the last prediction per label is kept"
    )

# NOTE: this rebinds `results` from a list of arrays to a DataFrame, so the
# prediction cell has to be re-run before this cell can be run again.
results = pd.DataFrame(dict(zip(labels, results)), index=X_test.index)

# write the predictions to the results bucket
output_path = f"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet"
results.to_parquet(output_path)

# Log the artifact to save it as an output of this run; the artefact only
# references the parquet file at `output_path`.
result_set = wandb.Artifact(name=key, type="results")
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

wandb.finish()

