In [1]:
import os
import sys
import pickle
from pathlib import Path

import pandas as pd
import wandb

from tqdm.auto import tqdm

sys.path.append("..")
from otc.features.build_features import (
    features_categorical,
    features_classical,
    features_classical_size,
    features_ml,
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run-wide settings; every later cell derives its names and paths from these.
EXCHANGE: str = "cboe"  # alternative: "ise"
STRATEGY: str = "transfer"  # alternative: "supervised"
SUBSET: str = "test"  # alternative: "all"

# Pairs of (feature set label, W&B model artefact name). Per the original note
# these are ISE-trained models (supervised/semi-supervised).
models: list = [
    ("classical", "2tcgk6lh_TransformerClassifier_default.pkl:latest"),
    ("classical-size", "2q55rbmr_TransformerClassifier_default.pkl:latest"),
    ("ml", "23bpisj6_TransformerClassifier_default.pkl:latest"),
]

In [3]:
# W&B artefact that holds the preprocessed data for the configured
# exchange and strategy.
dataset = f"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest"

# Identifier reused for the result file name and the logged result artefact.
key = f"{EXCHANGE}_fttransformer_{STRATEGY}_{SUBSET}"


In [4]:
# Google Cloud project id; must be exported before files and artefacts
# can be accessed.
GCLOUD_PROJECT = "flowing-mantis-239216"
os.environ["GCLOUD_PROJECT"] = GCLOUD_PROJECT


In [5]:
# Start a W&B run so that the artefacts consumed and produced by this notebook
# are tracked.
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
run = wandb.init(project="thesis", entity="fbv")

# Declare the dataset artefact as an input of the run and fetch it locally;
# `data_dir` is the local directory returned by the download.
dataset_artifact = run.use_artifact(dataset)
data_dir = dataset_artifact.download()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact cboe_transfer_log_standardized_clipped:latest, 1404.96MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:34.2


In [6]:
# Read the configured subset from the downloaded artefact directory.
if SUBSET == "all":
    # Stack train, validation and test set (in this order) into one frame.
    train = pd.read_parquet(Path(data_dir, "train_set.parquet"), engine="fastparquet")
    val = pd.read_parquet(Path(data_dir, "val_set.parquet"), engine="fastparquet")
    test = pd.read_parquet(Path(data_dir, "test_set.parquet"), engine="fastparquet")
    data = pd.concat([train, val, test])

elif SUBSET == "test":
    data = pd.read_parquet(Path(data_dir, "test_set.parquet"), engine="fastparquet")

else:
    # Without this guard an unsupported value leaves `data` undefined
    # (NameError below) or, in a live kernel, silently reuses a stale `data`
    # from an earlier execution of this cell.
    raise ValueError(f"Unsupported SUBSET {SUBSET!r}; expected 'all' or 'test'.")

# Separate the `buy_sell` column from the remaining columns, which are the
# model inputs.
y_test = data["buy_sell"]
X_test = data.drop(columns="buy_sell")


In [7]:
# Preview the first five rows of the feature frame.
X_test.head(n=5)

Unnamed: 0_level_0,TRADE_PRICE,bid_ask_size_ratio_ex,rel_bid_size_ex,rel_ask_size_ex,depth_ex,prox_ex,prox_best,spread_ex,spread_best,bid_ask_ratio_ex,...,ask_size_ex,day_vol,myn,STRK_PRC,mid_ex,mid_best,ttm,option_type,issue_type,root
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24700819,-0.194992,-1.132221,-0.546407,-0.601251,0.03157,0.027829,-2.592491,-0.490244,-0.00067,-3.523268,...,-2.475971,-0.737846,0.028751,-0.160605,-1.258131,-0.11052,-0.400391,0,0,1607
24700261,-0.771314,-0.856403,-0.188454,-0.434394,-0.994811,-1.215208,-1.134365,0.443898,0.000182,-1.012208,...,-0.629152,-0.102104,0.335588,-1.824558,-0.627286,-0.671108,-0.781149,0,1,431
24700842,-0.379865,-1.132221,-0.546407,-0.601251,0.03157,0.027829,-1.134363,-0.490244,-0.001367,-3.523268,...,-2.475971,-0.213252,0.292805,-0.467925,-1.258131,-0.3536,-0.781149,0,0,1541
24695039,-0.389286,-1.132221,-0.546407,-0.601251,0.03157,0.027829,1.174336,-0.490244,-0.00129,-3.523268,...,-2.475971,-1.347493,0.163824,0.413737,-1.258131,-0.412468,-0.781149,0,0,1388
24700841,-0.509259,-1.129773,3.749029,-0.316837,-0.994811,1.270865,1.174335,-0.303416,-0.001754,0.339901,...,0.849378,0.78072,0.248633,-0.508474,-0.514207,-0.519243,-0.781149,0,0,7821


## FT-Transformer

In [8]:
# One prediction series per entry in `models`. The list is re-created on every
# execution of this cell, so re-running does not append duplicates.
results = []

# Maps the feature set label used in `models` to the columns passed to the
# model. The "semi-*" labels point to the same column lists as their
# counterparts without the prefix.
FEATURE_MAP = {
    "classical": features_classical,
    "classical-size": features_classical_size,
    "ml": features_ml,
    "semi-classical": features_classical,
    "semi-classical-size": features_classical_size,
    "semi-ml": features_ml,
}

for feature_str, artifact_name in tqdm(models):
    # Index instead of `.get()`: an unknown label raises a KeyError right
    # here, before the artefact is downloaded, rather than passing `None` on.
    fs = FEATURE_MAP[feature_str]

    # Strip an optional "entity/project/" prefix and the ":version" suffix;
    # the remainder is the file name looked up inside the download directory.
    model_name = artifact_name.split("/")[-1].split(":")[0]

    model_artifact = run.use_artifact(artifact_name)
    model_dir = model_artifact.download()

    # NOTE(security): unpickling executes arbitrary code contained in the
    # file. Only load model artefacts from a trusted W&B project.
    with open(Path(model_dir, model_name), "rb") as f:
        model = pickle.load(f)

    # Predict on the columns of this feature set only; keep the index of
    # X_test so the series can be aligned column-wise later.
    result = pd.Series(
        data=model.predict(X_test.loc[:, fs]),
        index=X_test.index,
        name=f"fttransformer({feature_str})",
    )
    results.append(result)

  0%|          | 0/3 [00:00<?, ?it/s][34m[1mwandb[0m:   1 of 1 files downloaded.  
 33%|███▎      | 1/3 [02:12<04:25, 132.97s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
 67%|██████▋   | 2/3 [05:19<02:44, 164.27s/it][34m[1mwandb[0m:   1 of 1 files downloaded.  
100%|██████████| 3/3 [09:27<00:00, 189.11s/it]


In [9]:
# Place the per-feature-set prediction series side by side, one column each.
results = pd.concat(results, axis=1)

# Write the combined predictions to the results folder of the thesis bucket.
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet"
)
results.to_parquet(output_path)

# Register the written file as an output artefact of this run. The artefact
# stores a reference to the bucket path.
results_artifact = wandb.Artifact(name=key, type="results")
results_artifact.add_reference(output_path, name="results")
run.log_artifact(results_artifact)

# Close the W&B run.
wandb.finish()


