In [None]:
import os
import sys
import pickle
from pathlib import Path

import pandas as pd
import wandb

from tqdm.auto import tqdm

sys.path.append("..")
from otc.features.build_features import (
    features_categorical,
    features_classical,
    features_classical_size,
    features_ml,
)


In [None]:
# Run-wide configuration. Change values here only; the cells below read them.
EXCHANGE = "ise"  # or "cboe"
STRATEGY = "supervised"  # or "transfer"
SUBSET = "test"  # or "all"

# Models to evaluate, as (feature-set label, wandb model artefact) pairs.
# Active selection: ise-trained models, supervised/semisupervised.
models = [("classical-size", "250k9zv2_TransformerClassifier_default.pkl:v7")]

# Alternative selection: cboe-trained models, supervised.
# models = [
#     ("classical", "30sl6vqf_CatBoostClassifier_default.cbm:v5"),
#     ("classical-size", "2w28suql_CatBoostClassifier_default.cbm:v7"),
#     ("ml", "2qzvvdbw_CatBoostClassifier_default.cbm:v7"),
# ]


In [None]:
# `key` names both the result file and the logged result artefact;
# `dataset` is the wandb reference of the preprocessed input data.
key = "{}_fttransformer_{}_{}".format(EXCHANGE, STRATEGY, SUBSET)
dataset = "fbv/thesis/{}_{}_log_standardized_clipped:latest".format(
    EXCHANGE, STRATEGY
)


In [None]:
# Google Cloud project id; must be exported before files and artefacts
# are accessed.
GCLOUD_PROJECT_ID = "flowing-mantis-239216"
os.environ["GCLOUD_PROJECT"] = GCLOUD_PROJECT_ID


In [None]:
# Start a wandb run in the "thesis" project of entity "fbv". Reference run:
# https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
run = wandb.init(entity="fbv", project="thesis")

# Resolve the dataset artefact through the run and download it; `data_dir`
# is the directory the parquet files are read from further down.
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


In [None]:
# Load the evaluation data selected by SUBSET from the downloaded artefact.
if SUBSET == "all":
    # evaluate on train, validation and test set stacked in that order
    train = pd.read_parquet(Path(data_dir, "train_set.parquet"), engine="fastparquet")
    val = pd.read_parquet(Path(data_dir, "val_set.parquet"), engine="fastparquet")
    test = pd.read_parquet(Path(data_dir, "test_set.parquet"), engine="fastparquet")
    data = pd.concat([train, val, test])

elif SUBSET == "test":
    data = pd.read_parquet(Path(data_dir, "test_set.parquet"), engine="fastparquet")

else:
    # Without this branch a typo in SUBSET surfaces as a NameError on `data`
    # below, which hides the actual misconfiguration.
    raise ValueError(f"Unsupported SUBSET {SUBSET!r}; expected 'all' or 'test'.")

# separate the label column from the feature columns
y_test = data["buy_sell"]
X_test = data.drop(columns="buy_sell")


## FT-Transformer

In [None]:
# One prediction Series per evaluated model; merged column-wise afterwards.
results = []

# Feature-set label -> list of feature names the model is fed with.
# The "semi-" labels reuse the feature lists of their supervised counterparts.
FEATURE_MAP = {
    "classical": features_classical,
    "classical-size": features_classical_size,
    "ml": features_ml,
    "semi-classical": features_classical,
    "semi-classical-size": features_classical_size,
    "semi-ml": features_ml,
}

for feature_str, model_ref in tqdm(models):

    # Fail fast on a mistyped label, before anything is downloaded.
    if feature_str not in FEATURE_MAP:
        raise ValueError(
            f"Unknown feature set {feature_str!r}; "
            f"expected one of {sorted(FEATURE_MAP)}."
        )
    fs = FEATURE_MAP[feature_str]

    # file name inside the artefact: drop the registry path and version alias
    model_name = model_ref.split("/")[-1].split(":")[0]

    artifact = run.use_artifact(model_ref)
    model_dir = artifact.download()

    # SECURITY: pickle.load executes arbitrary code contained in the file.
    # Only load artefacts from a registry you trust.
    with open(Path(model_dir, model_name), "rb") as f:
        # `model` stays bound to the last loaded estimator after the loop.
        model = pickle.load(f)

    # predict on the columns of the selected feature set only
    result = pd.Series(
        data=model.predict(X_test.loc[:, fs]),
        index=X_test.index,
        name=f"fttransformer({feature_str})",
    )
    results.append(result)

In [None]:
# Merge the per-model prediction Series into one frame, one column per model.
# NOTE(review): `results` is rebound from a list to a DataFrame here, so this
# cell cannot be re-run without re-running the prediction loop first.
results = pd.concat(results, axis=1)
# Results are written to the thesis bucket under a file named after `key`.
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet"
)
results.to_parquet(output_path)

# Log the artifact to save it as an output of this run
result_set = wandb.Artifact(name=key, type="results")
# Register the written parquet path in the artefact under the name "results".
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

# Close the wandb run; `run` is not used in any later cell.
wandb.finish()


In [None]:
# TODO: verify there are no errors when piecing everything together.
# Sanity check: show the first five rows of the merged prediction frame.
results.head(n=5)

In [None]:
# Debug output: private `_stats_step` attribute of `model`, which at this
# point is the last estimator unpickled in the prediction loop.
# NOTE(review): presumably a training-step counter — confirm against the
# classifier implementation; raises NameError if `models` was empty.
print(model._stats_step)