In [None]:
import gc
import os
import pickle
from pathlib import Path
from typing import Literal

import gcsfs
import google.auth
import numpy as np
import pandas as pd
import wandb
from catboost import CatBoostClassifier
from sklearn.exceptions import NotFittedError
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler,
)
from tqdm.auto import tqdm

In [None]:
# authenticate with Google Application Default Credentials; the second element
# of the returned tuple is not needed here
credentials, _project_id = google.auth.default()
fs = gcsfs.GCSFileSystem(
    token=credentials,
    project="thesis",
)

In [None]:
# connect to weights and biases
run = wandb.init(
    entity="fbv",
    project="thesis",
    job_type="dataset-creation",
)

In [None]:
# global configuration -- set here once, later cells read these names
seed: int = 42

# exchange the raw data set belongs to: "ise" or "cboe"
exchange: str = "ise"
# data set flavour: "supervised", "unsupervised" or "transfer"
strategy: str = "supervised"
# preprocessing applied by `transform`: "none" or "log_standardized"
mode: str = "none"

In [None]:
dataset = f"fbv/thesis/{exchange}_{strategy}_raw:latest"

os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

# Reuse the run opened in the "connect to weights and biases" cell. Calling
# `wandb.init` a second time here would replace or duplicate that run and lose
# its `job_type="dataset-creation"` tag.

# load unscaled data
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

In [None]:
# reduce number of imported cols due to memory issues
# fmt: off
columns = [
    # timestamp, underlying and contract description
    "QUOTE_DATETIME", "ROOT", "EXPIRATION", "STRK_PRC", "OPTION_TYPE", "issue_type",
    # trade
    "TRADE_SIZE", "TRADE_PRICE",
    # quotes (BEST_* and *_ex) and quoted sizes
    "BEST_BID", "BEST_ASK", "ask_ex", "bid_ex", "bid_size_ex", "ask_size_ex",
    # lead / lag prices
    "price_all_lead", "price_all_lag", "price_ex_lead", "price_ex_lag",
    # remaining columns
    "buy_sell", "day_vol", "myn",
]
# fmt: on

In [None]:
def _read_split(split: str) -> pd.DataFrame:
    """Read one raw split from the downloaded artifact, restricted to `columns`.

    Args:
        split (str): directory name of the split inside `data_dir`,
            e.g. "train_set".

    Returns:
        pd.DataFrame: the split with only the columns listed in `columns`.
    """
    return pd.read_parquet(Path(data_dir, split), engine="fastparquet", columns=columns)


if strategy == "supervised":
    train = _read_split("train_set")
    val = _read_split("val_set")
    test = _read_split("test_set")

elif strategy == "unsupervised":
    # load unlabelled training set
    train = _read_split("train_set")

elif strategy == "transfer":
    # load test set
    test = _read_split("test_set")

else:
    # fail here instead of with a NameError on `train` / `test` in a later cell
    raise ValueError(
        f"unknown strategy {strategy!r}; "
        "expected 'supervised', 'unsupervised' or 'transfer'"
    )

In [None]:
# numerical columns examined by the Box-Cox cells below
# fmt: off
num_features = [
    "STRK_PRC", "TRADE_SIZE", "TRADE_PRICE",
    "BEST_BID", "BEST_ASK", "ask_ex", "bid_ex",
    "bid_size_ex", "ask_size_ex",
    "price_all_lead", "price_all_lag", "price_ex_lead", "price_ex_lag",
    "day_vol", "myn",
]
# fmt: on

## Box-Cox Test

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled -- the `%%script false` magic hands the body to
# the `false` command instead of the Python kernel, so nothing below runs.
# Smallest value per numerical column (relevant because Box-Cox needs
# strictly positive inputs).

train[num_features].min()


In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled -- the `%%script false` magic hands the body to
# the `false` command instead of the Python kernel. Remove the magic to run it.

# local import: `PowerTransformer` is not part of the notebook's import cell,
# so re-enabling this cell would otherwise fail with a NameError
from sklearn.preprocessing import PowerTransformer

box_cox = PowerTransformer(method="box-cox")
# add constant as box cox works only on positive data
box_cox.fit(train[num_features] + 0.01)


In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled (see the `%%script false` magic above).
# Fitted Box-Cox lambda per numerical feature, labelled with the feature name.

lambdas = pd.Series(data=box_cox.lambdas_, index=num_features)
lambdas


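Box-Cox requires strictly positive inputs, so a small positive constant is added to every feature before fitting; keep this constant as small as possible. All estimated $\lambda \approx 0$, which implies a $\log(\cdot)$ transform for the price, size, and quote features.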

In [None]:
# Scaler / encoder objects used by `transform` (read there as module-level names).
# - supervised: start from unfitted objects; `transform` fits them on the first
#   call (the training set) and re-uses them for the validation and test set.
# - unsupervised / transfer: use the objects fitted on the ise supervised dataset.
if strategy == "supervised":
    scaler = StandardScaler()

    # categories unseen during fitting are mapped to -1 instead of raising
    oe_kwargs = {
        "unknown_value": -1,
        "dtype": int,
        "handle_unknown": "use_encoded_value",
    }
    oe_option_type = OrdinalEncoder(**oe_kwargs)
    oe_root = OrdinalEncoder(**oe_kwargs)
    oe_issue_type = OrdinalEncoder(**oe_kwargs)
else:
    # TODO: Fix if I get unlabelled CBOE dataset
    artifact = run.use_artifact(f"fbv/thesis/ise_supervised_{mode}_scaler:latest")
    scaler_dir = artifact.download()

    # NOTE(review): unpickling executes arbitrary code. This is only acceptable
    # as long as the artifact is the one written by this notebook's
    # "save scaler to pickle" cell.
    # `with` closes the file handle, which the bare `open(...)` call leaked.
    with open(Path(scaler_dir, "scalers.sklearn"), "rb") as f:
        scalers = pickle.load(f)

    # set fitted scalers
    scaler = scalers["scaler"]
    oe_option_type = scalers["oe_option_type"]
    oe_root = scalers["oe_root"]
    oe_issue_type = scalers["oe_issue_type"]


def _months_to_expiry(data: pd.DataFrame) -> pd.Series:
    """Return the number of calendar months between quote and expiration.

    Both timestamps are truncated to monthly periods before subtracting, so
    only their year / month parts matter (quote in January, expiration in
    March of the same year -> 2).

    Args:
        data (pd.DataFrame): frame with datetime columns "EXPIRATION" and
            "QUOTE_DATETIME".

    Returns:
        pd.Series: month difference per row.
    """
    month_diff = data["EXPIRATION"].dt.to_period("M") - data[
        "QUOTE_DATETIME"
    ].dt.to_period("M")
    # the difference is a series of month offsets; `.n` is their integer multiple
    return month_diff.apply(lambda offset: offset.n)


def transform(
    data: pd.DataFrame,
    mode: Literal["log_standardized", "none"] = "log_standardized",
) -> pd.DataFrame:
    """Create features, impute, and scale.

    Derived size, proximity, spread and price-change features are always
    computed. What happens to the raw columns depends on `mode`:

    - `mode` contains "clip": the derived change / proximity / size features
      are clipped to [-3, 3].
    - `mode` contains "log": raw columns and mid prices are `log1p`
      transformed, infinities and missing values are set to 0, all numerical
      columns are standardized and the categorical columns are ordinal
      encoded. This uses the module-level `scaler`, `oe_option_type`,
      `oe_issue_type` and `oe_root`; if they are not fitted yet they are
      fitted on `data` (i.e. the first call must be made on the training set).
    - `mode == "none"`: raw columns are copied unchanged, infinities become
      NaN, missing values are kept, and the categorical columns are copied
      without encoding.

    For any other `mode` only the derived features and "buy_sell" are returned.

    Args:
        data (pd.DataFrame): input data frame.
        mode (str, optional): preprocessing variant, see above.
            Defaults to "log_standardized".

    Returns:
        pd.DataFrame: new data frame with the features and the "buy_sell"
        column (as int8), indexed like `data`.
    """
    # raw price / size / quote columns that are carried over to the output
    raw_cols = [
        "ask_ex",
        "bid_ex",
        "BEST_ASK",
        "BEST_BID",
        "TRADE_PRICE",
        "price_all_lag",
        "price_all_lead",
        "price_ex_lag",
        "price_ex_lead",
        "TRADE_SIZE",
        "bid_size_ex",
        "ask_size_ex",
        "day_vol",
        "myn",
        "STRK_PRC",
    ]

    # set up df, overwrite later
    x = pd.DataFrame(data={"TRADE_PRICE": data["TRADE_PRICE"]}, index=data.index)

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x["depth_ex"] = data["bid_size_ex"] - data["ask_size_ex"]

    # classical
    cond_ex = data["ask_ex"] >= data["bid_ex"]
    cond_best = data["BEST_ASK"] >= data["BEST_BID"]

    # assume positive spread; mid and spread are NaN where ask < bid
    mid_ex = np.where(cond_ex, 0.5 * (data["ask_ex"] + data["bid_ex"]), np.nan)
    mid_best = np.where(cond_best, 0.5 * (data["BEST_ASK"] + data["BEST_BID"]), np.nan)

    spread_ex = np.where(cond_ex, data["ask_ex"] - data["bid_ex"], np.nan)
    spread_best = np.where(cond_best, data["BEST_ASK"] - data["BEST_BID"], np.nan)

    # distance of the trade price from the mid, in units of the half-spread
    x["prox_ex"] = (data["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)
    x["prox_best"] = (data["TRADE_PRICE"] - mid_best) / (0.5 * spread_best)

    # custom features
    x["spread_ex"] = spread_ex
    x["spread_best"] = spread_best
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]

    # calculate change
    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    if "clip" in mode:
        print("clipping...")
        # apply clipping, avoids exploding / vanishing gradients
        to_clip = [
            "chg_ex_lead",
            "chg_ex_lag",
            "chg_all_lead",
            "chg_all_lag",
            "prox_ex",
            "prox_best",
            "bid_ask_size_ratio_ex",
            "rel_bid_size_ex",
            "rel_ask_size_ex",
            "depth_ex",
        ]
        x[to_clip] = x[to_clip].clip(-3, 3)

    if "log" in mode:
        print("log transform...")
        # log transformed features
        x[raw_cols] = np.log1p(data[raw_cols])

        x["mid_ex"] = np.log1p(mid_ex)
        x["mid_best"] = np.log1p(mid_best)

        x["ttm"] = _months_to_expiry(data)

        # save num columns for scaler
        num_cols = x.columns.tolist()

        # impute with zeros
        x.replace([np.inf, -np.inf], np.nan, inplace=True)
        x.fillna(0, inplace=True)

        def as_column(column: str) -> np.ndarray:
            # the encoders expect a 2-d input with a single feature column
            return data[column].astype(str).values.reshape(-1, 1)

        # standardize continuous columns
        # ordinal encode categorical features
        try:
            x[num_cols] = scaler.transform(x[num_cols])
            x["option_type"] = oe_option_type.transform(as_column("OPTION_TYPE"))
            x["issue_type"] = oe_issue_type.transform(as_column("issue_type"))
            x["root"] = oe_root.transform(as_column("ROOT"))
            print("transform (val + test)")
        except NotFittedError:
            # first call with unfitted objects: fit on this frame
            x[num_cols] = scaler.fit_transform(x[num_cols])
            x["option_type"] = oe_option_type.fit_transform(as_column("OPTION_TYPE"))
            x["issue_type"] = oe_issue_type.fit_transform(as_column("issue_type"))
            x["root"] = oe_root.fit_transform(as_column("ROOT"))
            print("fit_transform (train)")
    if mode == "none":
        # raw features, neither transformed nor scaled
        x[raw_cols] = data[raw_cols]

        x["mid_ex"] = mid_ex
        x["mid_best"] = mid_best

        x["ttm"] = _months_to_expiry(data)

        # map infinities (e.g. from division by a zero size) to NaN;
        # missing values are deliberately NOT imputed in this mode
        x.replace([np.inf, -np.inf], np.nan, inplace=True)

        # just copy
        x["option_type"] = data["OPTION_TYPE"]
        x["issue_type"] = data["issue_type"]
        x["root"] = data["ROOT"]

    x["buy_sell"] = data["buy_sell"].astype("int8")
    return x

## Write to file

In [None]:
name = f"{exchange}_{strategy}_{mode}"
output_dir = (
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{name}"
)

dataset = wandb.Artifact(name=name, type="preprocessed_data")

# Splits are processed strictly in the order train -> val -> test, because the
# first `transform` call fits any still-unfitted scalers. Each frame is
# replaced by its transformed version, written, and dropped right away to keep
# memory usage low.
if strategy in ("supervised", "unsupervised"):
    output_path = f"{output_dir}/train_set.parquet"
    train = transform(train, mode)
    train.to_parquet(output_path)
    del train
    gc.collect()
    dataset.add_reference(output_path)

if strategy == "supervised":
    output_path = f"{output_dir}/val_set.parquet"
    val = transform(val, mode)
    val.to_parquet(output_path)
    del val
    gc.collect()
    dataset.add_reference(output_path)

if strategy in ("supervised", "transfer"):
    output_path = f"{output_dir}/test_set.parquet"
    test = transform(test, mode)
    test.to_parquet(output_path)
    del test
    gc.collect()
    dataset.add_reference(output_path)

run.log_artifact(dataset)

In [None]:
# save scaler to pickle

if strategy == "supervised":
    # bundle the objects `transform` used, so other strategies can reload them
    scalers = {
        "scaler": scaler,
        "oe_option_type": oe_option_type,
        "oe_root": oe_root,
        "oe_issue_type": oe_issue_type,
    }
    uri_scalers = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{name}/scalers.sklearn"
    with fs.open(uri_scalers, "wb") as f:
        pickle.dump(scalers, f, protocol=4)

    # log scaler to wandb
    # Use a separate name for the artifact: rebinding `scaler` here would
    # replace the fitted StandardScaler that `transform` reads as a global and
    # break any later `transform` call in a "log" mode.
    scaler_artifact = wandb.Artifact(name=f"{name}_scaler", type="scaler")
    scaler_artifact.add_reference(uri_scalers)
    run.log_artifact(scaler_artifact)

In [None]:
# close the Weights & Biases run; the cells below do not use `run` any more
run.finish()

## Adversarial Validation
> Adversarial Validation is a technique allowing you to easily estimate the degree of difference between your training and test data. This technique was long rumored among Kaggle participants and transmitted from team to team until it emerged publicly thanks to a post by Zygmunt Zając (https://www.kaggle.com/zygmunt) on his FastML blog. (adapted from Banachewicz et al.)

In [None]:
# fmt: off
# price, quote, price-change and proximity features
features_classical = [
    "TRADE_PRICE",
    "bid_ex", "ask_ex", "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead", "price_all_lag", "price_all_lead",
    "chg_ex_lead", "chg_ex_lag", "chg_all_lead", "chg_all_lag",
    "prox_ex", "prox_best",
]

# trade size and quoted size features
features_size = [
    "bid_ask_size_ratio_ex", "rel_bid_size_ex", "rel_ask_size_ex",
    "TRADE_SIZE", "bid_size_ex", "ask_size_ex",
    "depth_ex",
]
# fmt: on

# "buy_sell" is only loaded alongside the features here; it is removed again later
features_classical_size = features_classical + features_size + ["buy_sell"]

In [None]:
# preprocessed splits compared in the adversarial validation below
adv_data_dir = (
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized"
)

# the generator is consumed in order: first the training set, then the validation set
train, val = (
    pd.read_parquet(
        f"{adv_data_dir}/{file_name}",
        engine="fastparquet",
        columns=features_classical_size,
    )
    for file_name in ("train_set_60.parquet", "val_set_20.parquet")
)

In [None]:
# stack the training set on top of the validation set and remove "buy_sell",
# which was only loaded alongside the features
X = pd.concat([train, val]).drop(columns=["buy_sell"])

# adversarial target: zeros for rows of the training set, ones for rows of the
# validation set (same order as in the concatenation above)
n_train, n_val = len(train), len(val)
y = [0] * n_train + [1] * n_val

In [None]:
# sanity check: features that remain after dropping "buy_sell"
X.columns

In [None]:
# adversarial classifier: gradient boosting model that tries to tell training
# rows from validation rows (evaluated on a single hold-out split below)
clf = CatBoostClassifier(
    task_type="GPU",
    logging_level="Silent",
    random_seed=seed,  # global seed from the configuration cell (42)
    eval_metric="Accuracy",
)

In [None]:
# shuffled 80 / 20 hold-out split of the stacked train + validation rows;
# uses the global seed from the configuration cell (42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, shuffle=True
)
clf.fit(X_train, y_train, eval_set=(X_test, y_test))

In [None]:
# predicted origin (0 = training set, 1 = validation set) for the hold-out rows
y_pred = clf.predict(X_test)

In [None]:
# use mcc as data is imbalanced 3/4 train set, 1/4 val set
# (a value near 0 means the classifier cannot tell the two sets apart)
print(matthews_corrcoef(y_test, y_pred))

In [None]:
# features the adversarial classifier relies on most, i.e. the candidates for
# a distribution difference between training and validation set
feature_importance = clf.get_feature_importance(
    prettified=True, type="FeatureImportance"
)
feature_importance

In [None]:
# persist the importance table (path is relative to the working directory)
feature_importance.to_csv("feature_importance_gbm_classical_size.csv")

## Kolmogorov Smirnov

In [None]:
from scipy.stats import ks_2samp

# every loaded column is tested, including "buy_sell"
cols = train.columns.tolist()


def _ks_record(col: str) -> dict:
    """Run the two-sample KS test of `train[col]` against `val[col]`."""
    res = ks_2samp(train[col], val[col])
    return {"col": col, "static": res.statistic, "pvalue": res.pvalue}


# one row per column: test statistic and p-value
results = pd.DataFrame([_ks_record(col) for col in cols])
results.to_csv("kolmogorov_smirnov.csv")

## Auto-Correlation

In [None]:
# At this point `train` holds the preprocessed frame loaded for the
# adversarial validation. It lacks raw columns that `transform` reads
# (e.g. "EXPIRATION", "day_vol"), so transforming it raises a KeyError.
# Reload the raw training split from the downloaded artifact first.
# NOTE(review): assumes `data_dir` still points at the raw artifact of the
# configured exchange / strategy -- confirm this is the intended data source.
train = pd.read_parquet(
    Path(data_dir, "train_set"), engine="fastparquet", columns=columns
)
train = transform(train, mode="none")

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rc

from otc.features.build_features import features_classical_size

In [None]:
# first 200,000 rows of the selected features
# (presumably a cap to keep the auto-correlation plots cheap -- TODO confirm)
X = train[features_classical_size].head(200000)

In [None]:
# column names in plotting order
cols = X.columns.tolist()

In [None]:
# display the columns to line them up with the clear names defined next
cols

In [None]:
# human-readable plot titles; the i-th entry labels the i-th entry of `cols`
# fmt: off
cols_clearname = [
    # prices and quotes
    "trade price",
    "bid (ex)", "ask (ex)", "ask (best)", "bid (best)",
    "price lag (ex)", "price lead (ex)", "price lag (all)", "price lead (all)",
    # price changes and proximity to quotes
    "price chg. lead (ex)", "price chg. lag (ex)",
    "price chg. lead (all)", "price chg. lag (all)",
    "prox (ex)", "prox (best)",
    # sizes
    "bid ask size ratio (ex)", "rel. bid size (ex)", "rel. ask size (ex)",
    "trade size", "bid size (ex)", "ask size (ex)",
    "depth (ex)",
]
# fmt: on

In [None]:
# must equal len(cols): one title per plotted column
len(cols_clearname)

In [None]:
# eyeball check that every column is paired with the right title
list(zip(cols, cols_clearname))

In [None]:
from cycler import cycler

# LaTeX / pgf friendly defaults: serif fonts taken from the document, 11pt labels
params = {
    "pgf.texsystem": "xelatex",
    "pgf.rcfonts": False,
    "font.serif": [],
    "font.family": "serif",
    "font.sans-serif": [],
    "axes.labelsize": 11,
}
plt.rcParams.update(params)

# render all text with LaTeX
rc("text", usetex=True)
plt.rc("text.latex", preamble=r"\usepackage{amsmath}\usepackage[utf8]{inputenc}")

cmap = mpl.colormaps.get_cmap("plasma")

# https://ranocha.de/blog/colors/
# Standard SciencePlots color cycle
# (replaced again by `line_cycler` at the end of this cell)
scienceplots_colors = [
    "0C5DA5",
    "00B945",
    "FF9500",
    "FF2C00",
    "845B97",
    "474747",
    "9e9e9e",
]
mpl.rcParams["axes.prop_cycle"] = mpl.cycler("color", scienceplots_colors)

# colours adapted to colourblind people, shared by the line and marker cycler
colorblind_palette = [
    "#E69F00",
    "#56B4E9",
    "#009E73",
    "#0072B2",
    "#D55E00",
    "#CC79A7",
    "#F0E442",
]
n_colors = len(colorblind_palette)

# lines: colour only
line_cycler = cycler(color=colorblind_palette)

# markers: same colours, no connecting line, one marker shape per colour
marker_cycler = (
    cycler(color=colorblind_palette)
    + cycler(linestyle=["none"] * n_colors)
    + cycler(marker=["4", "2", "3", "1", "+", "x", "."])
)

plt.rc("axes", prop_cycle=line_cycler)

In [None]:
print(cols)

CM = 1 / 2.54  # centimetres -> inches, as `figsize` is given in inches
N_PLOT_COLS = 4  # panels per row
MAX_LAGS = 20  # auto-correlation is shown for lags -20 ... 20

# every plotted column needs its own title; fail early on a mismatch
assert len(cols) == len(cols_clearname), "cols and cols_clearname differ in length"

# ceil division: just enough rows to hold one panel per column
n_plot_rows = -(-len(cols) // N_PLOT_COLS)

(fig, ax) = plt.subplots(
    nrows=n_plot_rows,
    ncols=N_PLOT_COLS,
    sharey=True,
    squeeze=False,  # always a 2-d array of axes, also for a single row
    constrained_layout=True,
    figsize=(14 * CM, 14 * CM),
)

# row-major list of panels: panel i sits in row i // 4, column i % 4
panels = list(ax.flat)

# wrap `cols` (not `enumerate`) so that tqdm knows the total
for i, col in enumerate(tqdm(cols)):
    panels[i].acorr(
        X[col].astype(float), usevlines=True, normed=True, maxlags=MAX_LAGS, lw=1
    )
    panels[i].set_title(cols_clearname[i])

# remove empty plots (trailing panels of the last row without a column)
for unused_panel in panels[len(cols) :]:
    fig.delaxes(unused_panel)

fig.savefig(
    "../reports/Graphs/auto_corr_features.pdf",
    bbox_inches="tight",
)