In [1]:
import os
import gc

import gcsfs

import numpy as np
import numpy.typing as npt
import pandas as pd

from pathlib import Path

import wandb

from catboost import CatBoostClassifier, Pool

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PowerTransformer
from sklearn.exceptions import NotFittedError
# from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef

import pickle
import google.auth

In [2]:
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)



In [3]:
# connect to weights and biases
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
  warn("The `IPython.html` package has been deprecated since IPython 4.0. "
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# set here globally
seed = 42

exchange = "cboe" # "ise"
strategy = "supervised" # "transfer" # "unsupervised"

In [5]:
dataset = f"fbv/thesis/{exchange}_{strategy}_raw:latest"

os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"
run = wandb.init(project="thesis", entity="fbv")

# load unscaled data
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact cboe_supervised_raw:latest, 1937.19MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0


In [6]:
# reduce number of imported cols due to memory issues
columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "issue_type",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag",
    "buy_sell",
    "day_vol",
    "myn",
]


In [7]:
if strategy == "supervised":
    train = pd.read_parquet(
        Path(data_dir, "train_set"), engine="fastparquet", columns=columns
    )
    val = pd.read_parquet(
        Path(data_dir, "val_set"), engine="fastparquet", columns=columns
    )
    test = pd.read_parquet(
        Path(data_dir, "test_set"), engine="fastparquet", columns=columns
    )
    
elif strategy == "unsupervised":
    # load unlabelled training set
    train = pd.read_parquet(
        Path(data_dir, "train_set"), engine="fastparquet", columns=columns
    )

elif strategy == "transfer":
    # load test set
    test = pd.read_parquet(
        Path(data_dir, "test_set"), engine="fastparquet", columns=columns
    )


In [8]:
num_features = [
    "STRK_PRC",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag",
    "day_vol",
    "myn",
]


## Box Cox Test

In [13]:
train[num_features].min()


STRK_PRC          5.000000e-01
TRADE_SIZE        1.000000e+00
TRADE_PRICE       1.000000e-02
BEST_BID          0.000000e+00
BEST_ASK          0.000000e+00
ask_ex            0.000000e+00
bid_ex            0.000000e+00
bid_size_ex       0.000000e+00
ask_size_ex       0.000000e+00
price_all_lead    1.000000e-02
price_all_lag     1.000000e-02
price_ex_lead     1.000000e-02
price_ex_lag      1.000000e-02
day_vol           1.000000e+00
myn               3.725289e-07
dtype: float64

In [14]:
box_cox = PowerTransformer(method="box-cox")
# add constant as box cox works only on positive data
box_cox.fit(train[num_features] + 0.01)


In [15]:
lambdas = pd.Series(data=box_cox.lambdas_, index=num_features)
lambdas


STRK_PRC         -0.082968
TRADE_SIZE       -0.204893
TRADE_PRICE       0.060425
BEST_BID          0.110299
BEST_ASK          0.056837
ask_ex            0.049090
bid_ex            0.113067
bid_size_ex       0.140284
ask_size_ex       0.029876
price_all_lead    0.054975
price_all_lag     0.049097
price_ex_lead     0.053774
price_ex_lag      0.042910
day_vol          -0.116017
myn              -0.109716
dtype: float64

Use smallest possible constant for Box-Cox test. All $\lambda \approx 0 \implies \log(\cdot)$ for price, size, and quotes.

In [9]:
if strategy == "supervised":
    scaler = StandardScaler()
    oe_option_type = OrdinalEncoder(
        unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
    )
    oe_root = OrdinalEncoder(
        unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
    )
    oe_issue_type = OrdinalEncoder(
       unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
    )

else:
    # if mode transfer or mode unsupervised -> use scaler from ise supervised dataset
    # if mode supervised -> fit scaler on ise / cboe training set and apply on validation and test set

    # TODO: Fix if I get unlabelled CBOE dataset
    artifact = run.use_artifact("fbv/thesis/ise_supervised_log_standardized_scaler:latest")
    scaler_dir = artifact.download()
    scalers = pickle.load(open(Path(scaler_dir,"ise_supervised_scalers.sklearn"), 'rb'))
    
    # set fitted scalers
    scaler = scalers["scaler"]
    oe_option_type = scalers["oe_option_type"]
    oe_root = scalers["oe_root"]
    oe_issue_type = scalers["oe_issue_type"]


def transform(data: pd.DataFrame) -> pd.DataFrame:
    """
    Create features, impute, and scale.

    Args:
        data (pd.DataFrame): input data frame.
    Returns:
        pd.DataFrame: updated data frame.
    """

    # set up df, overwrite later
    x = pd.DataFrame(data={"TRADE_PRICE": data["TRADE_PRICE"]}, index=data.index)

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x["depth_ex"] = data["bid_size_ex"] - data["ask_size_ex"]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])

    spread_ex = data["ask_ex"] - data["bid_ex"]
    spread_best = data["BEST_ASK"] - data["BEST_BID"]

    x["prox_ex"] = (data["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)
    x["prox_best"] = (data["TRADE_PRICE"] - mid_best) / (0.5 * spread_best)

    # custom features
    x["spread_ex"] = spread_ex
    x["spread_best"] = spread_best
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]
    # x["price_rel_nbo"] = (data["TRADE_PRICE"] - data["BEST_ASK"]) / (
    #     data["BEST_ASK"] - mid_best
    # )
    # x["price_rel_nbb"] = (data["TRADE_PRICE"] - data["BEST_BID"]) / (
    #     mid_best - data["BEST_BID"]
    # )

    # calculate change
    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # asks = [f"ASK_{i}" for i in range(1, 17)]
    # bids = [f"BID_{i}" for i in range(1, 17)]
    
    # log transformed features
    x[
        [
            "ask_ex",
            "bid_ex",
            "BEST_ASK",
            "BEST_BID",
            "TRADE_PRICE",
            "price_all_lag",
            "price_all_lead",
            "price_ex_lag",
            "price_ex_lead",
            "TRADE_SIZE",
            "bid_size_ex",
            "ask_size_ex",
            "day_vol",
            "myn",
            "STRK_PRC",
            # *asks,
            # *bids
        ]
    ] = np.log1p(
        data[
            [
                "ask_ex",
                "bid_ex",
                "BEST_ASK",
                "BEST_BID",
                "TRADE_PRICE",
                "price_all_lag",
                "price_all_lead",
                "price_ex_lag",
                "price_ex_lead",
                "TRADE_SIZE",
                "bid_size_ex",
                "ask_size_ex",
                "day_vol",
                "myn",
                "STRK_PRC",
                # *asks,
                # *bids
            ]
        ]
    )
    x["mid_ex"] = np.log1p(mid_ex)
    x["mid_best"] = np.log1p(mid_best)

    x["ttm"] = (
        data["EXPIRATION"].dt.to_period("M") - data["QUOTE_DATETIME"].dt.to_period("M")
    ).apply(lambda x: x.n)

    # save num columns for scaler
    num_cols = x.columns.tolist()

#     # date features
#     x["date_year"] = data["QUOTE_DATETIME"].dt.year

#     months_in_year = 12
#     x["date_month_sin"] = sin_encode(data["QUOTE_DATETIME"].dt.month, months_in_year)
#     x["date_month_cos"] = cos_encode(data["QUOTE_DATETIME"].dt.month, months_in_year)

#     days_in_month = 31  # at max :-)
#     x["date_day_sin"] = sin_encode(data["QUOTE_DATETIME"].dt.day, days_in_month)
#     x["date_day_cos"] = cos_encode(data["QUOTE_DATETIME"].dt.day, days_in_month)

#     days_in_week = 7
#     x["date_weekday_sin"] = sin_encode(
#         data["QUOTE_DATETIME"].dt.dayofweek, days_in_week
#     )
#     x["date_weekday_cos"] = cos_encode(
#         data["QUOTE_DATETIME"].dt.dayofweek, days_in_week
#     )

#     seconds_in_day = 24 * 60 * 60
#     seconds = (
#         data["QUOTE_DATETIME"] - data["QUOTE_DATETIME"].dt.normalize()
#     ).dt.total_seconds()

#     x["date_time_sin"] = sin_encode(seconds, seconds_in_day)
#     x["date_time_cos"] = cos_encode(seconds, seconds_in_day)

    # impute with zeros
    x.replace([np.inf, -np.inf], np.nan, inplace=True)
    x.fillna(0, inplace=True)

    # standardize continous columns (w/o date features)
    # bin encode categorical features
    try:
        x[num_cols] = scaler.transform(x[num_cols])
        x["bin_option_type"] = oe_option_type.transform(
            data["OPTION_TYPE"].astype(str).values.reshape(-1, 1)
        )
        x["bin_issue_type"] = oe_issue_type.transform(
            data["issue_type"].astype(str).values.reshape(-1, 1)
        )
        x["bin_root"] = oe_root.transform(
            data["ROOT"].astype(str).values.reshape(-1, 1)
        )
        print("transform (val + test)")
    except NotFittedError as e:
        x[num_cols] = scaler.fit_transform(x[num_cols])
        x["bin_option_type"] = oe_option_type.fit_transform(
            data["OPTION_TYPE"].astype(str).values.reshape(-1, 1)
        )
        x["bin_issue_type"] = oe_issue_type.fit_transform(
            data["issue_type"].astype(str).values.reshape(-1, 1)
        )
        x["bin_root"] = oe_root.fit_transform(
            data["ROOT"].astype(str).values.reshape(-1, 1)
        )
        print("fit_transform (train)")

    x["buy_sell"] = data["buy_sell"].astype("int8")
    return x


## Write to file

In [10]:
name = f"{exchange}_{strategy}_log_standardized"

dataset = wandb.Artifact(name=name, type="preprocessed_data")

if strategy == "supervised" or strategy == "unsupervised":
    output_path = (
        f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_train_log_standardized.parquet"
    )
    train = transform(train)
    train.to_parquet(output_path)
    del train
    gc.collect()

    dataset.add_reference(output_path)

if strategy == "supervised":
    output_path = (
        f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_val_log_standardized.parquet"
    )
    val = transform(val)
    val.to_parquet(output_path)
    del val
    gc.collect()
    dataset.add_reference(output_path)

if strategy == "supervised" or strategy == "transfer":
    output_path = (
        f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_test_log_standardized.parquet"
    )

    test = transform(test)
    test.to_parquet(output_path)
    del test
    gc.collect()
    dataset.add_reference(output_path)

run.log_artifact(dataset)


  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


fit_transform (train)




transform (val + test)
transform (val + test)


<wandb.sdk.wandb_artifacts.Artifact at 0x14986acd4fd0>

In [11]:
# save scaler to pickle

if strategy == "supervised":
    
    scalers = {"scaler": scaler, "oe_option_type": oe_option_type,"oe_root": oe_root, "oe_issue_type": oe_issue_type}    
    uri_scalers = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_scalers.sklearn"
    with fs.open(uri_scalers, "wb") as f:
        pickle.dump(scalers, f, protocol=4)  # type: ignore

    # log scaler to wandb
    scaler = wandb.Artifact(name=f"{name}_scaler", type="scaler")
    scaler.add_reference(uri_scalers)
    run.log_artifact(scaler)    



In [12]:
run.finish()

## Adversarial Validation
> Adversarial Validation is a technique allowing you to easily estimate the degree of difference between your training and test data. This technique was long rumored among Kaggle participants and transmitted from team to team until it emerged publicly thanks to a post by Zygmunt Zając (https://www.kaggle.com/zygmunt) on his FastML blog. (adapted from Banchawicz et. al)

In [2]:
features_classical = [
    "TRADE_PRICE",
    "bid_ex",
    "ask_ex",
    "BEST_ASK",
    "BEST_BID",
    "price_ex_lag",
    "price_ex_lead",
    "price_all_lag",
    "price_all_lead",
    "chg_ex_lead",
    "chg_ex_lag",
    "chg_all_lead",
    "chg_all_lag",
    "prox_ex",
    "prox_best",
]

features_size = [
    "bid_ask_size_ratio_ex",
    "rel_bid_size_ex",
    "rel_ask_size_ex",
    "TRADE_SIZE",
    "bid_size_ex",
    "ask_size_ex",
    "depth_ex",
]

features_classical_size = [
    *features_classical,
    *features_size,
    "buy_sell", # add here and remove later
]

In [3]:
train = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/train_set_60.parquet",
    engine="fastparquet", columns=features_classical_size,
)
val = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet",
    engine="fastparquet", columns=features_classical_size,
)



In [115]:
X = pd.concat([train, val])
X.drop(columns=["buy_sell"], inplace=True)
# assign zeros to train set and ones to test set
y = [0] * len(train) + [1] * len(val)

In [5]:
X.columns

Index(['TRADE_PRICE', 'bid_ex', 'ask_ex', 'BEST_ASK', 'BEST_BID',
       'price_ex_lag', 'price_ex_lead', 'price_all_lag', 'price_all_lead',
       'chg_ex_lead', 'chg_ex_lag', 'chg_all_lead', 'chg_all_lag', 'prox_ex',
       'prox_best', 'bid_ask_size_ratio_ex', 'rel_bid_size_ex',
       'rel_ask_size_ex', 'TRADE_SIZE', 'bid_size_ex', 'ask_size_ex',
       'depth_ex'],
      dtype='object')

In [6]:
# perform cv with catboost classifier
clf = CatBoostClassifier(
    task_type="GPU",
    logging_level="Silent",
	random_seed= 42,
    eval_metric="Accuracy",
)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
clf.fit(X_train, y_train, eval_set=(X_test,y_test))

<catboost.core.CatBoostClassifier at 0x14e7e5828490>

In [9]:
y_pred = clf.predict(X_test)

In [10]:
# use mcc as data is imbalanced 3/4 train set, 1/4 val set
print(matthews_corrcoef(y_test, y_pred))

0.3789492800772903


In [11]:
feature_importance = clf.get_feature_importance(prettified=True, type="FeatureImportance")
feature_importance

Unnamed: 0,Feature Id,Importances
0,prox_best,19.640974
1,prox_ex,13.733821
2,chg_all_lag,9.785274
3,chg_all_lead,9.090985
4,bid_size_ex,6.777612
5,ask_size_ex,5.955042
6,price_ex_lead,5.087272
7,ask_ex,4.229072
8,chg_ex_lead,3.645325
9,chg_ex_lag,3.624959


In [12]:
feature_importance.to_csv("feature_importance_gbm_classical_size.csv")

## Kolmogorov Smirnov

In [114]:
from scipy.stats import ks_2samp

cols = train.columns.tolist()
# cols.remove("buy_sell")
results = []

for col in cols: 
    res = ks_2samp(train[col], val[col])
    
    results.append({"col":col, "static": res.statistic, "pvalue":res.pvalue})

results = pd.DataFrame(results)
results.to_csv("kolmogorov_smirnov.csv")

Unnamed: 0,col,static,pvalue
0,TRADE_PRICE,0.383333,0.009918
1,bid_ex,0.316667,0.053969
2,ask_ex,0.333333,0.036511
3,BEST_ASK,0.358333,0.019509
4,BEST_BID,0.333333,0.036511
5,price_ex_lag,0.35,0.024173
6,price_ex_lead,0.416667,0.003717
7,price_all_lag,0.383333,0.009918
8,price_all_lead,0.425,0.002866
9,chg_ex_lead,0.35,0.024173


In [43]:
from numpy.fft import fft, ifft,fftshift


In [56]:
# data = numpy.arange(5*4).reshape(5, 4)
# print data
# ##[[ 0  1  2  3]
# ## [ 4  5  6  7]
# ## [ 8  9 10 11]
# ## [12 13 14 15]
# ## [16 17 18 19]]

l = 2 ** int(np.log2(foo.shape[1] * 2 - 1))
print(l)
fftx = fft(foo, n = l, axis = 1)
ret = ifft(fftx * np.conjugate(fftx), axis = 1).real
# ret = fftshift(ret, axes=1)

print(ret.shape)
# dataFT = fft(foo, axis=1)
# dataAC = ifft(dataFT * np.conjugate(dataFT), axis=1).real

2
(120, 2)


In [39]:
dataAC.shape

(100, 23)

## Auto-Correlation

In [13]:
import matplotlib.pyplot as plt

In [None]:
cols = train.columns.tolist()
# cols.remove("buy_sell")

CM = 1 / 2.54

(fig, ax) = plt.subplots(nrows=(len(cols) //4) + 1, ncols=4, sharey=True, constrained_layout=True, figsize=(14*CM, 14 *CM))

for i, col in enumerate(cols):
    
    r = i // 4
    c = i % 4
    
    ax[r][c].acorr(X[col], usevlines=True, normed=True, maxlags=20, lw=1)
    ax[r][c].set_title(col)

 

# remove empty plots
fig.delaxes(ax[5][2])
fig.delaxes(ax[5][3])

plt.savefig(
    f"auto_corr_features.pdf",
    bbox_inches="tight",
)