Run `pip install .` first to install all dependencies.

In [36]:
import os

from catboost import CatBoostClassifier
import gcsfs

import numpy as np
import pandas as pd

import wandb

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PowerTransformer
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

In [9]:
# start a Weights & Biases run; `run` is used further down to log the dataset artifacts
run = wandb.init(
    entity="fbv",
    project="thesis",
    job_type="dataset-creation",
)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [10]:
import google.auth
from google.colab import auth

# authenticate the Colab user, then hand the resulting default credentials to gcsfs
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(
    credentials=credentials,
    project="thesis",
)


In [15]:
# only a subset of the columns is read here to keep memory usage down
columns = [
    # timestamp, contract keys, and trade columns
    "QUOTE_DATETIME", "ROOT", "EXPIRATION", "STRK_PRC", "OPTION_TYPE",
    "TRADE_SIZE", "TRADE_PRICE",
    # BEST_* and *_ex quote columns
    "BEST_BID", "BEST_ASK",
    "ask_ex", "bid_ex", "bid_size_ex", "ask_size_ex",
    # lead / lag prices
    "price_all_lead", "price_all_lag", "price_ex_lead", "price_ex_lag",
    # remaining columns
    "buy_sell", "day_vol", "myn",
]


In [16]:
# both splits are stored in the same folder of the bucket
preprocessed_dir = "gs://thesis-bucket-option-trade-classification/data/preprocessed"

# read only the columns selected above
train = pd.read_parquet(
    f"{preprocessed_dir}/train_set_extended_60.parquet",
    columns=columns,
    engine="fastparquet",
)
val = pd.read_parquet(
    f"{preprocessed_dir}/val_set_extended_20.parquet",
    columns=columns,
    engine="fastparquet",
)


In [17]:
# numeric columns whose power-transform lambdas are estimated below
num_features = [
    "STRK_PRC", "TRADE_SIZE", "TRADE_PRICE",
    "BEST_BID", "BEST_ASK",
    "ask_ex", "bid_ex", "bid_size_ex", "ask_size_ex",
    "price_all_lead", "price_all_lag", "price_ex_lead", "price_ex_lag",
    "day_vol", "myn",
]


## Yeo-Johnson Test

In [18]:
# estimate one Yeo-Johnson lambda per numeric feature on the training split
yeo_johnson = PowerTransformer(method="yeo-johnson")
yeo_johnson.fit(X=train[num_features])


PowerTransformer()

In [19]:
# label the fitted Yeo-Johnson lambdas with their feature names
lambdas = pd.Series(yeo_johnson.lambdas_, index=num_features)
lambdas


STRK_PRC         -0.134935
TRADE_SIZE       -0.376653
TRADE_PRICE      -0.450919
BEST_BID         -0.445661
BEST_ASK         -0.446576
ask_ex           -0.446723
bid_ex           -0.445381
bid_size_ex       0.037945
ask_size_ex      -0.011581
price_all_lead   -0.451470
price_all_lag    -0.457447
price_ex_lead    -0.453580
price_ex_lag     -0.459588
day_vol          -0.208610
myn              -1.125279
dtype: float64

## Box-Cox Test

In [20]:
# column minima: Box-Cox needs strictly positive inputs, so check how far the data must be shifted
train[num_features].min(axis=0)


STRK_PRC          0.500000
TRADE_SIZE        1.000000
TRADE_PRICE       0.010000
BEST_BID          0.000000
BEST_ASK          0.000000
ask_ex            0.000000
bid_ex            0.000000
bid_size_ex       0.000000
ask_size_ex       0.000000
price_all_lead    0.010000
price_all_lag     0.010000
price_ex_lead     0.010000
price_ex_lag      0.010000
day_vol           1.000000
myn               0.000006
dtype: float64

In [21]:
# Box-Cox is only defined for positive data, hence every column is shifted by 1 before fitting
box_cox = PowerTransformer(method="box-cox")
box_cox.fit(train[num_features].add(1))


PowerTransformer(method='box-cox')

In [22]:
# label the Box-Cox lambdas (shift of 1) with their feature names
lambdas = pd.Series(box_cox.lambdas_, index=num_features)
lambdas


STRK_PRC         -0.134935
TRADE_SIZE       -0.376653
TRADE_PRICE      -0.450919
BEST_BID         -0.445661
BEST_ASK         -0.446576
ask_ex           -0.446723
bid_ex           -0.445381
bid_size_ex       0.037945
ask_size_ex      -0.011581
price_all_lead   -0.451470
price_all_lag    -0.457447
price_ex_lead    -0.453580
price_ex_lag     -0.459588
day_vol          -0.208610
myn              -1.125279
dtype: float64

In [23]:
# repeat the Box-Cox fit with a much smaller shift (0.01) to keep the data positive
box_cox = PowerTransformer(method="box-cox")
box_cox.fit(train[num_features].add(0.01))


PowerTransformer(method='box-cox')

In [24]:
# label the Box-Cox lambdas (shift of 0.01) with their feature names
lambdas = pd.Series(box_cox.lambdas_, index=num_features)
lambdas


STRK_PRC         -0.091617
TRADE_SIZE       -0.202993
TRADE_PRICE       0.056146
BEST_BID          0.106694
BEST_ASK          0.052514
ask_ex            0.043842
bid_ex            0.109440
bid_size_ex       0.142543
ask_size_ex       0.029668
price_all_lead    0.049490
price_all_lag     0.042361
price_ex_lead     0.047954
price_ex_lag      0.035568
day_vol          -0.114507
myn               0.088899
dtype: float64

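The shift constant itself changes the estimated $\lambda$ (compare the results for $+1$ and $+0.01$ above; with $+1$, Box-Cox simply reproduces the Yeo-Johnson estimates, since both coincide on non-negative data). Therefore, use the smallest possible constant for the Box-Cox test. With the small constant, all $\lambda \approx 0 \implies \log(\cdot)$ is a suitable transform for price, size, and quotes.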

In [25]:
def sin_encode(x, period):
    """Return the sine component of a cyclical encoding.

    Args:
        x: value(s) to encode, a scalar or anything numpy can broadcast.
        period: length of one full cycle, in the same unit as ``x``.

    Returns:
        ``sin(2 * pi * x / period)``, element-wise.
    """
    angle = x * 2 * np.pi / period
    return np.sin(angle)


def cos_encode(x, period):
    """Return the cosine component of a cyclical encoding.

    Args:
        x: value(s) to encode, a scalar or anything numpy can broadcast.
        period: length of one full cycle, in the same unit as ``x``.

    Returns:
        ``cos(2 * pi * x / period)``, element-wise.
    """
    angle = x * 2 * np.pi / period
    return np.cos(angle)


In [26]:
# shared, stateful preprocessors: `transform` fits them on its first call and reuses them afterwards
scaler = StandardScaler()

# one independent encoder per categorical column; unseen categories are encoded as -1
oe_option_type, oe_root, oe_issue_type = (
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, dtype=int)
    for _ in range(3)
)


def transform(data: pd.DataFrame) -> pd.DataFrame:
    """Derive the model features for one data split.

    Builds size, quote, and price-change features, log-transforms the raw
    numeric columns with ``np.log1p``, adds cyclical date encodings, imputes
    missing and infinite values with zero, standardizes the continuous columns,
    and ordinal-encodes the categorical columns.

    The module-level ``scaler`` and ``oe_*`` encoders are stateful: while they
    are still unfitted, the call fits them on ``data`` ("fit_transform (train)");
    every later call only applies the fitted parameters
    ("transform (val + test)"). Call this on the training split first.

    Args:
        data: frame with the datetime columns ``QUOTE_DATETIME`` and
            ``EXPIRATION``, the categorical columns ``OPTION_TYPE``,
            ``issue_type`` and ``ROOT``, the label ``buy_sell``, and the numeric
            columns listed in ``log_cols`` below.

    Returns:
        A new frame indexed like ``data`` that holds the standardized
        continuous features, ``date_year`` and the sin/cos date encodings, the
        ``bin_*`` encoded categoricals, and the unchanged ``buy_sell`` column.
    """
    # raw numeric columns that are kept as log1p-transformed features
    log_cols = [
        "ask_ex",
        "bid_ex",
        "BEST_ASK",
        "BEST_BID",
        "TRADE_PRICE",
        "price_all_lag",
        "price_all_lead",
        "price_ex_lag",
        "price_ex_lead",
        "TRADE_SIZE",
        "bid_size_ex",
        "ask_size_ex",
        "day_vol",
        "myn",
        "STRK_PRC",
    ]
    # (output column, encoder, source column), in the order the columns are appended
    cat_encodings = [
        ("bin_option_type", oe_option_type, "OPTION_TYPE"),
        ("bin_issue_type", oe_issue_type, "issue_type"),
        ("bin_root", oe_root, "ROOT"),
    ]

    # set up df; TRADE_PRICE is overwritten with its log-transform further down
    x = pd.DataFrame(data={"TRADE_PRICE": data["TRADE_PRICE"]}, index=data.index)

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x["depth_ex"] = data["bid_size_ex"] - data["ask_size_ex"]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])

    spread_ex = data["ask_ex"] - data["bid_ex"]
    spread_best = data["BEST_ASK"] - data["BEST_BID"]

    x["prox_ex"] = (data["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)
    x["prox_best"] = (data["TRADE_PRICE"] - mid_best) / (0.5 * spread_best)

    # custom features
    x["spread_ex"] = spread_ex
    x["spread_best"] = spread_best
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]
    x["price_rel_nbo"] = (data["TRADE_PRICE"] - data["BEST_ASK"]) / (
        data["BEST_ASK"] - mid_best
    )
    x["price_rel_nbb"] = (data["TRADE_PRICE"] - data["BEST_BID"]) / (
        mid_best - data["BEST_BID"]
    )

    # calculate change
    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # log transformed features
    x[log_cols] = np.log1p(data[log_cols])
    x["mid_ex"] = np.log1p(mid_ex)
    x["mid_best"] = np.log1p(mid_best)

    # time to maturity as the number of calendar months between quote and
    # expiration; computed vectorized, missing timestamps yield NaN and are
    # imputed with zero below
    expiration = data["EXPIRATION"].dt
    quote = data["QUOTE_DATETIME"].dt
    x["ttm"] = (expiration.year - quote.year) * 12 + (expiration.month - quote.month)

    # save num columns for scaler
    num_cols = x.columns.tolist()

    # date features
    x["date_year"] = data["QUOTE_DATETIME"].dt.year

    months_in_year = 12
    x["date_month_sin"] = sin_encode(data["QUOTE_DATETIME"].dt.month, months_in_year)
    x["date_month_cos"] = cos_encode(data["QUOTE_DATETIME"].dt.month, months_in_year)

    days_in_month = 31  # at max :-)
    x["date_day_sin"] = sin_encode(data["QUOTE_DATETIME"].dt.day, days_in_month)
    x["date_day_cos"] = cos_encode(data["QUOTE_DATETIME"].dt.day, days_in_month)

    days_in_week = 7
    x["date_weekday_sin"] = sin_encode(
        data["QUOTE_DATETIME"].dt.dayofweek, days_in_week
    )
    x["date_weekday_cos"] = cos_encode(
        data["QUOTE_DATETIME"].dt.dayofweek, days_in_week
    )

    # seconds elapsed since midnight of the quote's day
    seconds_in_day = 24 * 60 * 60
    seconds = (
        data["QUOTE_DATETIME"] - data["QUOTE_DATETIME"].dt.normalize()
    ).dt.total_seconds()

    x["date_time_sin"] = sin_encode(seconds, seconds_in_day)
    x["date_time_cos"] = cos_encode(seconds, seconds_in_day)

    # impute with zeros (divisions by zero above produce +/-inf)
    x.replace([np.inf, -np.inf], np.nan, inplace=True)
    x.fillna(0, inplace=True)

    # standardize continuous columns (w/o date features)
    # bin encode categorical features
    try:
        # raises NotFittedError on the first call, before anything is modified
        x[num_cols] = scaler.transform(x[num_cols])
        for target, encoder, source in cat_encodings:
            x[target] = encoder.transform(
                data[source].astype(str).values.reshape(-1, 1)
            )
        print("transform (val + test)")
    except NotFittedError:
        x[num_cols] = scaler.fit_transform(x[num_cols])
        for target, encoder, source in cat_encodings:
            x[target] = encoder.fit_transform(
                data[source].astype(str).values.reshape(-1, 1)
            )
        print("fit_transform (train)")

    x["buy_sell"] = data["buy_sell"]
    return x


## Write to file

In [27]:
# all three splits are stored in the same folder of the bucket
preprocessed_dir = "gs://thesis-bucket-option-trade-classification/data/preprocessed"

# reload the splits with all columns this time
train = pd.read_parquet(
    f"{preprocessed_dir}/train_set_extended_60.parquet", engine="fastparquet"
)
val = pd.read_parquet(
    f"{preprocessed_dir}/val_set_extended_20.parquet", engine="fastparquet"
)
test = pd.read_parquet(
    f"{preprocessed_dir}/test_set_extended_20.parquet", engine="fastparquet"
)


In [28]:
# NOTE(review): presumably picked up by the GCS client for the writes below -- confirm
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

# dataset name; also the folder the splits are written to
name = "ise_log_standardized"

# The train split has to be transformed first: `transform` fits the scaler and
# the encoders on its first call and only applies them on later calls.
# Each split is overwritten by its transformed version, so this cell is not
# idempotent -- reload the raw splits before running it again.
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/train_set_60.parquet"
)
print(output_path)
train = transform(train)
train.to_parquet(output_path)

# del train
# validation split: reuses the preprocessors fitted on train
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/val_set_20.parquet"
)
print(output_path)
val = transform(val)
val.to_parquet(output_path)

# del val
# test split: reuses the preprocessors fitted on train
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/test_set_20.parquet"
)
print(output_path)
test = transform(test)
test.to_parquet(output_path)

gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/train_set_60.parquet
fit_transform (train)
gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet
transform (val + test)
gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/test_set_20.parquet
transform (val + test)


In [29]:
# summary statistics of the transformed test split, one row per feature
test.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TRADE_PRICE,9862.0,-0.127719,1.051364,-1.271534,-0.957352,-0.4090461,0.434107,5.901716
bid_ask_size_ratio_ex,9862.0,-0.080228,0.399626,-0.134744,-0.133191,-0.128783,-0.121091,15.965639
rel_bid_size_ex,9862.0,0.036328,0.923522,-0.094811,-0.089605,-0.07146615,0.012575,80.444404
rel_ask_size_ex,9862.0,0.042042,0.66346,-0.100742,-0.093361,-0.07368009,0.028122,31.019897
depth_ex,9862.0,0.006389,0.195329,-2.013662,-0.007083,0.01622931,0.030799,2.036796
prox_ex,9862.0,0.024052,0.676975,-6.912134,-0.312554,0.02786476,0.313168,3.273189
prox_best,9862.0,0.057083,1.014078,-21.591279,-0.276596,0.02739034,0.37608,22.111119
spread_ex,9862.0,0.687985,2.303079,-0.469606,-0.335849,-0.1352128,0.533574,42.555673
spread_best,9862.0,0.414367,1.828116,-5.204529,-0.39955,-0.2206413,0.418319,34.538796
bid_ask_ratio_ex,9862.0,-0.416018,1.231967,-3.539147,-0.737489,0.1151894,0.429869,0.655601


In [30]:
# first rows of the transformed test split, one row per feature
test.head().transpose()


Unnamed: 0,43052254,41396619,43146199,39592424,41640434
TRADE_PRICE,-0.3388609,-0.3898808,-0.2901371,-1.209047,-0.9026129
bid_ask_size_ratio_ex,0.07015891,-0.1347214,-0.1302594,-0.134744,-0.1318558
rel_bid_size_ex,-0.08667558,0.1736532,-0.05350869,-0.094811,-0.02769484
rel_ask_size_ex,0.1698727,-0.09976528,-0.07067327,-0.099821,-0.06927475
depth_ex,0.07276089,-0.7880344,0.01389811,-0.154531,0.00340771
prox_ex,1.276067,-1.220335,0.02786476,-0.249513,-1.220339
prox_best,1.189691,-1.134908,0.02739034,-0.314463,-1.134914
spread_ex,-0.3358492,-0.2912632,0.4221093,-0.068334,-0.4250206
spread_best,-0.3228754,-0.2717584,0.5461111,-0.041733,-0.4506673
bid_ask_ratio_ex,0.4819378,0.4161347,-0.3254807,-3.539147,0.4723178


In [None]:
name = "ise_log_standardized"
dataset = wandb.Artifact(name=name, type="preprocessed_data")

# add a reference to each of the three splits written above
for split_file in ("train_set_60.parquet", "val_set_20.parquet", "test_set_20.parquet"):
    dataset.add_reference(
        f"gs://thesis-bucket-option-trade-classification/data/{name}/{split_file}"
    )

run.log_artifact(dataset)


<wandb.sdk.wandb_artifacts.Artifact at 0x7f3bdc88c460>

In [None]:
name = "ise_standardized"
dataset = wandb.Artifact(name=name, type="preprocessed_data")

# NOTE(review): these files are not written by the cells shown in this notebook -- confirm they exist
for split_file in ("train_set_60.parquet", "val_set_20.parquet", "test_set_20.parquet"):
    dataset.add_reference(
        f"gs://thesis-bucket-option-trade-classification/data/{name}/{split_file}"
    )

run.log_artifact(dataset)


<wandb.sdk.wandb_artifacts.Artifact at 0x7f3bdd718250>

In [None]:
# close the Weights & Biases run opened at the top of the notebook
run.finish()


## Adversarial Validation

In [45]:
# stack train and test and drop the label and the year column;
# `DataFrame.append` was removed in pandas 2.0, `pd.concat` is the drop-in replacement
X = pd.concat([train, test]).drop(columns=["buy_sell", "date_year"])
# assign zeros to train set and ones to test set
y = [0] * len(train) + [1] * len(test)

In [46]:
# features that remain for the adversarial classifier
X.columns

Index(['TRADE_PRICE', 'bid_ask_size_ratio_ex', 'rel_bid_size_ex',
       'rel_ask_size_ex', 'depth_ex', 'prox_ex', 'prox_best', 'spread_ex',
       'spread_best', 'bid_ask_ratio_ex', 'price_rel_nbo', 'price_rel_nbb',
       'chg_ex_lead', 'chg_ex_lag', 'chg_all_lead', 'chg_all_lag', 'ask_ex',
       'bid_ex', 'BEST_ASK', 'BEST_BID', 'price_all_lag', 'price_all_lead',
       'price_ex_lag', 'price_ex_lead', 'TRADE_SIZE', 'bid_size_ex',
       'ask_size_ex', 'day_vol', 'myn', 'STRK_PRC', 'mid_ex', 'mid_best',
       'ttm', 'date_month_sin', 'date_month_cos', 'date_day_sin',
       'date_day_cos', 'date_weekday_sin', 'date_weekday_cos', 'date_time_sin',
       'date_time_cos', 'bin_option_type', 'bin_issue_type', 'bin_root'],
      dtype='object')

In [47]:
# perform cv with catboost classifier
# the ordinal-encoded columns are passed to CatBoost as categorical features
cat_features = ["bin_option_type", "bin_root", "bin_issue_type"]

catboost_params = {
    "max_depth": 8,
    "task_type": "GPU",
    "cat_features": cat_features,
    "logging_level": "Silent",
}
model = CatBoostClassifier(**catboost_params)

In [48]:
# 3-fold cross-validation; the fitted estimators are kept for inspection
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=3,
    return_estimator=True,
)

In [49]:
# fit/score times, the fitted estimators, and the per-fold test scores
print(cv_results)

{'fit_time': array([134.22796702, 134.53925419, 132.22584367]), 'score_time': array([0.21852708, 0.23673105, 0.25619888]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7ff093a71d00>, <catboost.core.CatBoostClassifier object at 0x7ff09845e6d0>, <catboost.core.CatBoostClassifier object at 0x7ff093b00c40>], 'test_score': array([0.91923194, 0.92128924, 0.93180433])}


In [52]:
# feature importances of the first estimator returned by the cross-validation
cv_results["estimator"][0].get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,bin_root,24.740223
1,STRK_PRC,8.965242
2,bid_ask_ratio_ex,7.411078
3,bid_size_ex,6.054282
4,spread_ex,5.253324
5,prox_ex,5.081186
6,myn,5.022512
7,ask_size_ex,4.739555
8,spread_best,3.731043
9,bin_issue_type,2.558918
