In [2]:
import os
import gc

import gcsfs

import numpy as np
import numpy.typing as npt
import pandas as pd

import wandb

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PowerTransformer
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score


In [3]:
# connect to weights and biases
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin
2023-01-23 11:32:41.019380: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


2023-01-23 11:32:47.400615: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# import google.auth
# from google.colab import auth

# connect to google cloud storage
# auth.authenticate_user()
# credentials, _ = google.auth.default()
# fs = gcsfs.GCSFileSystem(project="thesis")
# fs = gcsfs.GCSFileSystem(project="thesis", credentials=credentials)




In [None]:
# reduce number of imported cols due to memory issues
columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag",
    "buy_sell",
    "day_vol",
    "myn",
]


In [None]:
train = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_ultra_60.parquet",
    engine="fastparquet",
    columns=columns,
)
val = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_ultra_20.parquet",
    engine="fastparquet",
    columns=columns,
)




In [None]:
num_features = [
    "STRK_PRC",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag",
    "day_vol",
    "myn",
]


## Yeo Johnson Test

In [None]:
yeo_johnson = PowerTransformer(method="yeo-johnson")
yeo_johnson.fit(train[num_features])


In [None]:
lambdas = pd.Series(data=yeo_johnson.lambdas_, index=num_features)
lambdas


STRK_PRC         -0.126414
TRADE_SIZE       -0.379786
TRADE_PRICE      -0.446928
BEST_BID         -0.442183
BEST_ASK         -0.442264
ask_ex           -0.441659
bid_ex           -0.441708
bid_size_ex       0.035735
ask_size_ex      -0.011599
price_all_lead   -0.447201
price_all_lag    -0.451723
price_ex_lead    -0.449795
price_ex_lag     -0.453784
day_vol          -0.210949
myn              -1.113007
dtype: float64

## Box Cox Test

In [None]:
train[num_features].min()


STRK_PRC          5.000000e-01
TRADE_SIZE        1.000000e+00
TRADE_PRICE       1.000000e-02
BEST_BID          0.000000e+00
BEST_ASK          0.000000e+00
ask_ex            0.000000e+00
bid_ex            0.000000e+00
bid_size_ex       0.000000e+00
ask_size_ex       0.000000e+00
price_all_lead    1.000000e-02
price_all_lag     1.000000e-02
price_ex_lead     1.000000e-02
price_ex_lag      1.000000e-02
day_vol           1.000000e+00
myn               3.725289e-07
dtype: float64

In [None]:
box_cox = PowerTransformer(method="box-cox")
# add constant as box cox works only on positive data
box_cox.fit(train[num_features] + 1)


In [None]:
lambdas = pd.Series(data=box_cox.lambdas_, index=num_features)
lambdas


STRK_PRC         -0.126414
TRADE_SIZE       -0.379786
TRADE_PRICE      -0.446928
BEST_BID         -0.442183
BEST_ASK         -0.442264
ask_ex           -0.441659
bid_ex           -0.441708
bid_size_ex       0.035735
ask_size_ex      -0.011599
price_all_lead   -0.447201
price_all_lag    -0.451723
price_ex_lead    -0.449795
price_ex_lag     -0.453784
day_vol          -0.210949
myn              -1.113007
dtype: float64

In [None]:
box_cox = PowerTransformer(method="box-cox")
# add constant as box cox works only on positive data
box_cox.fit(train[num_features] + 0.01)


In [None]:
lambdas = pd.Series(data=box_cox.lambdas_, index=num_features)
lambdas


STRK_PRC         -0.082968
TRADE_SIZE       -0.204893
TRADE_PRICE       0.060425
BEST_BID          0.110299
BEST_ASK          0.056837
ask_ex            0.049090
bid_ex            0.113067
bid_size_ex       0.140284
ask_size_ex       0.029876
price_all_lead    0.054975
price_all_lag     0.049097
price_ex_lead     0.053774
price_ex_lag      0.042910
day_vol          -0.116017
myn              -0.109716
dtype: float64

Use smallest possible constant for Box-Cox test. All $\lambda \approx 0 \implies \log(\cdot)$ for price, size, and quotes.

In [4]:
def sin_encode(x: pd.Series, period: int) -> npt.NDArray:
    """
    Encode a series with a sin function.

    Args:
        x (pd.Series): input series
        period (int): frequency

    Returns:
        npt.NDArray: encoded values
    """
    return np.sin(x * 2 * np.pi / period)


def cos_encode(x: pd.Series, period: int) -> npt.NDArray:
    """
    Encode a series with a sin function.

    Args:
        x (pd.Series): input series
        period (int): frequency

    Returns:
        npt.NDArray: encoded values
    """
    return np.cos(x * 2 * np.pi / period)


In [5]:
scaler = StandardScaler()
oe_option_type = OrdinalEncoder(
    unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
)
oe_root = OrdinalEncoder(
    unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
)
oe_issue_type = OrdinalEncoder(
    unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
)


def transform(data: pd.DataFrame) -> pd.DataFrame:
    """
    Create features, impute, and scale.

    Args:
        data (pd.DataFrame): input data frame.
    Returns:
        pd.DataFrame: updated data frame.
    """

    # set up df, overwrite later
    x = pd.DataFrame(data={"TRADE_PRICE": data["TRADE_PRICE"]}, index=data.index)

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x["depth_ex"] = data["bid_size_ex"] - data["ask_size_ex"]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])

    spread_ex = data["ask_ex"] - data["bid_ex"]
    spread_best = data["BEST_ASK"] - data["BEST_BID"]

    x["prox_ex"] = (data["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)
    x["prox_best"] = (data["TRADE_PRICE"] - mid_best) / (0.5 * spread_best)

    # custom features
    x["spread_ex"] = spread_ex
    x["spread_best"] = spread_best
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]
    x["price_rel_nbo"] = (data["TRADE_PRICE"] - data["BEST_ASK"]) / (
        data["BEST_ASK"] - mid_best
    )
    x["price_rel_nbb"] = (data["TRADE_PRICE"] - data["BEST_BID"]) / (
        mid_best - data["BEST_BID"]
    )

    # calculate change
    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # asks = [f"ASK_{i}" for i in range(1, 17)]
    # bids = [f"BID_{i}" for i in range(1, 17)]
    
    # log transformed features
    x[
        [
            "ask_ex",
            "bid_ex",
            "BEST_ASK",
            "BEST_BID",
            "TRADE_PRICE",
            "price_all_lag",
            "price_all_lead",
            "price_ex_lag",
            "price_ex_lead",
            "TRADE_SIZE",
            "bid_size_ex",
            "ask_size_ex",
            "day_vol",
            "myn",
            "STRK_PRC",
            # *asks,
            # *bids
        ]
    ] = np.log1p(
        data[
            [
                "ask_ex",
                "bid_ex",
                "BEST_ASK",
                "BEST_BID",
                "TRADE_PRICE",
                "price_all_lag",
                "price_all_lead",
                "price_ex_lag",
                "price_ex_lead",
                "TRADE_SIZE",
                "bid_size_ex",
                "ask_size_ex",
                "day_vol",
                "myn",
                "STRK_PRC",
                # *asks,
                # *bids
            ]
        ]
    )
    x["mid_ex"] = np.log1p(mid_ex)
    x["mid_best"] = np.log1p(mid_best)

    x["ttm"] = (
        data["EXPIRATION"].dt.to_period("M") - data["QUOTE_DATETIME"].dt.to_period("M")
    ).apply(lambda x: x.n)

    # save num columns for scaler
    num_cols = x.columns.tolist()

    # date features
    x["date_year"] = data["QUOTE_DATETIME"].dt.year

    months_in_year = 12
    x["date_month_sin"] = sin_encode(data["QUOTE_DATETIME"].dt.month, months_in_year)
    x["date_month_cos"] = cos_encode(data["QUOTE_DATETIME"].dt.month, months_in_year)

    days_in_month = 31  # at max :-)
    x["date_day_sin"] = sin_encode(data["QUOTE_DATETIME"].dt.day, days_in_month)
    x["date_day_cos"] = cos_encode(data["QUOTE_DATETIME"].dt.day, days_in_month)

    days_in_week = 7
    x["date_weekday_sin"] = sin_encode(
        data["QUOTE_DATETIME"].dt.dayofweek, days_in_week
    )
    x["date_weekday_cos"] = cos_encode(
        data["QUOTE_DATETIME"].dt.dayofweek, days_in_week
    )

    seconds_in_day = 24 * 60 * 60
    seconds = (
        data["QUOTE_DATETIME"] - data["QUOTE_DATETIME"].dt.normalize()
    ).dt.total_seconds()

    x["date_time_sin"] = sin_encode(seconds, seconds_in_day)
    x["date_time_cos"] = cos_encode(seconds, seconds_in_day)

    # impute with zeros
    x.replace([np.inf, -np.inf], np.nan, inplace=True)
    x.fillna(0, inplace=True)

    # standardize continous columns (w/o date features)
    # bin encode categorical features
    try:
        x[num_cols] = scaler.transform(x[num_cols])
        x["bin_option_type"] = oe_option_type.transform(
            data["OPTION_TYPE"].astype(str).values.reshape(-1, 1)
        )
        x["bin_issue_type"] = oe_issue_type.transform(
            data["issue_type"].astype(str).values.reshape(-1, 1)
        )
        x["bin_root"] = oe_root.transform(
            data["ROOT"].astype(str).values.reshape(-1, 1)
        )
        print("transform (val + test)")
    except NotFittedError as e:
        x[num_cols] = scaler.fit_transform(x[num_cols])
        x["bin_option_type"] = oe_option_type.fit_transform(
            data["OPTION_TYPE"].astype(str).values.reshape(-1, 1)
        )
        x["bin_issue_type"] = oe_issue_type.fit_transform(
            data["issue_type"].astype(str).values.reshape(-1, 1)
        )
        x["bin_root"] = oe_root.fit_transform(
            data["ROOT"].astype(str).values.reshape(-1, 1)
        )
        print("fit_transform (train)")

    x["buy_sell"] = data["buy_sell"]
    return x


## Write to file

In [6]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

name = "ise_log_standardized"

train = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_ultra_60.parquet",
    engine="fastparquet",
)

output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/train_set_60.parquet"
)
train = transform(train)
train.to_parquet(output_path)
del train
gc.collect()

val = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_ultra_20.parquet",
    engine="fastparquet",
)

output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/val_set_20.parquet"
)
val = transform(val)
val.to_parquet(output_path)
del val
gc.collect()

test = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_ultra_20.parquet",
    engine="fastparquet",
)

output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/test_set_20.parquet"
)
test = transform(test)
test.to_parquet(output_path)




fit_transform (train)
transform (val + test)
transform (val + test)


In [7]:
name = "ise_log_standardized"
dataset = wandb.Artifact(name=name, type="preprocessed_data")
dataset.add_reference(
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/train_set_60.parquet"
)
dataset.add_reference(
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet"
)
dataset.add_reference(
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/test_set_20.parquet"
)
run.log_artifact(dataset)




<wandb.sdk.wandb_artifacts.Artifact at 0x14d575f70250>

In [8]:
run.finish()


## Adversarial Validation
> Adversarial Validation is a technique allowing you to easily estimate the degree of difference between your training and test data. This technique was long rumored among Kaggle participants and transmitted from team to team until it emerged publicly thanks to a post by Zygmunt Zając (https://www.kaggle.com/zygmunt) on his FastML blog. (adapted from Banchawicz et. al)

In [None]:
X = train.append(test)
X.drop(columns=["buy_sell", "date_year"], inplace=True)
# assign zeros to train set and ones to test set
y = [0] * len(train) + [1] * len(test)


  X = train.append(test)


In [None]:
X.columns


Index(['TRADE_PRICE', 'bid_ask_size_ratio_ex', 'rel_bid_size_ex',
       'rel_ask_size_ex', 'depth_ex', 'prox_ex', 'prox_best', 'spread_ex',
       'spread_best', 'bid_ask_ratio_ex', 'price_rel_nbo', 'price_rel_nbb',
       'chg_ex_lead', 'chg_ex_lag', 'chg_all_lead', 'chg_all_lag', 'ask_ex',
       'bid_ex', 'BEST_ASK', 'BEST_BID', 'price_all_lag', 'price_all_lead',
       'price_ex_lag', 'price_ex_lead', 'TRADE_SIZE', 'bid_size_ex',
       'ask_size_ex', 'day_vol', 'myn', 'STRK_PRC', 'mid_ex', 'mid_best',
       'ttm', 'date_month_sin', 'date_month_cos', 'date_day_sin',
       'date_day_cos', 'date_weekday_sin', 'date_weekday_cos', 'date_time_sin',
       'date_time_cos', 'bin_option_type', 'bin_issue_type', 'bin_root'],
      dtype='object')

In [None]:
# perform cv with catboost classifier
cat_features = ["bin_option_type", "bin_root", "bin_issue_type"]

model = CatBoostClassifier(
    max_depth=4,
    task_type="CPU",
    # cat_features=cat_features,
    logging_level="Silent",
)


In [None]:
cv_results = cross_validate(model, X, y, cv=3, return_estimator=True)


In [None]:
print(cv_results)


{'fit_time': array([878.2867291 , 984.46557117, 990.86366677]), 'score_time': array([5.86397052, 4.73058748, 5.01639771]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x147fbb147b50>, <catboost.core.CatBoostClassifier object at 0x147fb88c4e80>, <catboost.core.CatBoostClassifier object at 0x147fb88caf70>], 'test_score': array([0.79481789, 0.66661843, 0.69676893])}


In [None]:
cv_results["estimator"][0].get_feature_importance(prettified=True)


Unnamed: 0,Feature Id,Importances
0,date_day_sin,24.495673
1,date_month_sin,18.708696
2,date_day_cos,15.879051
3,date_month_cos,11.996232
4,date_weekday_sin,9.701744
5,date_weekday_cos,9.24232
6,bin_root,3.893286
7,STRK_PRC,2.008321
8,spread_best,0.462539
9,bin_issue_type,0.424255
