Run `pip install .` first to install all dependencies.

In [1]:
!pip install gcsfs==2022.10.0
!pip install google-auth==2.15.0
!pip install psutil==5.9.4
!pip install wandb
!pip install fastparquet
!pip install numpy
!pip install pandas
!pip install catboost
!pip install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gcsfs==2022.10.0
  Downloading gcsfs-2022.10.0-py2.py3-none-any.whl (25 kB)
Collecting fsspec==2022.10.0
  Downloading fsspec-2022.10.0-py3-none-any.whl (138 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.8/138.8 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fsspec, gcsfs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2022.11.0
    Uninstalling fsspec-2022.11.0:
      Successfully uninstalled fsspec-2022.11.0
Successfully installed fsspec-2022.10.0 gcsfs-2022.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting psutil==5.9.4
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.7-py2.py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.30-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.0/184.0 KB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.12.1-py2.py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.3/

In [2]:
import os

from catboost import CatBoostClassifier, Pool
import numpy as np

import gcsfs
import numpy as np
import pandas as pd

import wandb

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PowerTransformer
from sklearn.exceptions import NotFittedError

from scipy import stats

In [3]:
# connect to Weights & Biases; `run` is used at the end of the notebook to log
# the dataset artifacts and is closed with `run.finish()`
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [7]:
import google.auth
from google.colab import auth
# connect to google cloud storage: authenticate the Colab user first, then pick
# up the resulting application-default credentials
auth.authenticate_user()
credentials, _ = google.auth.default()
# hand the credentials to gcsfs explicitly rather than relying on its own lookup
# NOTE(review): `fs` is not referenced by the cells visible here (pandas reads the
# gs:// URLs directly) — confirm whether it is still needed
fs = gcsfs.GCSFileSystem(project="thesis", credentials=credentials)

In [None]:
# reduce number of imported cols due to memory issues
columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag",
    "buy_sell",
]


In [None]:
# Load the train and validation splits, restricted to `columns` to save memory.
train = pd.read_parquet(
    path="gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet",
    columns=columns,
    engine="fastparquet",
)
val = pd.read_parquet(
    path="gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet",
    columns=columns,
    engine="fastparquet",
)

In [None]:
# Numeric columns examined in the power-transform experiments below.
# Every name must be present in the loaded `train` frame.
num_features = [
    # contract / trade
    "STRK_PRC", "TRADE_SIZE", "TRADE_PRICE",
    # national best quotes
    "BEST_BID", "BEST_ASK",
    # exchange quotes and quoted sizes
    "ask_ex", "bid_ex", "bid_size_ex", "ask_size_ex",
    # surrounding trade prices
    "price_all_lead", "price_all_lag", "price_ex_lead", "price_ex_lag",
    # additional numeric columns
    "day_vol", "myn",
]

## Yeo-Johnson test

In [None]:
# estimate one Yeo-Johnson lambda per numeric feature, using the training split only
yeo_johnson = PowerTransformer(method="yeo-johnson")
yeo_johnson.fit(train[num_features])

In [None]:
# label the fitted lambdas with their feature names for display
lambdas = pd.Series(data=yeo_johnson.lambdas_, index=num_features)
lambdas

STRK_PRC         -0.126414
TRADE_SIZE       -0.379786
TRADE_PRICE      -0.446928
BEST_BID         -0.442183
BEST_ASK         -0.442264
ask_ex           -0.441659
bid_ex           -0.441708
bid_size_ex       0.035735
ask_size_ex      -0.011599
price_all_lead   -0.447201
price_all_lag    -0.451723
price_ex_lead    -0.449795
price_ex_lag     -0.453784
dtype: float64

## Box-Cox test

In [None]:
# per-feature minima: Box-Cox only works on positive data, so any zero minimum
# means the features have to be shifted before fitting
train[num_features].min()

STRK_PRC          0.50
TRADE_SIZE        1.00
TRADE_PRICE       0.01
BEST_BID          0.00
BEST_ASK          0.00
ask_ex            0.00
bid_ex            0.00
bid_size_ex       0.00
ask_size_ex       0.00
price_all_lead    0.01
price_all_lag     0.01
price_ex_lead     0.01
price_ex_lag      0.01
dtype: float64

In [None]:
box_cox = PowerTransformer(method="box-cox")
# add constant as box cox works only on positive data
# (a shift of 1 lifts the zero minima of the quote/size columns to 1)
box_cox.fit(train[num_features]+1)

In [None]:
# lambdas of the Box-Cox fit with shift 1, labelled by feature (rebinds `lambdas`)
lambdas = pd.Series(data=box_cox.lambdas_, index=num_features)
lambdas

STRK_PRC         -0.126414
TRADE_SIZE       -0.379786
TRADE_PRICE      -0.446928
BEST_BID         -0.442183
BEST_ASK         -0.442264
ask_ex           -0.441659
bid_ex           -0.441708
bid_size_ex       0.035735
ask_size_ex      -0.011599
price_all_lead   -0.447201
price_all_lag    -0.451723
price_ex_lead    -0.449795
price_ex_lag     -0.453784
dtype: float64

In [None]:
# refit with a much smaller shift (rebinds `box_cox`)
# NOTE(review): presumably meant to show how sensitive the lambdas are to the
# chosen constant — confirm
box_cox = PowerTransformer(method="box-cox")
# add constant as box cox works only on positive data
box_cox.fit(train[num_features]+0.01)

In [None]:
# lambdas of the Box-Cox fit with shift 0.01, labelled by feature (rebinds `lambdas`)
lambdas = pd.Series(data=box_cox.lambdas_, index=num_features)
lambdas

STRK_PRC         -0.082968
TRADE_SIZE       -0.204893
TRADE_PRICE       0.060425
BEST_BID          0.110299
BEST_ASK          0.056837
ask_ex            0.049090
bid_ex            0.113067
bid_size_ex       0.140284
ask_size_ex       0.029876
price_all_lead    0.054975
price_all_lag     0.049097
price_ex_lead     0.053774
price_ex_lag      0.042910
dtype: float64

## Prepare dataset

In [None]:
def sin_encode(x, period):
    """Return the sine component of a cyclical encoding.

    ``x`` is mapped onto a circle with ``period`` steps per revolution, i.e.
    ``sin(x * 2 * pi / period)``. Works on scalars and array-likes alike.
    """
    angle = x * 2 * np.pi / period
    return np.sin(angle)

def cos_encode(x, period):
    """Return the cosine component of a cyclical encoding.

    ``x`` is mapped onto a circle with ``period`` steps per revolution, i.e.
    ``cos(x * 2 * pi / period)``. Works on scalars and array-likes alike.
    """
    angle = x * 2 * np.pi / period
    return np.cos(angle)


In [None]:
# Estimators shared by every call to `transform`: they are fitted by the first
# call and only applied afterwards, so re-run this cell to reset them.
scaler = StandardScaler()

# One independent encoder per categorical column; categories that were not
# seen while fitting are encoded as -1.
oe_option_type, oe_root, oe_issue_type = (
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, dtype=int)
    for _ in range(3)
)


def transform(data: pd.DataFrame) -> pd.DataFrame:
    """Build the standardized feature frame for one data split.

    Derives size, quote-proximity, spread and price-change features, adds
    log1p-transformed raw columns, months to expiration and cyclical date
    features, replaces infinite/missing values with 0, standardizes the
    continuous columns and ordinal-encodes the categorical columns.

    The module-level ``scaler``, ``oe_option_type``, ``oe_issue_type`` and
    ``oe_root`` are fitted by the first call (when ``scaler`` is still
    unfitted) and only applied by later calls. Call it with the training split
    first, then with validation and test.

    Args:
        data: Split containing the quote, size and price columns used below,
            ``day_vol``, ``myn``, ``STRK_PRC``, datetime-typed ``EXPIRATION``
            and ``QUOTE_DATETIME``, the categorical ``OPTION_TYPE``,
            ``issue_type`` and ``ROOT`` columns, and the label ``buy_sell``.

    Returns:
        A new frame indexed like ``data`` holding the features followed by
        ``buy_sell``.

    Raises:
        NotFittedError: If ``scaler`` is already fitted but one of the ordinal
            encoders is not. The estimators are then out of sync and must be
            re-created rather than silently refitted on scaled data.
    """
    # raw columns that enter the feature set log1p-transformed
    log_cols = [
        "ask_ex",
        "bid_ex",
        "BEST_ASK",
        "BEST_BID",
        "TRADE_PRICE",
        "price_all_lag",
        "price_all_lead",
        "price_ex_lag",
        "price_ex_lead",
        "TRADE_SIZE",
        "bid_size_ex",
        "ask_size_ex",
        "day_vol",
        "myn",
        "STRK_PRC",
    ]
    # (output column, encoder, source column) in output order
    categorical = (
        ("bin_option_type", oe_option_type, "OPTION_TYPE"),
        ("bin_issue_type", oe_issue_type, "issue_type"),
        ("bin_root", oe_root, "ROOT"),
    )
    quote_time = data["QUOTE_DATETIME"]

    # set up df, overwrite later
    x = pd.DataFrame(data={"TRADE_PRICE": data["TRADE_PRICE"]}, index=data.index)

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x["depth_ex"] = data["bid_size_ex"] - data["ask_size_ex"]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])
    spread_ex = data["ask_ex"] - data["bid_ex"]
    spread_best = data["BEST_ASK"] - data["BEST_BID"]

    x["prox_ex"] = (data["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex)
    x["prox_best"] = (data["TRADE_PRICE"] - mid_best) / (0.5 * spread_best)

    # custom features
    x["spread_ex"] = spread_ex
    x["spread_best"] = spread_best
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]
    x["price_rel_nbo"] = (data["TRADE_PRICE"] - data["BEST_ASK"]) / (
        data["BEST_ASK"] - mid_best
    )
    x["price_rel_nbb"] = (data["TRADE_PRICE"] - data["BEST_BID"]) / (
        mid_best - data["BEST_BID"]
    )

    # calculate change
    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # log transformed features (replaces the raw TRADE_PRICE placeholder as well)
    x[log_cols] = np.log1p(data[log_cols])

    # time to maturity in whole calendar months; vectorized equivalent of the
    # difference of monthly periods, without a Python-level loop over all rows
    expiration = data["EXPIRATION"]
    x["ttm"] = 12 * (expiration.dt.year - quote_time.dt.year) + (
        expiration.dt.month - quote_time.dt.month
    )

    # save num columns for scaler
    num_cols = x.columns.tolist()

    # date features
    x["date_year"] = quote_time.dt.year

    months_in_year = 12
    x["date_month_sin"] = sin_encode(quote_time.dt.month, months_in_year)
    x["date_month_cos"] = cos_encode(quote_time.dt.month, months_in_year)

    days_in_week = 7
    x["date_weekday_sin"] = sin_encode(quote_time.dt.dayofweek, days_in_week)
    x["date_weekday_cos"] = cos_encode(quote_time.dt.dayofweek, days_in_week)

    seconds_in_day = 24 * 60 * 60
    seconds = (quote_time - quote_time.dt.normalize()).dt.total_seconds()

    x["date_time_sin"] = sin_encode(seconds, seconds_in_day)
    x["date_time_cos"] = cos_encode(seconds, seconds_in_day)

    # impute with zeros (in place on the local frame to avoid an extra copy)
    x.replace([np.inf, -np.inf], np.nan, inplace=True)
    x.fillna(0, inplace=True)

    # standardize continuous columns (w/o date features)
    # The scaler alone decides between "fit" and "apply"; the result is only
    # written back once it is known, so a failure can never leave half-scaled
    # data behind that is then used for fitting.
    try:
        scaled = scaler.transform(x[num_cols])
        fit_estimators = False
    except NotFittedError:
        scaled = scaler.fit_transform(x[num_cols])
        fit_estimators = True
    x[num_cols] = scaled

    # bin encode categorical features
    for target, encoder, source in categorical:
        values = data[source].astype(str).values.reshape(-1, 1)
        encode = encoder.fit_transform if fit_estimators else encoder.transform
        x[target] = encode(values).ravel()

    if fit_estimators:
        print("fit_transform (train)")
    else:
        print("transform (val + test)")

    x["buy_sell"] = data["buy_sell"]
    return x


## Write to file

In [None]:
# Load all three splits with every column; this rebinds `train` and `val`,
# which were previously loaded with a restricted column set.
train = pd.read_parquet(
    path="gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet",
    engine="fastparquet",
)
val = pd.read_parquet(
    path="gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet",
    engine="fastparquet",
)
test = pd.read_parquet(
    path="gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet",
    engine="fastparquet",
)

In [None]:
# Transform each split and write it to the bucket.
# NOTE: this cell is order-sensitive and not re-runnable as is: `transform` fits
# the shared scaler/encoders on its first call, so the training split has to go
# first, and `train`/`val` are deleted afterwards to free memory.
# NOTE(review): presumably read by the GCS client to select the project — confirm
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

# sub-directory of the bucket that receives this variant of the dataset
name = "ise_log_standardized"

output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/train_set_60.parquet"
)
print(output_path)
# first call: fits scaler and encoders on the training split
train = transform(train)
train.to_parquet(output_path)

# release the training frame before processing the next split
del train
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/val_set_20.parquet"
)
print(output_path)
# later calls only apply the already fitted scaler and encoders
val = transform(val)
val.to_parquet(output_path)

del val
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/test_set_20.parquet"
)
print(output_path)
# `test` is kept (now holding the transformed frame) for the inspection cells below
test = transform(test)
test.to_parquet(output_path)

gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/train_set_60.parquet
fit_transform (train)
gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet
transform (val + test)
gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/test_set_20.parquet
transform (val + test)


In [None]:
# sanity check: summary statistics of the transformed test split, one row per feature
test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TRADE_PRICE,9861576.0,-0.12492,1.054377,-1.267551,-0.954576,-0.4035726,0.413707,6.884178
bid_ask_size_ratio_ex,9861576.0,-0.051557,0.19024,-0.07849,-0.077669,-0.07533702,-0.071192,22.388001
rel_bid_size_ex,9861576.0,0.0596,1.307457,-0.105558,-0.099123,-0.0767048,0.025725,905.752842
rel_ask_size_ex,9861576.0,0.061663,1.240542,-0.108601,-0.099577,-0.07679313,0.042863,950.295506
depth_ex,9861576.0,-0.004279,0.216,-6.603437,-0.020569,0.005155498,0.022519,8.811897
prox_ex,9861576.0,0.011459,0.617637,-55.140693,-0.29164,0.02707271,0.260716,42.14333
prox_best,9861576.0,0.036721,0.924123,-117.899155,-0.245383,0.02392271,0.302207,497.804916
spread_ex,9861576.0,0.755049,2.50318,-292.713818,-0.371467,-0.1390803,0.627795,903.263839
spread_best,9861576.0,0.000607,0.00562,-0.252352,-0.001908,-0.001365446,0.000571,0.205466
bid_ask_ratio_ex,9861576.0,-0.423227,1.233198,-3.523268,-0.733201,0.1120943,0.43146,5.893207


In [None]:
# first five transformed test rows, transposed so each feature is a row
test.head().T

Unnamed: 0,39342171,39342172,39342173,39342174,39342175
TRADE_PRICE,-0.826642,1.069965,2.386463,-0.006925,-0.206911
bid_ask_size_ratio_ex,-0.069235,-0.075206,-0.075206,-0.075206,-0.075206
rel_bid_size_ex,-0.084384,0.222651,0.222651,-0.072737,-0.072737
rel_ask_size_ex,-0.050769,0.209473,0.209473,-0.076793,-0.076793
depth_ex,0.018661,0.005799,0.005799,0.005799,0.005799
prox_ex,-1.111186,-0.200578,-1.048144,1.165332,-1.111186
prox_best,-1.019643,-0.18479,-0.961846,1.763199,-1.019643
spread_ex,-0.301751,0.790465,7.064904,0.441886,0.093306
spread_best,-0.00152,0.00212,0.023036,0.000184,-0.000203
bid_ask_ratio_ex,0.103818,0.379402,0.237446,-0.082186,0.125281


In [8]:
# Register the log-standardized splits as a W&B artifact; the files stay in the
# bucket and are tracked by reference only.
name = "ise_log_standardized"
dataset = wandb.Artifact(name=name, type="preprocessed_data")
for split_uri in (
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/train_set_60.parquet",
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet",
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/test_set_20.parquet",
):
    dataset.add_reference(split_uri)
run.log_artifact(dataset)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f3bdc88c460>

In [9]:
# Register the standardized (non-log) splits as a second W&B artifact, again by
# reference only.
# NOTE(review): these files are not written by the cells visible in this
# notebook — confirm they already exist in the bucket.
name = "ise_standardized"
dataset = wandb.Artifact(name=name, type="preprocessed_data")
for split_uri in (
    "gs://thesis-bucket-option-trade-classification/data/ise_standardized/train_set_60.parquet",
    "gs://thesis-bucket-option-trade-classification/data/ise_standardized/val_set_20.parquet",
    "gs://thesis-bucket-option-trade-classification/data/ise_standardized/test_set_20.parquet",
):
    dataset.add_reference(split_uri)
run.log_artifact(dataset)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f3bdd718250>

In [11]:
# close the W&B run opened at the top of the notebook
run.finish()