Run `pip install -r requirements.txt` first to install all dependencies.

In [1]:
!pip install gcsfs==2022.10.0
!pip install wandb==0.13.4
!pip install fastparquet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import gcsfs

import numpy as np
import pandas as pd

import wandb

from typing import List, Optional

import google.auth
from google.colab import auth

In [3]:
# open a weights and biases run that tracks the creation of this dataset
run = wandb.init(
    project="thesis",
    job_type="dataset-creation",
    entity="fbv",
)

# artifact to which the written train / val / test files are attached by reference
dataset = wandb.Artifact(
    name="train_val_test_w_trade_size",
    description="train, val, and test set w/o imputation, w/o log, w binning, and additional trade size features.",
    type="preprocessed_data",
)

[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# authenticate the colab user, then reuse the resulting default credentials
# to open a file system handle for google cloud storage
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(
    project="thesis",
    token=credentials,
)

In [5]:
# only a subset of the stored columns is read, because loading all of them
# caused memory issues
columns = [
    "QUOTE_DATETIME", "ROOT", "EXPIRATION", "STRK_PRC", "OPTION_TYPE",
    "TRADE_SIZE", "TRADE_PRICE",
    "BEST_BID", "BEST_ASK",
    "ask_ex", "bid_ex",
    "bid_size_ex", "ask_size_ex",
    "price_all_lead", "price_all_lag",
    "price_ex_lead", "price_ex_lag",
    # target, split off again right after loading
    "buy_sell",
]

In [6]:
# read the three preprocessed splits one after another, each restricted to the
# columns selected above; the order of the paths fixes train / val / test
train, val, test = (
    pd.read_parquet(split_path, engine="fastparquet", columns=columns)
    for split_path in (
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet",
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet",
    )
)


In [7]:
# stack train, val, and test (in this order) into one frame, so that all
# feature engineering steps below are applied to every split alike
X = pd.concat((train, val, test), axis=0)

In [8]:
# keep the target as a single-column frame and remove it from the features
y = X.loc[:, ["buy_sell"]]
X = X.drop(columns=["buy_sell"])

In [9]:
# option features: whole days between the quote timestamp and the expiration
X["time_to_maturity"] = X["EXPIRATION"].sub(X["QUOTE_DATETIME"]).dt.days

In [10]:
# Trade features: midpoint of the *_ex ask and bid, and the signed distance
# of the trade price from that midpoint
mid = (X["ask_ex"] + X["bid_ex"]) * 0.5
X["midpoint_ex"] = mid
X["dis_mid_ex"] = X["TRADE_PRICE"].sub(mid)

In [11]:
# Trade size features: trade size relative to the size quoted at the bid / ask
# NOTE(review): a quoted size of zero yields inf or nan here -- confirm that
# downstream steps cope with that
X["rel_bid_size_ex"] = X["TRADE_SIZE"].div(X["bid_size_ex"])
X["rel_ask_size_ex"] = X["TRADE_SIZE"].div(X["ask_size_ex"])
# NOTE(review): computed as bid size minus ask size, although the column name
# reads ask minus bid -- confirm the intended sign
X["diff_ask_bid_size_ex"] = X["bid_size_ex"].sub(X["ask_size_ex"])

In [12]:
# apply positional encoding to dates

# month of the year (1-12) mapped onto a circle with period 12, so that
# december and january end up next to each other. The month has to be taken
# from `.dt.month`; using the year here would encode a 12-year cycle instead.
months_in_year = 12

X["date_month_sin"] = np.sin(2 * np.pi * X["QUOTE_DATETIME"].dt.month / months_in_year)
X["date_month_cos"] = np.cos(2 * np.pi * X["QUOTE_DATETIME"].dt.month / months_in_year)

# time of day mapped onto a circle with a period of one day
seconds_in_day = 24 * 60 * 60

# seconds elapsed since midnight of the same day
seconds = (X["QUOTE_DATETIME"] - X["QUOTE_DATETIME"].dt.normalize()).dt.total_seconds()

X["date_time_sin"] = np.sin(2 * np.pi * seconds / seconds_in_day)
X["date_time_cos"] = np.cos(2 * np.pi * seconds / seconds_in_day)

# add year, rescaled so that 2005 maps to 0 and 2017 maps to 1
# NOTE(review): presumably the data set spans 2005-2017 -- confirm
X["date_year"] = (X["QUOTE_DATETIME"].dt.year - 2005) / (2017 - 2005)

In [13]:
# features derived from the quote timestamp
feature_set_date = [
    "date_month_sin", "date_month_cos",
    "date_time_sin", "date_time_cos",
    "date_year",
]

# features describing the option
feature_set_option = ["STRK_PRC", "ROOT", "time_to_maturity", "OPTION_TYPE"]

# features describing the trade and the surrounding quotes, including the
# derived midpoint and trade size features
feature_set_trade = [
    "TRADE_SIZE", "TRADE_PRICE",
    "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead",
    "price_all_lag", "price_all_lead",
    "bid_ex", "ask_ex",
    "bid_size_ex", "ask_size_ex",
    "midpoint_ex", "dis_mid_ex",
    "rel_bid_size_ex", "rel_ask_size_ex",
    "diff_ask_bid_size_ex",
]

# final feature set: trade features first, then date, then option features
feature_set = feature_set_trade + feature_set_date + feature_set_option
# everything that is not part of the feature set is removed from X in place
ignored_features = [col for col in X.columns if col not in feature_set]
X.drop(ignored_features, axis="columns", inplace=True)


In [14]:
# log transform (disabled for this data set, see the commented-out line below)
# columns the transform would be applied to
log_columns = [
    "TRADE_PRICE", "STRK_PRC",
    "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead",
    "price_all_lag", "price_all_lead",
    "bid_ex", "ask_ex",
    "midpoint_ex",
    "TRADE_SIZE",
    "bid_size_ex", "ask_size_ex",
]

# + 1 to prevent inf
# X[log_columns] = np.log(X[log_columns] + 1)


In [15]:
# binarize

# pick up categorical columns (e. g., option type) and string columns (e. g., ticker)
cat_columns = list(X.select_dtypes(include=["category", "object"]).columns)
print(cat_columns)

# similar to Borisov et al., every distinct value of a column is replaced by
# the integer code that pd.factorize assigns to it
X[cat_columns] = X[cat_columns].apply(lambda column: pd.factorize(column)[0])


['ROOT', 'OPTION_TYPE']


In [16]:
# treat inf as nan; useful when the log transform is enabled
# X.replace([np.inf, -np.inf], np.nan, inplace=True)

In [17]:
# re-attach the target, which was split off as a single-column frame before
# the feature engineering, so that it is stored together with the features
X["buy_sell"] = y

In [18]:
# separate again for training scaling
X_train = X.loc[train.index, :]
X_val = X.loc[val.index, :]
X_test = X.loc[test.index, :]

In [19]:
# preview the first rows of the test split, transposed so that the features are listed as rows
X_test.head(n=5).transpose()

Unnamed: 0,39342171,39342172,39342173,39342174,39342175
ROOT,4728.0,8258.0,8258.0,6110.0,6110.0
STRK_PRC,22.0,230.0,260.0,115.0,117.0
OPTION_TYPE,1.0,0.0,0.0,0.0,0.0
TRADE_SIZE,2.0,1.0,1.0,1.0,1.0
TRADE_PRICE,0.52,7.82,28.889999,2.25,1.7
BEST_BID,0.52,7.6,28.799999,1.85,1.7
BEST_ASK,0.6,8.15,32.049999,2.15,1.95
ask_ex,0.6,8.15,32.049999,2.25,1.95
bid_ex,0.52,7.6,28.799999,1.85,1.7
bid_size_ex,31.0,1.0,1.0,10.0,10.0


In [20]:
# write the training split to cloud storage and attach the file to the artifact by reference
output_path = "gs://thesis-bucket-option-trade-classification/data/fe_w_trade_size_data/train_set_60.parquet"
X_train.to_parquet(path=output_path)
dataset.add_reference(output_path)

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/fe_w_trade_size_data/train_set_60.parquet/train_set_60.parquet>]

In [21]:
# write the validation split to cloud storage and attach the file to the artifact by reference
output_path = "gs://thesis-bucket-option-trade-classification/data/fe_w_trade_size_data/val_set_20.parquet"
X_val.to_parquet(path=output_path)
dataset.add_reference(output_path)

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/fe_w_trade_size_data/val_set_20.parquet/val_set_20.parquet>]

In [22]:
# write the test split to cloud storage and attach the file to the artifact by reference
output_path = "gs://thesis-bucket-option-trade-classification/data/fe_w_trade_size_data/test_set_20.parquet"
X_test.to_parquet(path=output_path)
dataset.add_reference(output_path)

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/fe_w_trade_size_data/test_set_20.parquet/test_set_20.parquet>]

In [23]:
# log the artifact (holding the references added above) to the run, then
# close the run; the order matters, as the artifact is logged through `run`
run.log_artifact(dataset)
run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…