<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/notes/notebooks/3.0b-feature-engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run `pip install -r requirements.txt` first to install all dependencies.

In [None]:
!pip install gcsfs==2022.10.0
!pip install wandb==0.13.4

In [None]:
import numpy as np
import pandas as pd

import wandb

from typing import List, Optional

import google.auth
from google.colab import auth

In [None]:
# connect to weights and biases; this run is tracked as a dataset-creation job
run = wandb.init(entity="fbv", project="thesis", job_type="dataset-creation")

# artifact that collects the references to the data sets written further below
dataset = wandb.Artifact("train_val_test", type="fe_log_bin_data")

In [None]:
# load the preprocessed train / validation / test splits from the bucket
train, val, test = (
    pd.read_parquet(path, engine="fastparquet")
    for path in (
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet",
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet",
    )
)


In [None]:
# stack the three splits row-wise (order: train, val, test) so that the
# preprocessing below is applied to all of them in one go
X = pd.concat((train, val, test), axis=0)

In [None]:
# isolate target: keep the label as a one-column frame and remove it from the features
y = X.loc[:, ["buy_sell"]]
X = X.drop(columns=["buy_sell"])

In [None]:
# option features: whole days between the quote timestamp and the expiration
time_until_expiration = X["EXPIRATION"].sub(X["QUOTE_DATETIME"])
X["time_to_maturity"] = time_until_expiration.dt.days

In [None]:
# Trade features: midpoint of the exchange's bid and ask quote
mid = X["ask_ex"].add(X["bid_ex"]).mul(0.5)
X["midpoint_ex"] = mid

In [None]:
# apply positional (cyclical) encoding to dates

# month of the year (1..12) mapped onto the unit circle, so that December and
# January end up next to each other. The month must be taken from `.dt.month`;
# using `.dt.year` here would encode the year instead.
months_in_year = 12
month = X["QUOTE_DATETIME"].dt.month

X["date_month_sin"] = np.sin(2 * np.pi * month / months_in_year)
X["date_month_cos"] = np.cos(2 * np.pi * month / months_in_year)

# time of day mapped onto the unit circle
seconds_in_day = 24 * 60 * 60

# seconds elapsed since midnight of the quote's day
seconds = (X["QUOTE_DATETIME"] - X["QUOTE_DATETIME"].dt.normalize()).dt.total_seconds()

X["date_time_sin"] = np.sin(2 * np.pi * seconds / seconds_in_day)
X["date_time_cos"] = np.cos(2 * np.pi * seconds / seconds_in_day)

# add year, linearly rescaled so that 2005 maps to 0 and 2017 maps to 1
X["date_year"] = (X["QUOTE_DATETIME"].dt.year - 2005) / (2017 - 2005)

In [None]:
# engineered date / time features
feature_set_date = [
    "date_month_sin", "date_month_cos",
    "date_time_sin", "date_time_cos",
    "date_year",
]

# features describing the option contract
feature_set_option = ["STRK_PRC", "ROOT", "time_to_maturity", "OPTION_TYPE"]

# features describing the trade and the surrounding quotes
feature_set_trade = [
    "TRADE_SIZE", "TRADE_PRICE",
    "price_ex_lag", "price_ex_lead",
    "bid_ex", "ask_ex",
    "bid_size_ex", "ask_size_ex",
    "midpoint_ex",
]

feature_set = feature_set_trade + feature_set_date + feature_set_option

# every column that is not part of the feature set is discarded;
# the remaining columns keep their original order
ignored_features = [col for col in X.columns if col not in feature_set]
X = X.drop(columns=ignored_features)


In [None]:
# log transform of price and size columns
log_columns = [
    "TRADE_PRICE",
    "STRK_PRC",
    "price_ex_lag",
    "price_ex_lead",
    "bid_ex",
    "ask_ex",
    "midpoint_ex",
    "TRADE_SIZE",
    "bid_size_ex",
    "ask_size_ex",
]

# log(1 + x) so that zeros map to 0 instead of -inf; `np.log1p` computes this
# directly and is more accurate than `np.log(x + 1)` for values close to zero
X[log_columns] = np.log1p(X[log_columns])


In [None]:
# binarize

# collect categorical columns e. g., option type and string columns e. g., ticker
cat_columns = list(X.select_dtypes(include=["category", "object"]).columns)
print(cat_columns)

# replace every category by its integer code, similar to Borisov et al.
# NOTE(review): codes are derived from the combined train/val/test frame
for col in cat_columns:
    codes, _ = pd.factorize(X[col])
    X[col] = codes


In [None]:
# treat inf as nan: both +inf and -inf become missing values
X = X.replace(to_replace=[np.inf, -np.inf], value=np.nan)


In [None]:
# re-attach the label so that it is split alongside the features below
X["buy_sell"] = y["buy_sell"]

In [None]:
# separate again for training scaling
#
# Slice by position rather than by index label: X was built by concatenating
# train, val and test in exactly this order, and no rows were added, removed
# or reordered since. Label-based `.loc[train.index]` gives the same result
# only if the index labels are unique across the three splits; with repeated
# labels it would pull rows from the other splits as well.
n_train, n_val, n_test = len(train), len(val), len(test)
assert len(X) == n_train + n_val + n_test, "row count changed since concatenation"
assert len(y) == len(X), "features and target are misaligned"

val_end = n_train + n_val

# `.copy()` keeps the splits independent of X / y (as `.loc` with labels did)
X_train = X.iloc[:n_train, :].copy()
X_val = X.iloc[n_train:val_end, :].copy()
X_test = X.iloc[val_end:, :].copy()

y_train = y.iloc[:n_train, :].copy()
y_val = y.iloc[n_train:val_end, :].copy()
y_test = y.iloc[val_end:, :].copy()

In [None]:
# Persist the feature-engineered training split and reference it in the artifact.
# `X_train` (engineered features incl. `buy_sell`) must be written here; the raw
# `train` frame would merely copy the unprocessed input.
output_path = "gs://thesis-bucket-option-trade-classification/data/fe_log_bin_data/train_set_60.parquet"
X_train.to_parquet(output_path)
dataset.add_reference(output_path, name="train_set_60")

In [None]:
# Persist the feature-engineered validation split and reference it in the artifact.
# `X_val` (engineered features incl. `buy_sell`) must be written here; the raw
# `val` frame would merely copy the unprocessed input.
output_path = "gs://thesis-bucket-option-trade-classification/data/fe_log_bin_data/val_set_20.parquet"
X_val.to_parquet(output_path)
dataset.add_reference(output_path, name="val_set_20")

In [None]:
# Persist the feature-engineered test split and reference it in the artifact.
# `X_test` (engineered features incl. `buy_sell`) must be written here; the raw
# `test` frame would merely copy the unprocessed input.
output_path = "gs://thesis-bucket-option-trade-classification/data/fe_log_bin_data/test_set_20.parquet"
X_test.to_parquet(output_path)
dataset.add_reference(output_path, name="test_set_20")
# NOTE(review): the artifact still has to be logged (e.g. `run.log_artifact(dataset)`)
# for the references to be stored — not visible in this part of the notebook; confirm.