Run `pip install .` first to install all dependencies.

In [1]:
!pip install gcsfs==2022.10.0
!pip install wandb==0.13.4
!pip install fastparquet
!pip install numpy
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gcsfs==2022.10.0
  Downloading gcsfs-2022.10.0-py2.py3-none-any.whl (25 kB)
Collecting fsspec==2022.10.0
  Downloading fsspec-2022.10.0-py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 6.5 MB/s 
Installing collected packages: fsspec, gcsfs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2022.11.0
    Uninstalling fsspec-2022.11.0:
      Successfully uninstalled fsspec-2022.11.0
Successfully installed fsspec-2022.10.0 gcsfs-2022.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb==0.13.4
  Downloading wandb-0.13.4-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.2 MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.11

In [5]:
import gcsfs
import numpy as np
import pandas as pd

import wandb

from sklearn.preprocessing import MinMaxScaler


In [3]:
name = "classical_size_features_normalized"

# connect to weights and biases
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")

dataset = wandb.Artifact(name=name, type="preprocessed_data")


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
import google.auth
from google.colab import auth
# connect to google cloud storage
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)

In [7]:
# reduce number of imported cols due to memory issues
columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag",
    "buy_sell",
]


In [8]:
train = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet",
    engine="fastparquet",
    columns=columns,
)
val = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet",
    engine="fastparquet",
    columns=columns,
)
test = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet",
    engine="fastparquet",
    columns=columns,
)


In [9]:
# oe_option_type = OrdinalEncoder(
#     unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
# )
# oe_root = OrdinalEncoder(
#     unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
# )
# oe_issue_type = OrdinalEncoder(
#     unknown_value=-1, dtype=int, handle_unknown="use_encoded_value"
# )

min_max_scaler = MinMaxScaler(feature_range=[-1, 1])


def transform(data: pd.DataFrame) -> pd.DataFrame:

    # # date features
    x = pd.DataFrame(data={"TRADE_PRICE": data["TRADE_PRICE"]}, index=data.index)

    # x["date_month_sin"] = np.sin(2 * np.pi * data["QUOTE_DATETIME"].dt.year / 12)
    # x["date_month_cos"] = np.cos(2 * np.pi * data["QUOTE_DATETIME"].dt.year / 12)

    # seconds_in_day = 24 * 60 * 60
    # seconds = (
    #     data["QUOTE_DATETIME"] - data["QUOTE_DATETIME"].dt.normalize()
    # ).dt.total_seconds()

    # x["date_time_sin"] = np.sin(2 * np.pi * seconds / seconds_in_day)
    # x["date_time_cos"] = np.cos(2 * np.pi * seconds / seconds_in_day)

    # option features
    # x["ttm"] = (
    #     data["EXPIRATION"].dt.to_period("M") - data["QUOTE_DATETIME"].dt.to_period("M")
    # ).apply(lambda x: x.n)
    # x[["myn", "day_vol"]] = data[["myn", "day_vol"]]
    # x["log_strk_prc"] = np.log1p(data["STRK_PRC"])

    # binarize
    # "bin_OPTION_TYPE", "bin_issue_type", "bin_ROOT",

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x[["TRADE_SIZE", "bid_size_ex", "ask_size_ex"]] = data[
        ["TRADE_SIZE", "bid_size_ex", "ask_size_ex"]
    ]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])
    x["rel_ask_ex"] = (data["TRADE_PRICE"] - mid_ex) / (data["ask_ex"] - mid_ex)
    x["rel_bid_ex"] = (mid_ex - data["TRADE_PRICE"]) / (mid_ex - data["bid_ex"])
    x["BEST_rel_bid"] = (data["TRADE_PRICE"] - mid_best) / (data["BEST_ASK"] - mid_best)
    x["BEST_rel_ask"] = (mid_best - data["TRADE_PRICE"]) / (mid_best - data["BEST_BID"])
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]

    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # x[
    #     [
    #         "log_ask_ex",
    #         "log_bid_ex",
    #         "log_BEST_ASK",
    #         "log_BEST_BID",
    #         "log_trade_price",
    #         "log_price_all_lag",
    #         "log_price_all_lead",
    #         "log_price_ex_lag",
    #         "log_price_ex_lead",
    #     ]
    # ] = np.log1p(
    #     data[
    #         [
    #             "ask_ex",
    #             "bid_ex",
    #             "BEST_ASK",
    #             "BEST_BID",
    #             "TRADE_PRICE",
    #             "price_all_lag",
    #             "price_all_lead",
    #             "price_ex_lag",
    #             "price_ex_lead",
    #         ]
    #     ]
    # )

    x[
        [
            "ask_ex",
            "bid_ex",
            "BEST_ASK",
            "BEST_BID",
            "TRADE_PRICE",
            "price_all_lag",
            "price_all_lead",
            "price_ex_lag",
            "price_ex_lead",
        ]
    ] = data[
        [
            "ask_ex",
            "bid_ex",
            "BEST_ASK",
            "BEST_BID",
            "TRADE_PRICE",
            "price_all_lag",
            "price_all_lead",
            "price_ex_lag",
            "price_ex_lead",
        ]
    ]

    # impute with zeros
    x.replace([np.inf, -np.inf], np.nan, inplace=True)
    x.fillna(0, inplace=True)

    # scale to [-1, 1]
    if not hasattr(min_max_scaler, "n_features_in_"):
        min_max_scaler.fit(x)
    x[x.columns] = min_max_scaler.transform(x)

    # https://stackoverflow.com/questions/70727291/how-do-i-know-whether-a-sklearn-scaler-is-already-fitted-or-not

    # if not hasattr(oe_option_type, "n_features_in_"):
    #     oe_option_type.fit(data["OPTION_TYPE"].astype(str).values.reshape(-1, 1))
    # x["bin_option_type"] = oe_option_type.transform(
    #     data["OPTION_TYPE"].astype(str).values.reshape(-1, 1)
    # )

    # if not hasattr(oe_root, "n_features_in_"):
    #     oe_root.fit(data["ROOT"].astype(str).values.reshape(-1, 1))
    # x["bin_root"] = oe_root.transform(data["ROOT"].astype(str).values.reshape(-1, 1))

    # if not hasattr(oe_issue_type, "n_features_in_"):
    #     oe_issue_type.fit(data["issue_type"].astype(str).values.reshape(-1, 1))
    # x["bin_issue_type"] = oe_issue_type.transform(
    #     data["issue_type"].astype(str).values.reshape(-1, 1)
    # )

    # x.replace([np.inf, -np.inf], np.nan, inplace=True)

    x["buy_sell"] = data["buy_sell"]
    return x


In [10]:
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/train_set_60.parquet"
)
train = transform(train)
train.to_parquet(output_path)
dataset.add_reference(output_path)

del train

In [11]:
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/val_set_20.parquet"
)
val = transform(val)
val.to_parquet(output_path)
dataset.add_reference(output_path)

del val

In [12]:
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/test_set_20.parquet"
)
test = transform(test)
test.to_parquet(output_path)
dataset.add_reference(output_path)


[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/classical_size_features_normalized/test_set_20.parquet/test_set_20.parquet>]

In [13]:
test.describe()

Unnamed: 0,TRADE_PRICE,bid_ask_size_ratio_ex,rel_bid_size_ex,rel_ask_size_ex,TRADE_SIZE,bid_size_ex,ask_size_ex,rel_ask_ex,rel_bid_ex,BEST_rel_bid,...,chg_all_lag,ask_ex,bid_ex,BEST_ASK,BEST_BID,price_all_lag,price_all_lead,price_ex_lag,price_ex_lead,buy_sell
count,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,...,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0
mean,-0.9923103,-0.999836,-0.999829,-0.9998783,-0.9996231,-0.997836,-0.9975226,-0.4496103,0.4496103,-0.2217289,...,0.8878542,-0.9919251,-0.9926397,-0.9999457,-0.9925228,-0.9989841,-0.9998989,-0.9931,-0.9946407,-0.02805495
std,0.02859741,0.001158564,0.001354047,0.0008864031,0.002769686,0.005458831,0.005723609,0.0006287551,0.000628755,0.0007700371,...,0.0005424119,0.02890719,0.02823542,0.0001972331,0.0283823,0.003795401,0.0003817389,0.02662669,0.02121498,0.9996064
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.505756,0.40672,-0.3200005,...,0.7040394,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-0.9995017,-0.999995,-0.9999933,-0.9999936,-1.0,-0.9998,-0.99976,-0.4499189,0.4493566,-0.221964,...,0.887843,-0.9993581,-0.999632,-0.999996,-0.9995732,-0.999936,-0.999994,-0.9997216,-0.999839,-1.0
50%,-0.9981828,-0.9999808,-0.9999701,-0.9999773,-0.9999273,-0.9994,-0.99936,-0.4495944,0.4495944,-0.2217396,...,0.8878506,-0.9979431,-0.9983809,-0.9999866,-0.9983075,-0.99976,-0.9999768,-0.9985199,-0.9989419,-1.0
75%,-0.9944458,-0.9999556,-0.999864,-0.9998918,-0.9996727,-0.99826,-0.99792,-0.4493566,0.4499189,-0.2215077,...,0.88786,-0.9938585,-0.9948484,-0.99996,-0.9947016,-0.9992679,-0.9999276,-0.9951639,-0.9963082,1.0
max,1.832569,-0.8631786,-0.06186268,-0.3209091,0.7575229,-0.7259173,-0.7940379,-0.4067197,0.5057546,0.1930428,...,0.9936979,1.834282,1.829555,-0.9805971,1.83391,-0.6961096,-0.9703997,1.226488,1.002507,1.0


In [14]:
run.log_artifact(dataset)
run.finish()
