In [None]:
import os
import sys

import gcsfs
import numpy as np
import pandas as pd
import wandb
from catboost import CatBoostClassifier, Pool
from numpy.testing import assert_almost_equal
from pandas._testing.asserters import assert_almost_equal
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from tqdm.auto import tqdm

sys.path.append("..")


In [None]:
# Run configuration: which exchange is processed and for which learning strategy.
EXCHANGE = "cboe"  # alternative: "ise"
STRATEGY = "transfer"  # alternative: "supervised"
# number of partial files: 50 for "ise", 38 for every other exchange
max_i = {"ise": 50}.get(EXCHANGE, 38)


In [None]:
# Start a weights and biases run and prepare the artifact that collects
# references to the data sets written further below.
run = wandb.init(
    project="thesis",
    job_type="dataset-creation",
    entity="fbv",
)
artifact_name = f"{EXCHANGE}_{STRATEGY}_raw"
dataset = wandb.Artifact(name=artifact_name, type="preprocessed_data")


In [None]:
# Set the GCLOUD_PROJECT environment variable for this process.
os.environ.update({"GCLOUD_PROJECT": "flowing-mantis-239216"})

# NOTE(review): the file system handle uses project="thesis", which differs from the
# project id exported above -- confirm which of the two is intended.
fs = gcsfs.GCSFileSystem(project="thesis")


In [None]:
# Partial files are prefixed "unmatched" for the unsupervised strategy, "matched" otherwise.
match_kind = "unmatched" if STRATEGY == "unsupervised" else "matched"
files = [
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{match_kind}_{EXCHANGE}_quotes_min_mem_usage_extended_part_{i:04d}.parquet"
    for i in range(max_i)
]

# Columns to load. The ASK_1..ASK_16 / BID_1..BID_16 columns are deliberately left out:
# asks = [f"ASK_{i}" for i in range(1, 17)]
# bids = [f"BID_{i}" for i in range(1, 17)]
columns = [
    # trade / contract identification
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    # quotes
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    # remaining columns
    "price_all_lead",
    "price_all_lag",
    "optionid",
    "day_vol",
    "price_ex_lead",
    "price_ex_lag",
    "issue_type",
    "myn",
    # *asks,
    # *bids,
    # target
    "buy_sell",
]

# Read the partial files one by one (with progress bar) and stack them into one frame.
dfs = []
for gc_file in tqdm(files):
    dfs.append(pd.read_parquet(gc_file, columns=columns))
df = pd.concat(dfs)

# drop the list of partial frames, only the concatenated frame is used from here on
del dfs


In [None]:
# total memory footprint of the frame, summed over all columns (deep introspection)
df.memory_usage(deep=True).sum()


In [None]:
# first rows, transposed so that every loaded column is shown as a row
df.head().T


In [None]:
# number of rows in the concatenated frame
len(df)


In [None]:
# Sanity check of the loaded data against some stats from sub panel A.1 in Grauer et al.

# no of observations
stats_n = len(df)

# trade size
stats_trade_size = df["TRADE_SIZE"].agg(["mean", "median", "std"])

# time to maturity in days (computed for inspection, not asserted below)
days_to_maturity = (df["EXPIRATION"] - df["QUOTE_DATETIME"]).dt.days
stats_time_to_maturity = days_to_maturity.agg(["mean", "median", "std"])

# trades where the trade size equals the quote size at the bid / at the ask
stats_trades_with_quote_size_bid = df["bid_size_ex"].eq(df["TRADE_SIZE"])
stats_trades_with_quote_size_ask = df["ask_size_ex"].eq(df["TRADE_SIZE"])

# share of trades matching the quote size on either side
matches_either_side = (
    stats_trades_with_quote_size_bid | stats_trades_with_quote_size_ask
)
stats_trade_with_quote_size = matches_either_side.sum() / stats_n

# share of buys
stats_buy_trades = df["buy_sell"].ge(0).sum() / stats_n

# Reference values per exchange; identical for the supervised and the transfer strategy.
expected_stats = {
    "ise": {
        "n": 49203747,
        "quote_size": 0.2281,
        "trade_size": [13.62, 4.0, 77.75],
        "buys": 0.4746,
    },
    "cboe": {
        "n": 37155412,
        "quote_size": 0.1397,
        "trade_size": [18.14, 5.0, 223.24],
        "buys": 0.4500,
    },
}

# NOTE: the `atol` keyword relies on the pandas variant of `assert_almost_equal`,
# which is imported after (and therefore shadows) the numpy one.
if STRATEGY in ("supervised", "transfer") and EXCHANGE in expected_stats:
    expected = expected_stats[EXCHANGE]
    assert stats_n == expected["n"]
    assert_almost_equal(stats_trade_with_quote_size, expected["quote_size"], atol=0.01)
    assert_almost_equal(
        stats_trade_size.values.tolist(), expected["trade_size"], atol=0.1
    )
    assert_almost_equal(stats_buy_trades, expected["buys"], atol=0.01)


## train-test-split ⚗️

In [None]:
# order all trades chronologically (in place) before the date-based splits are taken
df.sort_values(by="QUOTE_DATETIME", inplace=True)


In [None]:
# indices
#
# Boolean masks over `df` that select the rows of each split by quote time.
# Exactly one branch applies per (EXCHANGE, STRATEGY) combination. Combinations
# that need a split but have no date ranges configured fail here with a clear
# message, instead of surfacing later as a NameError (or silently reusing masks
# left over from an earlier run in the same kernel).

if EXCHANGE == "ise" and STRATEGY == "supervised":
    train_range = df.QUOTE_DATETIME.between(
        "2005-05-02 00:00:01", "2013-10-24 23:59:00"
    )
    val_range = df.QUOTE_DATETIME.between("2013-10-25 00:00:01", "2015-11-05 23:59:00")
    test_range = df.QUOTE_DATETIME.between("2015-11-06 00:00:01", "2017-05-31 23:59:00")

elif EXCHANGE == "cboe" and STRATEGY == "supervised":
    train_range = df.QUOTE_DATETIME.between(
        "2011-01-01 00:00:01", "2015-06-15 23:59:00"
    )
    val_range = df.QUOTE_DATETIME.between("2015-06-16 00:00:01", "2016-10-12 23:59:00")
    test_range = df.QUOTE_DATETIME.between("2016-10-13 00:00:01", "2017-10-31 23:59:00")

elif EXCHANGE == "cboe" and STRATEGY == "transfer":
    # use everything after *ISE* validation set for transfer learning
    test_range = df.QUOTE_DATETIME.between("2015-11-06 00:00:01", "2017-10-31 23:59:00")

elif STRATEGY in ("supervised", "transfer"):
    # these strategies are split below and therefore require date ranges;
    # other strategies are not split and intentionally fall through
    raise ValueError(
        f"No split ranges defined for EXCHANGE={EXCHANGE!r} and STRATEGY={STRATEGY!r}."
    )


In [None]:
# Materialise the splits for the chosen strategy, report their share of all rows and
# collect them as (name, frame) pairs; writing happens in one loop at the end.
splits = []

if STRATEGY == "supervised":

    train = df[train_range]
    len_train = len(train)
    print(f"train ratio: {len_train / len(df)}")

    val = df[val_range]
    len_val = len(val)
    print(f"val ratio: {len_val / len(df)}")

    test = df[test_range]
    len_test = len(test)
    print(f"test ratio: {len_test / len(df)}")

    # the three splits must partition the data set: no row lost, none counted twice
    assert len_train + len_val + len_test == len(df)

    splits = [("train", train), ("val", val), ("test", test)]

elif STRATEGY == "transfer":
    # transfer learning only needs a test set
    test = df[test_range]
    len_test = len(test)
    print(f"test ratio: {len_test / len(df)}")

    splits = [("test", test)]

# Write every split to the bucket and register the file as a reference in the artifact.
for split_name, split_frame in splits:
    output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{EXCHANGE}_{STRATEGY}_{split_name}.parquet"
    split_frame.to_parquet(output_path)
    dataset.add_reference(output_path, name=f"{split_name}_set")


In [None]:
# Log the artifact to save it as an output of this run
# (it carries the references added when the splits were written).
run.log_artifact(dataset)

# close the weights and biases run
wandb.finish()


## Relevant length of dataset ⏲️

In [None]:
# Load the preprocessed training and validation set from the bucket.
ise_dir = "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized"

train = pd.read_parquet(f"{ise_dir}/train_set.parquet", engine="fastparquet")
val = pd.read_parquet(f"{ise_dir}/val_set.parquet", engine="fastparquet")


In [None]:
# Load the `val_set_20` variant of the validation set.
# NOTE(review): this rebinds `val`, replacing the frame read from `val_set.parquet`
# in the previous cell -- confirm that both reads are needed.
val = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet",
    engine="fastparquet",
)


In [None]:
# separate features and target of the training set
X_train = train.drop(columns="buy_sell")
y_train = train["buy_sell"]


In [None]:
# separate features and target of the validation set
X_val = val.drop(columns="buy_sell")
y_val = val["buy_sell"]


In [None]:
# peek at the first rows of the training features
X_train.head()


In [None]:
# Learning curves: fit a gradient-boosting model with default parameters on the most
# recent 10 %, 20 %, ..., 100 % of the training set, once with uniform and once with
# exponentially increasing sample weights, and record train / validation accuracy.
results_p = []
percentages = np.linspace(0.1, 1, 10)

# Model parameters are identical for every fit, hence defined once outside the loops.
kwargs_cat = {
    # "iterations": 1000,
    # "grow_policy": "symmetric",
    # "border_count": 254,
    "logging_level": "Silent",
    "task_type": "GPU",
    "random_seed": 42,
    "eval_metric": "Accuracy",
    # "early_stopping_rounds": 100,
}

# Always evaluate on the *complete* validation set. Slicing it by the training length
# (as done previously) scored small training fractions on a different, smaller
# validation set, so accuracies were not comparable across the curve.
val_pool = Pool(data=X_val, label=y_val)

n_train = len(y_train)
# exponentially increasing weights over the full training set; the slice taken below
# keeps the weight of each observation fixed regardless of the fraction used
exponential_weight = np.geomspace(0.001, 1, num=n_train)

# go back from last element
for p in tqdm(percentages):
    # keep ordering of data: use the last `length` observations
    length = int(n_train * p)
    # positional start index; unlike `iloc[-length:]` this also selects nothing
    # (instead of everything) should `length` ever be 0
    start = n_train - length
    timestamp = np.linspace(0, 1, length)

    for strategy in ["uniform", "exponential"]:

        if strategy == "uniform":
            weight = np.ones(length)
        else:
            weight = exponential_weight[start:]

        train_pool = Pool(
            data=X_train.iloc[start:],
            label=y_train.iloc[start:],
            # cat_features=cat_features,
            weight=weight,
            timestamp=timestamp,
        )

        clf = CatBoostClassifier(**kwargs_cat)
        clf.fit(
            train_pool,
            eval_set=val_pool,
        )

        train_acc = clf.score(train_pool)
        val_acc = clf.score(val_pool)

        res = {
            "start": -length,
            "end": -1,
            "train_acc": train_acc,
            "val_acc": val_acc,
            "strategy": strategy,
        }
        print(res)
        results_p.append(res)


In [None]:
# one row per (training fraction, weighting strategy) combination
results_df = pd.DataFrame(results_p)


In [None]:
# show the collected learning-curve results
results_df


In [None]:
# persist the learning-curve results next to the notebook
results_df.to_csv("learning_curves_gbm_default_params.csv")


## Time consistency

Check if features maintain their predictive power over time, by training on the first $10~\%$ of the training set and predicting the last $10~\%$, feature by feature. Remove or further investigate features whose accuracy is just above or below $0.5$. Technique found in this [notebook](https://www.kaggle.com/code/cdeotte/xgb-fraud-with-magic-0-9600/notebook).

In [None]:
# load the training set used for the time-consistency check
data = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/classical_size_features_log_normalized/train_set_extended_60.parquet",
    engine="fastparquet",
)


In [None]:
# Try to predict the last 10 % of the training set with a model fitted on its
# first 10 %, one feature at a time. Accuracy should be above 50 %.
# `pop` returns the target column and removes it from `data` in place.
label = data.pop("buy_sell")


In [None]:
# Number of rows in one tenth of the data set. It is computed once and reused for all
# four slices: the former `-len(data) // 10` parsed as `(-len(data)) // 10`, which
# floors away from zero and made the test slice one row longer than the train slice
# whenever the length is not a multiple of ten.
n_tenth = len(data) // 10

# first tenth -> train
y_train = label.iloc[:n_tenth]
X_train = data.iloc[:n_tenth, :]

# last tenth -> test; the explicit start index also yields an empty slice (rather
# than the whole frame) if `n_tenth` is 0
y_test = label.iloc[len(label) - n_tenth :]
X_test = data.iloc[len(data) - n_tenth :, :]

# drop the names of the full series / frame, only the slices are used from here on
del label, data


In [None]:
# number of training observations (first tenth of the data set)
y_train.shape


In [None]:
# Parameters shared by all single-feature models of the time-consistency check.
params = dict(
    od_type="Iter",
    logging_level="Silent",
    loss_function="Logloss",
    task_type="GPU",
    cat_features=None,
    random_seed=42,
    eval_metric="Accuracy",
    iterations=1000,
    early_stopping_rounds=100,
)


In [None]:
# features to test one by one (rebinds `columns`, which earlier held the columns to load)
columns = X_train.columns


In [None]:
# Fit one model per feature on the train slice and record its accuracy on the test
# slice. The test slice also serves as the eval set during fitting.
results = []
for col in tqdm(columns):
    # single-column frames holding only the feature under test
    feature_train = X_train[[col]]
    feature_test = X_test[[col]]

    model = CatBoostClassifier(**params)
    model.fit(feature_train, y_train, eval_set=(feature_test, y_test))

    acc = model.score(feature_test, y_test)
    results.append([col, acc])


In [None]:
# one row per feature; displayed in ascending order of accuracy (least predictive first)
results_df = pd.DataFrame(results, columns=["feature", "accuracy"])
results_df.sort_values(by="accuracy")


Few features are actually consistent over time and more informative than a random guess. These include features related to the proximity of the quote and the relative bid size.

Some features like `chg_ex_lead` are hard to exclude. It is better to weight observations instead, as suggested in `3.0c-feature-engineering.ipynb`.