Run on runpod.io due to memory requirements. ⚠️

In [None]:
import os
import random

from catboost import CatBoostClassifier


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_validate

from typing import List, Optional, Tuple


In [None]:
# set fixed seed
def seed_everything(seed) -> None:
    """
    Fix the random seeds used in this notebook for reproducible results.

    Seeds numpy's global generator and python's `random` module, and writes
    the seed to the `PYTHONHASHSEED` environment variable.

    Args:
        seed: seed value handed to all three seeding mechanisms.
    """
    # pandas and numpy, as discussed here:
    # https://stackoverflow.com/a/52375474/5755604
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = f"{seed!s}"


seed = 42
seed_everything(seed=seed)


# Cross-validation⛑️

In [None]:
# One independent encoder per categorical column. Categories that were not
# seen while fitting are mapped to -1.
oe_option_type, oe_root, oe_issue_type = (
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, dtype=int)
    for _ in range(3)
)


In [None]:
def transform(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Derive the model features and the target from a preprocessed trade set.

    The module-level ordinal encoders (`oe_option_type`, `oe_root`,
    `oe_issue_type`) are fitted on the first frame passed in and are only
    applied on later calls. The training set therefore has to be transformed
    before the validation and test sets.

    Args:
        data (pd.DataFrame): trades with the datetime columns `QUOTE_DATETIME`
            and `EXPIRATION`, the columns `myn`, `day_vol`, `STRK_PRC`,
            `TRADE_SIZE`, `bid_size_ex`, `ask_size_ex`, `TRADE_PRICE`,
            `ask_ex`, `bid_ex`, `BEST_ASK`, `BEST_BID`, `price_ex_lead`,
            `price_ex_lag`, `price_all_lead`, `price_all_lag`, the categorical
            columns `OPTION_TYPE`, `ROOT`, `issue_type` and the target
            `buy_sell`.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: feature frame indexed like `data`
        (infinite values replaced by NaN) and the `buy_sell` column.
    """
    quote_time = data["QUOTE_DATETIME"]
    expiration = data["EXPIRATION"]

    # date features
    x = pd.DataFrame(data={"date_year": quote_time.dt.year}, index=data.index)

    # cyclical encoding of the month of the year (must use the month, not the
    # year, otherwise the feature only repeats `date_year`)
    months_in_year = 12
    month_angle = 2 * np.pi * quote_time.dt.month / months_in_year
    x["date_month_sin"] = np.sin(month_angle)
    x["date_month_cos"] = np.cos(month_angle)

    # cyclical encoding of the time of day
    seconds_in_day = 24 * 60 * 60
    seconds = (quote_time - quote_time.dt.normalize()).dt.total_seconds()
    day_angle = 2 * np.pi * seconds / seconds_in_day
    x["date_time_sin"] = np.sin(day_angle)
    x["date_time_cos"] = np.cos(day_angle)

    # option features
    # time to maturity in whole calendar months; vectorised equivalent of the
    # difference between the monthly periods of expiration and quote
    x["ttm"] = (expiration.dt.year - quote_time.dt.year) * months_in_year + (
        expiration.dt.month - quote_time.dt.month
    )
    x[["myn", "day_vol"]] = data[["myn", "day_vol"]]
    # FIXME: make consistent later
    x["log_strk_prc"] = np.log1p(data["STRK_PRC"])

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    size_cols = ["TRADE_SIZE", "bid_size_ex", "ask_size_ex"]
    x[size_cols] = data[size_cols]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])
    x["rel_ask_ex"] = (data["TRADE_PRICE"] - mid_ex) / (data["ask_ex"] - mid_ex)
    x["rel_bid_ex"] = (mid_ex - data["TRADE_PRICE"]) / (mid_ex - data["bid_ex"])
    # NOTE(review): `BEST_rel_bid` uses the ask-side formula (cf. `rel_ask_ex`)
    # and `BEST_rel_ask` the bid-side one. The names look swapped but are kept,
    # because the feature lists refer to them.
    x["BEST_rel_bid"] = (data["TRADE_PRICE"] - mid_best) / (data["BEST_ASK"] - mid_best)
    x["BEST_rel_ask"] = (mid_best - data["TRADE_PRICE"]) / (mid_best - data["BEST_BID"])
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]

    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # untransformed quotes and prices, required by
    # `classical_features_no_transform`
    raw_price_cols = [
        "ask_ex",
        "bid_ex",
        "BEST_ASK",
        "BEST_BID",
        "price_all_lag",
        "price_all_lead",
        "price_ex_lag",
        "price_ex_lead",
    ]
    x[raw_price_cols] = data[raw_price_cols]
    x["trade_price"] = data["TRADE_PRICE"]

    # log-transformed variants of prices, quotes and sizes
    log_sources = [
        "ask_ex",
        "bid_ex",
        "BEST_ASK",
        "BEST_BID",
        "TRADE_PRICE",
        "price_all_lag",
        "price_all_lead",
        "price_ex_lag",
        "price_ex_lead",
        "TRADE_SIZE",
        "bid_size_ex",
        "ask_size_ex",
    ]
    # the derived columns use a lower-case `trade_price`; all other names are
    # taken over unchanged
    log_names = [
        "trade_price" if col == "TRADE_PRICE" else col for col in log_sources
    ]

    # log(x + 0.01)
    x[[f"log001p_{name}" for name in log_names]] = np.log(data[log_sources] + 1e-2)

    # log(x + 1)
    x[[f"log1p_{name}" for name in log_names]] = np.log1p(data[log_sources])

    # Ordinal-encode the categorical columns. An encoder without
    # `n_features_in_` has not been fitted yet, see
    # https://stackoverflow.com/questions/70727291/how-do-i-know-whether-a-sklearn-scaler-is-already-fitted-or-not
    for source_col, encoder, target_col in (
        ("OPTION_TYPE", oe_option_type, "bin_option_type"),
        ("ROOT", oe_root, "bin_root"),
        ("issue_type", oe_issue_type, "bin_issue_type"),
    ):
        values = data[source_col].astype(str).values.reshape(-1, 1)
        if not hasattr(encoder, "n_features_in_"):
            encoder.fit(values)
        # `transform` returns shape (n, 1); flatten to a single column
        x[target_col] = encoder.transform(values).ravel()

    # ratios with a zero denominator and logs of non-positive values yield
    # +/-inf; treat them as missing
    x.replace([np.inf, -np.inf], np.nan, inplace=True)

    y = data["buy_sell"]
    return x, y


In [None]:
# The training split has to go through `transform` first: the ordinal
# encoders are fitted on whichever frame is transformed first. The raw frame
# is not bound to a name, so it can be freed once the features are derived.
x_train, y_train = transform(
    pd.read_parquet(
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet"
    )
)


In [None]:
# Validation split; the raw frame is not bound to a name, so it can be freed
# once the features are derived.
x_val, y_val = transform(
    pd.read_parquet(
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet"
    )
)


In [None]:
# Test split; the raw frame is not bound to a name, so it can be freed once
# the features are derived.
x_test, y_test = transform(
    pd.read_parquet(
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet"
    )
)


In [None]:
# Building blocks shared by the feature sets below. The variants differ only
# in the prefix of the level columns ("", "log001p_" or "log1p_").
_relative_quote_features = [
    "BEST_rel_bid",
    "BEST_rel_ask",
    "rel_ask_ex",
    "rel_bid_ex",
    "bid_ask_ratio_ex",
]
_quote_levels = ["ask_ex", "bid_ex", "BEST_ASK", "BEST_BID"]
_price_changes = ["chg_ex_lag", "chg_ex_lead", "chg_all_lag", "chg_all_lead"]
_price_levels = [
    "trade_price",
    "price_all_lag",
    "price_all_lead",
    "price_ex_lag",
    "price_ex_lead",
]
_relative_size_features = [
    "bid_ask_size_ratio_ex",
    "rel_bid_size_ex",
    "rel_ask_size_ex",
]


def _classical_features(prefix: str = "") -> List[str]:
    """Return the classical feature names with `prefix` on the level columns."""
    return [
        *_relative_quote_features,
        *(prefix + name for name in _quote_levels),
        *_price_changes,
        *(prefix + name for name in _price_levels),
    ]


def _size_features(prefix: str = "") -> List[str]:
    """Return the size feature names with `prefix` on the level columns."""
    return [
        prefix + "TRADE_SIZE",
        *_relative_size_features,
        prefix + "bid_size_ex",
        prefix + "ask_size_ex",
    ]


classical_features_log001p = _classical_features("log001p_")
classical_features_log1p = _classical_features("log1p_")
classical_features_no_transform = _classical_features()

size_features_no_transform = _size_features()
size_features_log001p = _size_features("log001p_")
size_features_log1p = _size_features("log1p_")

option_features = [
    "bin_option_type",
    "bin_issue_type",
    "bin_root",
    "myn",
    "log_strk_prc",
    "ttm",
    "day_vol",
]
date_features = [
    "date_time_cos",
    "date_time_sin",
    "date_month_cos",
    "date_month_sin",
    "date_year",
]

# ordinal-encoded columns that are passed to the model as categorical
cat_features = ["bin_root", "bin_issue_type", "bin_option_type"]


In [None]:
def evaluate(
    features: List[str], cat_features: Optional[List[str]]
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Cross-validate a CatBoost classifier and score the model of every fold.

    Works on the module-level frames `x_train`, `y_train`, `x_val`, `y_val`,
    `x_test` and `y_test`. The raw `cross_validate` result is printed.

    Args:
        features (List[str]): columns of the feature frames to train on.
        cat_features (Optional[List[str]]): handed to CatBoost as
            `cat_features`.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: the cross-validation
        test scores (column `test_score_cv`), the `score` of each fold's model
        on the train, validation and test set (columns `fold`, `acc_train`,
        `acc_val`, `acc_test`), and the prettified feature importances of all
        folds side by side, prefixed with `fold_<i>_`.
    """
    params = dict(
        od_type="Iter",
        logging_level="Silent",
        depth=8,
        loss_function="Logloss",
        task_type="GPU",
        cat_features=cat_features,
        random_seed=42,
    )

    res = cross_validate(
        CatBoostClassifier(**params),
        x_train[features],
        y_train,
        cv=3,
        return_estimator=True,
    )
    results_cv = pd.DataFrame(res["test_score"], columns=["test_score_cv"])
    print(res)

    fitted_models = res["estimator"]
    splits = ((x_train, y_train), (x_val, y_val), (x_test, y_test))

    # one row per fold: fold number followed by the score on every split
    score_rows = [
        [fold, *(model.score(frame[features], target) for frame, target in splits)]
        for fold, model in enumerate(fitted_models)
    ]
    results_oos = pd.DataFrame(
        data=score_rows, columns=["fold", "acc_train", "acc_val", "acc_test"]
    )

    # importances of all folds next to each other
    results_fi = pd.concat(
        [
            model.get_feature_importance(prettified=True).add_prefix(f"fold_{fold}_")
            for fold, model in enumerate(fitted_models)
        ],
        axis=1,
    )

    return results_cv, results_oos, results_fi


In [None]:
# classical features without log transform, no categorical features
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform, cat_features=[]
)


In [None]:
results_cv


In [None]:
results_oos


In [None]:
results_fi


In [None]:
# classical features with log(x + 0.01) levels, no categorical features
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_log001p, cat_features=[]
)

In [None]:
results_cv

In [None]:
results_oos

In [None]:
results_fi

In [None]:
# classical features with log(x + 1) levels, no categorical features
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_log1p, cat_features=[]
)

In [None]:
results_cv

In [None]:
results_oos

In [None]:
results_fi

In [None]:
# classical plus size features, both without log transform
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform + size_features_no_transform,
    cat_features=[],
)


In [None]:
results_cv


In [None]:
results_oos


In [None]:
results_fi


In [None]:
# untransformed classical features plus size features with log(x + 0.01) levels
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform + size_features_log001p,
    cat_features=[],
)

In [None]:
results_cv

In [None]:
results_oos

In [None]:
results_fi

In [None]:
# untransformed classical features plus size features with log(x + 1) levels
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform + size_features_log1p,
    cat_features=[],
)

In [None]:
results_cv

In [None]:
results_oos

In [None]:
results_fi

In [None]:
# All feature groups together, with the encoded columns treated as categorical.
# This cell used to reference `classical_features` and `size_features`, which
# are not defined anywhere in this notebook (NameError).
# NOTE(review): the log1p variants are assumed here, in line with
# `log_strk_prc` in the option features — confirm the intended variants.
results_cv, results_oos, results_fi = evaluate(
    [
        *classical_features_log1p,
        *size_features_log1p,
        *date_features,
        *option_features,
    ],
    cat_features,
)


In [None]:
results_cv


In [None]:
results_oos


In [None]:
results_fi
