⚠️ This notebook was run on runpod.io because of its memory requirements.

In [1]:
import os
import random

from catboost import CatBoostClassifier


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_validate

from typing import List, Optional, Tuple


In [2]:
# fix all random seeds up front
def seed_everything(seed) -> None:
    """
    Make runs repeatable by seeding every random number source used here.

    Sets the `PYTHONHASHSEED` environment variable and seeds both the
    standard-library and the numpy global generators with `seed`.
    """
    # numpy's global seed also covers pandas, as discussed here:
    # https://stackoverflow.com/a/52375474/5755604
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)


seed = 42
seed_everything(seed)


# Cross-validation⛑️

In [11]:
# One ordinal encoder per categorical column. All three share the same
# settings: categories not seen during fitting are encoded as -1.
_ENCODER_KWARGS = {
    "handle_unknown": "use_encoded_value",
    "unknown_value": -1,
    "dtype": int,
}

oe_option_type = OrdinalEncoder(**_ENCODER_KWARGS)
oe_root = OrdinalEncoder(**_ENCODER_KWARGS)
oe_issue_type = OrdinalEncoder(**_ENCODER_KWARGS)


In [12]:
def _encode_categorical(encoder, column: pd.Series) -> np.ndarray:
    """
    Ordinal-encode `column`, fitting `encoder` only if it is still unfitted.

    Args:
        encoder: encoder exposing `fit` and `transform` for 2-D input
        column (pd.Series): categorical column; values are cast to `str`
    Returns:
        np.ndarray: 1-D array of codes aligned with `column`
    """
    values = column.astype(str).to_numpy().reshape(-1, 1)
    # scikit-learn estimators only gain `n_features_in_` once fitted, see
    # https://stackoverflow.com/questions/70727291/how-do-i-know-whether-a-sklearn-scaler-is-already-fitted-or-not
    if not hasattr(encoder, "n_features_in_"):
        encoder.fit(values)
    return encoder.transform(values).ravel()


def transform(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Build the feature matrix and the target from a preprocessed data set.

    The module-level encoders `oe_option_type`, `oe_root` and `oe_issue_type`
    are fitted on the first data set passed to this function and reused
    afterwards. Transform the training set first, so that categories of
    later sets which were not seen in training are encoded as unknown.

    Args:
        data (pd.DataFrame): data set with the quote, trade, size and option
            columns read below, plus the `buy_sell` target
    Returns:
        Tuple[pd.DataFrame, pd.Series]: features (indexed like `data`, with
            +/-inf replaced by NaN) and the `buy_sell` column
    """
    quote_time = data["QUOTE_DATETIME"]
    expiration = data["EXPIRATION"]

    # date features
    x = pd.DataFrame(data={"date_year": quote_time.dt.year}, index=data.index)

    # cyclical encoding of the calendar month (1..12); this previously used
    # `.dt.year`, which encoded the year modulo 12 instead of the month
    month_angle = 2 * np.pi * quote_time.dt.month / 12
    x["date_month_sin"] = np.sin(month_angle)
    x["date_month_cos"] = np.cos(month_angle)

    # cyclical encoding of the time of day
    seconds_in_day = 24 * 60 * 60
    seconds = (quote_time - quote_time.dt.normalize()).dt.total_seconds()
    day_angle = 2 * np.pi * seconds / seconds_in_day
    x["date_time_sin"] = np.sin(day_angle)
    x["date_time_cos"] = np.cos(day_angle)

    # option features
    # time to maturity in whole calendar months; vectorised equivalent of
    # subtracting the monthly periods and reading the offset's `.n`
    x["ttm"] = (expiration.dt.year - quote_time.dt.year) * 12 + (
        expiration.dt.month - quote_time.dt.month
    )
    x[["myn", "day_vol"]] = data[["myn", "day_vol"]]
    # FIXME: make consistent later
    x["log_strk_prc"] = np.log1p(data["STRK_PRC"])

    # size features
    size_cols = ["TRADE_SIZE", "bid_size_ex", "ask_size_ex"]
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x[size_cols] = data[size_cols]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])
    x["rel_ask_ex"] = (data["TRADE_PRICE"] - mid_ex) / (data["ask_ex"] - mid_ex)
    x["rel_bid_ex"] = (mid_ex - data["TRADE_PRICE"]) / (mid_ex - data["bid_ex"])
    # NOTE(review): relative to the two `*_ex` features above, the names of
    # the next two look swapped (`BEST_rel_bid` is measured against the ask).
    # Left unchanged because downstream feature lists rely on these names.
    x["BEST_rel_bid"] = (data["TRADE_PRICE"] - mid_best) / (data["BEST_ASK"] - mid_best)
    x["BEST_rel_ask"] = (mid_best - data["TRADE_PRICE"]) / (mid_best - data["BEST_BID"])
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]

    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # no transform (the size columns were already copied above)
    price_cols = [
        "ask_ex",
        "bid_ex",
        "BEST_ASK",
        "BEST_BID",
        "TRADE_PRICE",
        "price_all_lag",
        "price_all_lead",
        "price_ex_lag",
        "price_ex_lead",
    ]
    x[price_cols] = data[price_cols]

    # log-transformed copies of all raw price and size columns; the trade
    # price is the only column whose name is lower-cased in the output
    raw_cols = price_cols + size_cols
    suffixes = ["trade_price" if col == "TRADE_PRICE" else col for col in raw_cols]

    # log(x + 0.01)
    x[[f"log001p_{suffix}" for suffix in suffixes]] = np.log(data[raw_cols] + 1e-2)

    # log(x + 1)
    x[[f"log1p_{suffix}" for suffix in suffixes]] = np.log1p(data[raw_cols])

    # ordinal-encoded categorical features
    x["bin_option_type"] = _encode_categorical(oe_option_type, data["OPTION_TYPE"])
    x["bin_root"] = _encode_categorical(oe_root, data["ROOT"])
    x["bin_issue_type"] = _encode_categorical(oe_issue_type, data["issue_type"])

    # ratios with a zero denominator yield +/-inf; treat them as missing.
    # `x` is local to this function, so replacing in place is safe.
    x.replace([np.inf, -np.inf], np.nan, inplace=True)

    y = data["buy_sell"]
    return x, y


In [13]:
# Training split. Run this cell before the validation / test cells:
# `transform` fits the ordinal encoders on the first frame it receives.
# The raw frame is passed straight through so that no reference to it
# remains once the features are built.
x_train, y_train = transform(
    pd.read_parquet(
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet"
    )
)


In [14]:
# Validation split. The raw frame is passed straight through so that no
# reference to it remains once the features are built.
x_val, y_val = transform(
    pd.read_parquet(
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet"
    )
)


In [15]:
# Test split. The raw frame is passed straight through so that no
# reference to it remains once the features are built.
x_test, y_test = transform(
    pd.read_parquet(
        "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet"
    )
)


In [19]:
# Feature groups used by the experiments below. The transformed variants
# differ from the untransformed ones only by a column-name prefix, so all
# of them are generated from the same templates.
_RATIO_FEATURES = [
    "BEST_rel_bid",
    "BEST_rel_ask",
    "rel_ask_ex",
    "rel_bid_ex",
    "bid_ask_ratio_ex",
]
_CHANGE_FEATURES = ["chg_ex_lag", "chg_ex_lead", "chg_all_lag", "chg_all_lead"]


def _classical_features(prefix, trade_price):
    """
    Classical feature names; quote and price columns carry `prefix`.
    """
    quotes = [prefix + name for name in ("ask_ex", "bid_ex", "BEST_ASK", "BEST_BID")]
    prices = [
        prefix + name
        for name in (
            trade_price,
            "price_all_lag",
            "price_all_lead",
            "price_ex_lag",
            "price_ex_lead",
        )
    ]
    return [*_RATIO_FEATURES, *quotes, *_CHANGE_FEATURES, *prices]


def _size_features(prefix):
    """
    Size feature names; the three raw size columns carry `prefix`.
    """
    return [
        prefix + "TRADE_SIZE",
        "bid_ask_size_ratio_ex",
        "rel_bid_size_ex",
        "rel_ask_size_ex",
        prefix + "bid_size_ex",
        prefix + "ask_size_ex",
    ]


# the log-transformed trade price column is lower-cased, the raw one is not
classical_features_log001p = _classical_features("log001p_", "trade_price")
classical_features_log1p = _classical_features("log1p_", "trade_price")
classical_features_no_transform = _classical_features("", "TRADE_PRICE")

size_features_no_transform = _size_features("")
size_features_log001p = _size_features("log001p_")
size_features_log1p = _size_features("log1p_")

option_features = [
    "bin_option_type",
    "bin_issue_type",
    "bin_root",
    "myn",
    "log_strk_prc",
    "ttm",
    "day_vol",
]
date_features = [
    "date_time_cos",
    "date_time_sin",
    "date_month_cos",
    "date_month_sin",
    "date_year",
]

cat_features = ["bin_root", "bin_issue_type", "bin_option_type"]


In [20]:
def evaluate(
    features: List[str], cat_features: Optional[List[str]]
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Cross-validate a CatBoost classifier on a feature subset and score it.

    A 3-fold cross-validation is run on the global training set. Each of the
    fitted fold models is then scored on the complete training, validation
    and test sets, and its feature importances are collected.

    Args:
        features (List[str]): columns of `x_train` / `x_val` / `x_test` to use
        cat_features (Optional[List[str]]): passed on to CatBoost as
            `cat_features`
    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: cross-validation
            test score per fold, scores per fold model on train / val / test,
            and the prettified feature importances of all folds side by side
    """
    classifier = CatBoostClassifier(
        od_type="Iter",
        logging_level="Silent",
        depth=8,
        loss_function="Logloss",
        task_type="GPU",
        cat_features=cat_features,
        random_seed=42,
    )

    res = cross_validate(
        classifier, x_train[features], y_train, cv=3, return_estimator=True
    )
    results_cv = pd.DataFrame(res["test_score"], columns=["test_score_cv"])
    print(res)

    # evaluation sets in the column order of `results_oos`
    eval_sets = ((x_train, y_train), (x_val, y_val), (x_test, y_test))

    oos_rows = []
    importances_per_fold = []
    for fold, fold_model in enumerate(res["estimator"]):
        scores = [
            fold_model.score(frame[features], target) for frame, target in eval_sets
        ]
        oos_rows.append([fold, *scores])

        # prefix the columns so the folds can sit next to each other
        prettified = fold_model.get_feature_importance(prettified=True)
        importances_per_fold.append(prettified.add_prefix(f"fold_{fold}_"))

    results_oos = pd.DataFrame(
        data=oos_rows, columns=["fold", "acc_train", "acc_val", "acc_test"]
    )
    results_fi = pd.concat(importances_per_fold, axis=1)

    return results_cv, results_oos, results_fi


In [21]:
# Experiment: classical features with untransformed prices, no categorical features.
results_cv, results_oos, results_fi = evaluate(classical_features_no_transform, [])


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([103.63985729, 100.6778717 ,  99.92939949]), 'score_time': array([3.075737  , 2.81963086, 2.97917199]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7fe58ec34a00>, <catboost.core.CatBoostClassifier object at 0x7fe58ebfaf40>, <catboost.core.CatBoostClassifier object at 0x7fe2f7df4a30>], 'test_score': array([0.7450225 , 0.76277159, 0.75443156])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [22]:
results_cv


Unnamed: 0,test_score_cv
0,0.745023
1,0.762772
2,0.754432


In [23]:
results_oos


Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.761169,0.630902,0.622236
1,1,0.769118,0.626714,0.622822
2,2,0.768861,0.617292,0.605646


In [24]:
results_fi


Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,23.051798,bid_ask_ratio_ex,20.019596,bid_ask_ratio_ex,19.123491
1,rel_bid_ex,15.866246,rel_bid_ex,16.432112,rel_ask_ex,18.390967
2,bid_ask_ratio_ex,9.913342,rel_ask_ex,14.244694,rel_bid_ex,16.403546
3,BEST_rel_bid,8.804311,BEST_rel_ask,8.349111,BEST_rel_ask,8.93637
4,BEST_rel_ask,8.671286,BEST_rel_bid,7.690309,BEST_ASK,7.705853
5,chg_all_lead,8.120138,BEST_ASK,7.213217,BEST_rel_bid,7.331919
6,chg_all_lag,5.66345,chg_all_lead,6.418347,ask_ex,5.34488
7,price_all_lead,3.473532,ask_ex,6.089537,chg_all_lead,4.707456
8,ask_ex,3.353614,chg_all_lag,3.418635,price_all_lead,2.566109
9,BEST_ASK,2.726363,price_all_lead,2.859645,chg_all_lag,2.538605


In [25]:
# Experiment: classical features with log(x + 0.01) prices, no categorical features.
results_cv, results_oos, results_fi = evaluate(classical_features_log001p, [])

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([102.1065917 , 100.89669037,  99.91211176]), 'score_time': array([2.96243501, 2.98851562, 3.13397884]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7fe59368dfa0>, <catboost.core.CatBoostClassifier object at 0x7fe50f75c2b0>, <catboost.core.CatBoostClassifier object at 0x7fe58ec34400>], 'test_score': array([0.74480302, 0.76279924, 0.75451502])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [26]:
results_cv

Unnamed: 0,test_score_cv
0,0.744803
1,0.762799
2,0.754515


In [27]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.76107,0.630885,0.622271
1,1,0.769163,0.626817,0.622776
2,2,0.768855,0.617293,0.60559


In [28]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,23.328459,bid_ask_ratio_ex,20.108328,bid_ask_ratio_ex,19.096593
1,rel_bid_ex,15.763796,rel_bid_ex,15.661923,rel_ask_ex,18.377958
2,bid_ask_ratio_ex,9.800264,rel_ask_ex,14.825913,rel_bid_ex,16.346157
3,BEST_rel_bid,8.840885,BEST_rel_ask,8.374744,BEST_rel_ask,8.896402
4,BEST_rel_ask,8.626738,BEST_rel_bid,7.704348,log001p_BEST_ASK,7.932054
5,chg_all_lead,8.109908,log001p_BEST_ASK,7.126555,BEST_rel_bid,7.349956
6,chg_all_lag,5.733701,chg_all_lead,6.418464,log001p_ask_ex,5.401388
7,log001p_price_all_lead,3.428343,log001p_ask_ex,6.173288,chg_all_lead,4.73613
8,log001p_ask_ex,3.366611,chg_all_lag,3.326272,chg_all_lag,2.567688
9,log001p_BEST_ASK,2.805901,log001p_price_all_lead,2.808172,log001p_price_all_lead,2.554534


In [29]:
# Experiment: classical features with log(x + 1) prices, no categorical features.
results_cv, results_oos, results_fi = evaluate(classical_features_log1p, [])

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([102.33898807, 100.38531828,  99.89322662]), 'score_time': array([3.10254693, 3.07740998, 2.95147371]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7fe50f111eb0>, <catboost.core.CatBoostClassifier object at 0x7fe593696040>, <catboost.core.CatBoostClassifier object at 0x7fe593696670>], 'test_score': array([0.74502078, 0.76283604, 0.7545197 ])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [30]:
results_cv

Unnamed: 0,test_score_cv
0,0.745021
1,0.762836
2,0.75452


In [31]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.761232,0.630944,0.622462
1,1,0.769239,0.626716,0.62289
2,2,0.768866,0.61728,0.605515


In [32]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,23.181056,bid_ask_ratio_ex,20.118387,bid_ask_ratio_ex,19.145014
1,rel_bid_ex,15.838437,rel_bid_ex,16.48061,rel_ask_ex,18.425378
2,bid_ask_ratio_ex,9.980983,rel_ask_ex,14.090033,rel_bid_ex,16.253023
3,BEST_rel_bid,8.798364,BEST_rel_ask,8.321073,BEST_rel_ask,8.842401
4,BEST_rel_ask,8.581989,BEST_rel_bid,7.676901,log1p_BEST_ASK,7.683042
5,chg_all_lead,8.149603,log1p_BEST_ASK,7.147943,BEST_rel_bid,7.516563
6,chg_all_lag,5.564781,chg_all_lead,6.372419,log1p_ask_ex,5.58997
7,log1p_price_all_lead,3.448028,log1p_ask_ex,6.217005,chg_all_lead,4.750893
8,log1p_ask_ex,3.399604,chg_all_lag,3.358635,log1p_price_all_lead,2.565383
9,log1p_BEST_ASK,2.722121,log1p_price_all_lead,2.857957,chg_all_lag,2.48827


In [33]:
# Experiment: untransformed classical features plus untransformed size features.
results_cv, results_oos, results_fi = evaluate(
    [*classical_features_no_transform, *size_features_no_transform], []
)


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([104.88055992, 104.11944747, 103.26477146]), 'score_time': array([3.02144337, 3.13049698, 3.10311294]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7fe2f7dfff70>, <catboost.core.CatBoostClassifier object at 0x7fe44946bee0>, <catboost.core.CatBoostClassifier object at 0x7fe43cc4aa00>], 'test_score': array([0.8626566 , 0.85764051, 0.83637368])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [34]:
results_cv


Unnamed: 0,test_score_cv
0,0.862657
1,0.857641
2,0.836374


In [35]:
results_oos


Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.857941,0.737277,0.711587
1,1,0.860612,0.734609,0.710465
2,2,0.8616,0.724056,0.696429


In [36]:
results_fi


Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_bid_ex,13.159532,rel_bid_ex,11.114364,bid_size_ex,10.804533
1,ask_size_ex,10.927986,bid_size_ex,9.626524,ask_size_ex,10.507648
2,bid_size_ex,10.443368,bid_ask_size_ratio_ex,9.235929,bid_ask_size_ratio_ex,10.13295
3,rel_ask_ex,9.552999,ask_size_ex,9.230751,rel_bid_ex,9.864716
4,bid_ask_size_ratio_ex,9.382373,rel_bid_size_ex,8.017286,rel_ask_ex,8.326398
5,BEST_rel_bid,6.978324,bid_ask_ratio_ex,7.827213,rel_bid_size_ex,7.616884
6,rel_bid_size_ex,6.911004,rel_ask_ex,7.545111,bid_ask_ratio_ex,7.44577
7,BEST_rel_ask,6.057092,BEST_rel_ask,6.425067,TRADE_SIZE,6.430957
8,TRADE_SIZE,5.035347,BEST_rel_bid,5.329521,BEST_rel_ask,6.143584
9,rel_ask_size_ex,4.471398,rel_ask_size_ex,5.269853,BEST_rel_bid,4.854167


In [37]:
# Experiment: untransformed classical features plus log(x + 0.01) size features.
results_cv, results_oos, results_fi = evaluate(
    [*classical_features_no_transform, *size_features_log001p], []
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([104.44014788, 103.74602246, 103.5816412 ]), 'score_time': array([2.88003635, 2.97765803, 3.04343557]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7fe43cc44a00>, <catboost.core.CatBoostClassifier object at 0x7fe43cc44be0>, <catboost.core.CatBoostClassifier object at 0x7fe43cc44d30>], 'test_score': array([0.86247961, 0.85755979, 0.83645663])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [38]:
results_cv

Unnamed: 0,test_score_cv
0,0.86248
1,0.85756
2,0.836457


In [39]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.85783,0.737267,0.711643
1,1,0.860597,0.734721,0.710597
2,2,0.861631,0.724144,0.696849


In [40]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_bid_ex,13.126158,rel_bid_ex,11.565124,log001p_bid_size_ex,10.901702
1,log001p_ask_size_ex,11.086239,log001p_bid_size_ex,9.595085,log001p_ask_size_ex,10.362939
2,log001p_bid_size_ex,10.542186,bid_ask_size_ratio_ex,9.592776,bid_ask_size_ratio_ex,10.064492
3,bid_ask_size_ratio_ex,9.486739,log001p_ask_size_ex,8.842036,rel_bid_ex,9.965237
4,rel_ask_ex,9.404772,rel_bid_size_ex,8.186449,rel_ask_ex,8.290207
5,BEST_rel_bid,6.976375,bid_ask_ratio_ex,7.912502,rel_bid_size_ex,7.591723
6,rel_bid_size_ex,6.768478,rel_ask_ex,6.944232,bid_ask_ratio_ex,7.441483
7,BEST_rel_ask,6.09262,BEST_rel_bid,5.773347,log001p_TRADE_SIZE,6.464582
8,log001p_TRADE_SIZE,4.978777,BEST_rel_ask,5.71411,BEST_rel_ask,5.987011
9,rel_ask_size_ex,4.433736,rel_ask_size_ex,5.490093,BEST_rel_bid,4.891471


In [41]:
# Experiment: untransformed classical features plus log(x + 1) size features.
results_cv, results_oos, results_fi = evaluate(
    [*classical_features_no_transform, *size_features_log1p], []
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([104.5257225 , 104.08323336, 103.02355361]), 'score_time': array([2.96458697, 3.15144491, 3.10666609]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7fe43cc4acd0>, <catboost.core.CatBoostClassifier object at 0x7fe449466f10>, <catboost.core.CatBoostClassifier object at 0x7fe50e8dc070>], 'test_score': array([0.86275368, 0.85769642, 0.83634918])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [42]:
results_cv

Unnamed: 0,test_score_cv
0,0.862754
1,0.857696
2,0.836349


In [43]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.857942,0.737349,0.711696
1,1,0.860695,0.734558,0.710565
2,2,0.861583,0.724008,0.69628


In [44]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_bid_ex,13.187916,rel_bid_ex,11.061774,log1p_bid_size_ex,10.821917
1,log1p_ask_size_ex,11.111866,log1p_bid_size_ex,9.572652,log1p_ask_size_ex,10.497698
2,log1p_bid_size_ex,10.389896,log1p_ask_size_ex,9.297098,bid_ask_size_ratio_ex,10.132422
3,bid_ask_size_ratio_ex,9.462453,bid_ask_size_ratio_ex,9.208883,rel_bid_ex,9.839814
4,rel_ask_ex,9.345654,rel_bid_size_ex,8.05643,rel_ask_ex,8.373217
5,BEST_rel_bid,7.026186,bid_ask_ratio_ex,7.910342,rel_bid_size_ex,7.566445
6,rel_bid_size_ex,6.85734,rel_ask_ex,7.527939,bid_ask_ratio_ex,7.460983
7,BEST_rel_ask,6.077419,BEST_rel_ask,6.397662,log1p_TRADE_SIZE,6.395032
8,log1p_TRADE_SIZE,4.898151,BEST_rel_bid,5.404813,BEST_rel_ask,6.117919
9,rel_ask_size_ex,4.437927,rel_ask_size_ex,5.126863,BEST_rel_bid,4.879917
