Run this notebook on runpod.io due to its memory requirements. ⚠️

In [1]:
import os
import random

from catboost import CatBoostClassifier


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_validate

from typing import List, Optional, Tuple


In [2]:
# set fixed seed
def seed_everything(seed) -> None:
    """
    Fix the seeds of the random number sources used in this notebook.

    Args:
        seed: value written to ``PYTHONHASHSEED`` (as a string) and passed to
            ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): hash randomisation is presumably decided at interpreter
    # start-up, so this mainly affects child processes - confirm if it matters.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # pandas draws from numpy's global generator, see
    # https://stackoverflow.com/a/52375474/5755604
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# seed all random number sources once, before any data is loaded
seed = 42
seed_everything(seed=seed)


# Cross-validation⛑️

In [3]:
# One ordinal encoder per categorical column. All three share the same
# settings: integer codes, and categories that were not seen while fitting are
# mapped to -1 instead of raising.
_encoder_options = {
    "handle_unknown": "use_encoded_value",
    "unknown_value": -1,
    "dtype": int,
}
oe_option_type = OrdinalEncoder(**_encoder_options)
oe_root = OrdinalEncoder(**_encoder_options)
oe_issue_type = OrdinalEncoder(**_encoder_options)


In [4]:
# Raw quote / trade price columns that are copied unchanged and log-scaled.
_PRICE_COLUMNS = [
    "ask_ex",
    "bid_ex",
    "BEST_ASK",
    "BEST_BID",
    "TRADE_PRICE",
    "price_all_lag",
    "price_all_lead",
    "price_ex_lag",
    "price_ex_lead",
]
# Raw size columns that are copied unchanged and log-scaled.
_SIZE_COLUMNS = ["TRADE_SIZE", "bid_size_ex", "ask_size_ex"]
_RAW_COLUMNS = [*_PRICE_COLUMNS, *_SIZE_COLUMNS]
# Suffixes of the log-scaled copies, aligned position by position with
# _RAW_COLUMNS. The trade price is spelled in lower case on purpose: the
# feature lists refer to "log001p_trade_price" / "log1p_trade_price".
_LOG_SUFFIXES = [
    "ask_ex",
    "bid_ex",
    "BEST_ASK",
    "BEST_BID",
    "trade_price",
    "price_all_lag",
    "price_all_lead",
    "price_ex_lag",
    "price_ex_lead",
    "TRADE_SIZE",
    "bid_size_ex",
    "ask_size_ex",
]


def _encode_categorical(encoder, column: pd.Series) -> np.ndarray:
    """Ordinal-encode ``column`` (cast to str), fitting ``encoder`` on first use."""
    values = column.astype(str).values.reshape(-1, 1)
    # an encoder only has `n_features_in_` once it has been fitted, see
    # https://stackoverflow.com/questions/70727291/how-do-i-know-whether-a-sklearn-scaler-is-already-fitted-or-not
    if not hasattr(encoder, "n_features_in_"):
        encoder.fit(values)
    return np.asarray(encoder.transform(values)).ravel()


def transform(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Derive the feature matrix and the target from a preprocessed trade frame.

    Side effect: the module-level encoders ``oe_option_type``, ``oe_root`` and
    ``oe_issue_type`` are fitted on the first frame passed in and only applied
    on later calls, so the training set has to be transformed first.

    Args:
        data: frame holding the datetime columns ``QUOTE_DATETIME`` and
            ``EXPIRATION``, the quote / price / size columns read below, the
            categorical columns ``OPTION_TYPE``, ``ROOT``, ``issue_type`` and
            the label column ``buy_sell``.

    Returns:
        Tuple ``(x, y)``. ``x`` holds the features on the index of ``data``
        with +/-inf and NaN replaced by 0; ``y`` is ``data["buy_sell"]``.
    """
    quote_time = data["QUOTE_DATETIME"]
    expiration = data["EXPIRATION"]

    # date features
    x = pd.DataFrame(data={"date_year": quote_time.dt.year}, index=data.index)

    # cyclical encoding of the calendar month; this previously used `.dt.year`,
    # which made both columns constant within a year
    month_angle = 2 * np.pi * quote_time.dt.month / 12
    x["date_month_sin"] = np.sin(month_angle)
    x["date_month_cos"] = np.cos(month_angle)

    # cyclical encoding of the time of day
    seconds_in_day = 24 * 60 * 60
    seconds = (quote_time - quote_time.dt.normalize()).dt.total_seconds()
    day_angle = 2 * np.pi * seconds / seconds_in_day
    x["date_time_sin"] = np.sin(day_angle)
    x["date_time_cos"] = np.cos(day_angle)

    # option features
    # time to maturity in whole calendar months; vectorised equivalent of the
    # difference between the monthly periods of both timestamps
    x["ttm"] = (expiration.dt.year - quote_time.dt.year) * 12 + (
        expiration.dt.month - quote_time.dt.month
    )
    x[["myn", "day_vol"]] = data[["myn", "day_vol"]]
    # FIXME: make consistent later
    x["log_strk_prc"] = np.log1p(data["STRK_PRC"])

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x[_SIZE_COLUMNS] = data[_SIZE_COLUMNS]

    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])
    x["rel_ask_ex"] = (data["TRADE_PRICE"] - mid_ex) / (data["ask_ex"] - mid_ex)
    x["rel_bid_ex"] = (mid_ex - data["TRADE_PRICE"]) / (mid_ex - data["bid_ex"])
    # NOTE(review): compared with the *_ex pair above the two BEST_* names look
    # swapped ("BEST_rel_bid" is measured against BEST_ASK). Kept unchanged so
    # that stored results and feature importances stay comparable.
    x["BEST_rel_bid"] = (data["TRADE_PRICE"] - mid_best) / (data["BEST_ASK"] - mid_best)
    x["BEST_rel_ask"] = (mid_best - data["TRADE_PRICE"]) / (mid_best - data["BEST_BID"])
    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]

    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # no transform (the size columns have already been copied above)
    x[_PRICE_COLUMNS] = data[_PRICE_COLUMNS]

    # log(x + 0.01); the frame is assigned column by column in list order
    x[[f"log001p_{suffix}" for suffix in _LOG_SUFFIXES]] = np.log(
        data[_RAW_COLUMNS] + 1e-2
    )

    # log(x + 1)
    x[[f"log1p_{suffix}" for suffix in _LOG_SUFFIXES]] = np.log1p(data[_RAW_COLUMNS])

    # categorical features as integer codes
    x["bin_option_type"] = _encode_categorical(oe_option_type, data["OPTION_TYPE"])
    x["bin_root"] = _encode_categorical(oe_root, data["ROOT"])
    x["bin_issue_type"] = _encode_categorical(oe_issue_type, data["issue_type"])

    # divisions by zero give +/-inf and logs of negative values give NaN;
    # both are mapped to 0
    x.replace([np.inf, -np.inf], np.nan, inplace=True)
    x.fillna(0, inplace=True)
    y = data["buy_sell"]
    return x, y


In [5]:
# Training split. This is the first call to `transform`, i.e. the one on which
# the ordinal encoders get fitted.
train_raw = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet"
)
x_train, y_train = transform(train_raw)

# only the derived features are needed from here on; drop the raw frame
del train_raw




In [6]:
# Validation split. `transform` only fits an encoder that is still unfitted, so
# this relies on the training cell having run before.
val_raw = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet"
)
x_val, y_val = transform(val_raw)

# only the derived features are needed from here on; drop the raw frame
del val_raw


In [7]:
# Test split. `transform` only fits an encoder that is still unfitted, so this
# relies on the training cell having run before.
test_raw = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet"
)
x_test, y_test = transform(test_raw)

# only the derived features are needed from here on; drop the raw frame
del test_raw


In [8]:
# Building blocks shared by every variant of the feature sets below; only the
# scaling of the raw price / size columns differs between the variants.
_quote_position_features = [
    "BEST_rel_bid",
    "BEST_rel_ask",
    "rel_ask_ex",
    "rel_bid_ex",
    "bid_ask_ratio_ex",
]
_price_change_features = [
    "chg_ex_lag",
    "chg_ex_lead",
    "chg_all_lag",
    "chg_all_lead",
]
_size_ratio_features = [
    "bid_ask_size_ratio_ex",
    "rel_bid_size_ex",
    "rel_ask_size_ex",
]

# classical features, raw prices scaled with log(x + 0.01)
classical_features_log001p = [
    *_quote_position_features,
    "log001p_ask_ex",
    "log001p_bid_ex",
    "log001p_BEST_ASK",
    "log001p_BEST_BID",
    *_price_change_features,
    "log001p_trade_price",
    "log001p_price_all_lag",
    "log001p_price_all_lead",
    "log001p_price_ex_lag",
    "log001p_price_ex_lead",
]

# classical features, raw prices scaled with log(x + 1)
classical_features_log1p = [
    *_quote_position_features,
    "log1p_ask_ex",
    "log1p_bid_ex",
    "log1p_BEST_ASK",
    "log1p_BEST_BID",
    *_price_change_features,
    "log1p_trade_price",
    "log1p_price_all_lag",
    "log1p_price_all_lead",
    "log1p_price_ex_lag",
    "log1p_price_ex_lead",
]

# classical features, raw prices left untouched
classical_features_no_transform = [
    *_quote_position_features,
    "ask_ex",
    "bid_ex",
    "BEST_ASK",
    "BEST_BID",
    *_price_change_features,
    "TRADE_PRICE",
    "price_all_lag",
    "price_all_lead",
    "price_ex_lag",
    "price_ex_lead",
]

# size features, raw sizes left untouched
size_features_no_transform = [
    "TRADE_SIZE",
    *_size_ratio_features,
    "bid_size_ex",
    "ask_size_ex",
]

# size features, raw sizes scaled with log(x + 0.01)
size_features_log001p = [
    "log001p_TRADE_SIZE",
    *_size_ratio_features,
    "log001p_bid_size_ex",
    "log001p_ask_size_ex",
]

# size features, raw sizes scaled with log(x + 1)
size_features_log1p = [
    "log1p_TRADE_SIZE",
    *_size_ratio_features,
    "log1p_bid_size_ex",
    "log1p_ask_size_ex",
]



In [9]:
def evaluate(
    features: List[str], cat_features: Optional[List[str]]
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Cross-validate a CatBoost classifier on a feature subset and score each fold model.

    Reads the notebook globals ``x_train``/``y_train``, ``x_val``/``y_val`` and
    ``x_test``/``y_test``.

    Args:
        features: columns of the feature frames to train and score on.
        cat_features: forwarded to CatBoost as ``cat_features``.

    Returns:
        Tuple ``(results_cv, results_oos, results_fi)``:
        ``results_cv`` holds the ``test_score`` of every cross-validation fold,
        ``results_oos`` the score of every fold model on the complete training,
        validation and test sets (columns ``acc_train``, ``acc_val``,
        ``acc_test``), and ``results_fi`` the prettified feature importances of
        all fold models side by side, prefixed with ``fold_<i>_``.
    """
    catboost_params = dict(
        od_type="Iter",
        logging_level="Silent",
        depth=8,
        loss_function="Logloss",
        task_type="GPU",
        cat_features=cat_features,
        random_seed=42,
    )

    # 3-fold cross-validation on the training set; the fitted fold models are
    # kept so that they can be scored on the other splits below
    cv_result = cross_validate(
        CatBoostClassifier(**catboost_params),
        x_train[features],
        y_train,
        cv=3,
        return_estimator=True,
    )
    results_cv = pd.DataFrame(cv_result["test_score"], columns=["test_score_cv"])
    print(cv_result)

    splits = ((x_train, y_train), (x_val, y_val), (x_test, y_test))
    accuracy_rows = []
    importance_frames = []
    for fold, estimator in enumerate(cv_result["estimator"]):
        # one row per fold model: fold number, then its score on train / val / test
        fold_scores = [
            estimator.score(x_split[features], y_split) for x_split, y_split in splits
        ]
        accuracy_rows.append([fold, *fold_scores])

        importances = estimator.get_feature_importance(prettified=True)
        importance_frames.append(importances.add_prefix(f"fold_{fold}_"))

    results_oos = pd.DataFrame(
        data=accuracy_rows, columns=["fold", "acc_train", "acc_val", "acc_test"]
    )
    results_fi = pd.concat(importance_frames, axis=1)

    return results_cv, results_oos, results_fi


In [10]:
# Baseline: classical features on their raw scale, no categorical features.
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform, cat_features=[]
)


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([101.83080363, 100.52312589, 100.34640288]), 'score_time': array([2.9326396 , 3.06439018, 2.83942509]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7f1906ade6d0>, <catboost.core.CatBoostClassifier object at 0x7f1904f8f910>, <catboost.core.CatBoostClassifier object at 0x7f1904f8f070>], 'test_score': array([0.74423658, 0.76255363, 0.75422062])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [11]:
results_cv


Unnamed: 0,test_score_cv
0,0.744237
1,0.762554
2,0.754221


In [12]:
results_oos


Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.760781,0.630504,0.621895
1,1,0.768801,0.626294,0.622492
2,2,0.768717,0.617139,0.605304


In [13]:
results_fi


Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,22.990729,bid_ask_ratio_ex,19.897145,rel_ask_ex,19.172873
1,rel_bid_ex,15.813432,rel_ask_ex,15.884092,bid_ask_ratio_ex,19.000829
2,bid_ask_ratio_ex,9.893765,rel_bid_ex,15.421176,rel_bid_ex,16.397705
3,BEST_rel_bid,9.480421,BEST_rel_ask,7.872446,BEST_rel_bid,7.884684
4,BEST_rel_ask,8.39297,BEST_rel_bid,7.820222,BEST_rel_ask,7.689974
5,chg_all_lead,7.629985,BEST_ASK,7.106836,BEST_ASK,7.639769
6,chg_all_lag,5.351401,chg_all_lead,6.482494,ask_ex,5.098739
7,ask_ex,3.579094,ask_ex,5.556844,chg_all_lead,4.643592
8,BEST_ASK,3.299194,chg_all_lag,3.64182,chg_all_lag,2.595599
9,bid_ex,2.691742,price_all_lead,2.448944,price_all_lead,2.026251


In [14]:
# Classical features with the raw prices scaled by log(x + 0.01).
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_log001p, cat_features=[]
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([101.43279958, 100.16543198,  99.77901578]), 'score_time': array([2.73329878, 2.88876295, 3.0164969 ]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7f1904fe4d60>, <catboost.core.CatBoostClassifier object at 0x7f17b8a7ef10>, <catboost.core.CatBoostClassifier object at 0x7f191417de80>], 'test_score': array([0.7445813 , 0.76267186, 0.75444579])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [15]:
results_cv

Unnamed: 0,test_score_cv
0,0.744581
1,0.762672
2,0.754446


In [16]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.760807,0.63014,0.621108
1,1,0.768993,0.625733,0.621746
2,2,0.768844,0.61721,0.605375


In [17]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,22.51826,bid_ask_ratio_ex,20.307738,rel_ask_ex,20.051904
1,rel_bid_ex,16.590644,rel_ask_ex,16.330243,bid_ask_ratio_ex,19.261921
2,bid_ask_ratio_ex,10.08235,rel_bid_ex,14.696587,rel_bid_ex,15.164447
3,BEST_rel_bid,9.098977,BEST_rel_ask,8.074895,BEST_rel_ask,8.172676
4,BEST_rel_ask,8.888999,BEST_rel_bid,7.546045,BEST_rel_bid,7.864983
5,chg_all_lead,7.55855,chg_all_lead,6.673689,log001p_BEST_ASK,6.605035
6,chg_all_lag,5.381957,log001p_BEST_ASK,6.652097,log001p_ask_ex,6.549743
7,log001p_bid_ex,3.872086,log001p_ask_ex,6.265979,chg_all_lead,4.830204
8,log001p_ask_ex,3.610084,chg_all_lag,3.480973,log001p_bid_ex,2.624055
9,log001p_BEST_ASK,2.634967,log001p_bid_ex,3.015009,chg_all_lag,2.407413


In [18]:
# Classical features with the raw prices scaled by log(x + 1).
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_log1p, cat_features=[]
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([101.30106139, 100.45461416,  99.7327745 ]), 'score_time': array([2.8108542 , 2.97966194, 2.71104765]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7f17af81d8e0>, <catboost.core.CatBoostClassifier object at 0x7f17af81dc10>, <catboost.core.CatBoostClassifier object at 0x7f1873daa8e0>], 'test_score': array([0.74417263, 0.76275502, 0.75427917])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [19]:
results_cv

Unnamed: 0,test_score_cv
0,0.744173
1,0.762755
2,0.754279


In [20]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.760689,0.63041,0.621853
1,1,0.768939,0.626289,0.622506
2,2,0.768755,0.617367,0.605534


In [21]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,22.992115,bid_ask_ratio_ex,19.913359,rel_ask_ex,19.104817
1,rel_bid_ex,15.893097,rel_ask_ex,15.909112,bid_ask_ratio_ex,18.987904
2,bid_ask_ratio_ex,9.850483,rel_bid_ex,15.302069,rel_bid_ex,16.257441
3,BEST_rel_bid,9.511851,BEST_rel_bid,7.923958,BEST_rel_bid,7.960123
4,BEST_rel_ask,8.324598,BEST_rel_ask,7.831309,BEST_rel_ask,7.812259
5,chg_all_lead,7.633459,log1p_BEST_ASK,7.146806,log1p_BEST_ASK,7.660703
6,chg_all_lag,5.380802,chg_all_lead,6.456127,log1p_ask_ex,5.102793
7,log1p_ask_ex,3.503737,log1p_ask_ex,5.458561,chg_all_lead,4.689283
8,log1p_BEST_ASK,3.351737,chg_all_lag,3.624757,chg_all_lag,2.565721
9,log1p_bid_ex,2.677881,log1p_price_all_lead,2.426429,log1p_price_all_lead,2.030976


In [22]:
# Classical plus size features, both on their raw scale.
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform + size_features_no_transform,
    cat_features=[],
)


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([104.58673477, 103.29419017, 101.88424706]), 'score_time': array([3.14695215, 3.14843535, 3.09839463]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7f17b8a7e7f0>, <catboost.core.CatBoostClassifier object at 0x7f17c7873ee0>, <catboost.core.CatBoostClassifier object at 0x7f17af81d430>], 'test_score': array([0.86247646, 0.85755816, 0.83622475])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [23]:
results_cv


Unnamed: 0,test_score_cv
0,0.862476
1,0.857558
2,0.836225


In [24]:
results_oos


Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.857773,0.736869,0.71138
1,1,0.860659,0.734654,0.710238
2,2,0.861461,0.724263,0.696201


In [25]:
results_fi


Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,12.612576,rel_bid_ex,10.182475,rel_ask_ex,11.326076
1,ask_size_ex,11.493982,rel_ask_ex,9.384389,bid_size_ex,10.943411
2,bid_size_ex,10.144316,bid_ask_size_ratio_ex,9.135271,ask_size_ex,10.518842
3,rel_bid_ex,9.949764,ask_size_ex,8.671358,bid_ask_size_ratio_ex,9.905341
4,bid_ask_size_ratio_ex,9.533182,bid_size_ex,8.640208,rel_bid_ex,8.396384
5,BEST_rel_bid,6.963347,rel_bid_size_ex,8.229752,rel_bid_size_ex,7.526561
6,rel_bid_size_ex,6.812533,bid_ask_ratio_ex,7.792101,bid_ask_ratio_ex,7.419689
7,BEST_rel_ask,6.245576,BEST_rel_ask,6.609483,TRADE_SIZE,6.297405
8,TRADE_SIZE,4.669964,TRADE_SIZE,5.480719,BEST_rel_ask,5.368463
9,rel_ask_size_ex,4.384364,rel_ask_size_ex,5.457203,BEST_rel_bid,4.796347


In [26]:
# Classical features stay on their raw scale; only the raw size columns are
# scaled by log(x + 0.01).
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform + size_features_log001p,
    cat_features=[],
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([104.1224792 , 103.63906527, 101.74330044]), 'score_time': array([3.05360603, 3.16042089, 3.04070044]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7f17b8a7e4f0>, <catboost.core.CatBoostClassifier object at 0x7f17c786ffa0>, <catboost.core.CatBoostClassifier object at 0x7f191417dd30>], 'test_score': array([0.8623199 , 0.85761296, 0.83622373])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [27]:
results_cv

Unnamed: 0,test_score_cv
0,0.86232
1,0.857613
2,0.836224


In [28]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.857678,0.736742,0.711212
1,1,0.860714,0.734679,0.709971
2,2,0.861425,0.724213,0.695957


In [29]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,12.641688,rel_bid_ex,9.902355,rel_ask_ex,11.243654
1,log001p_ask_size_ex,11.408432,rel_ask_ex,9.740444,log001p_bid_size_ex,10.971849
2,log001p_bid_size_ex,10.191568,bid_ask_size_ratio_ex,9.133761,log001p_ask_size_ex,10.501408
3,rel_bid_ex,9.973127,log001p_ask_size_ex,8.815559,bid_ask_size_ratio_ex,9.89022
4,bid_ask_size_ratio_ex,9.544057,log001p_bid_size_ex,8.779536,rel_bid_ex,8.509962
5,BEST_rel_bid,6.874092,rel_bid_size_ex,7.997798,rel_bid_size_ex,7.508173
6,rel_bid_size_ex,6.764436,bid_ask_ratio_ex,7.746922,bid_ask_ratio_ex,7.3961
7,BEST_rel_ask,6.312749,BEST_rel_ask,6.224303,log001p_TRADE_SIZE,6.33555
8,log001p_TRADE_SIZE,4.704662,BEST_rel_bid,5.587097,BEST_rel_ask,5.386723
9,rel_ask_size_ex,4.394143,log001p_TRADE_SIZE,5.504333,BEST_rel_bid,4.762067


In [30]:
# Classical features stay on their raw scale; only the raw size columns are
# scaled by log(x + 1).
results_cv, results_oos, results_fi = evaluate(
    features=classical_features_no_transform + size_features_log1p,
    cat_features=[],
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


{'fit_time': array([104.16563058, 103.52602983, 101.98182058]), 'score_time': array([3.04467106, 2.9350853 , 3.07836866]), 'estimator': [<catboost.core.CatBoostClassifier object at 0x7f17c7873100>, <catboost.core.CatBoostClassifier object at 0x7f17c786ffd0>, <catboost.core.CatBoostClassifier object at 0x7f191417d4f0>], 'test_score': array([0.86242593, 0.8575175 , 0.8361852 ])}


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

In [31]:
results_cv

Unnamed: 0,test_score_cv
0,0.862426
1,0.857518
2,0.836185


In [32]:
results_oos

Unnamed: 0,fold,acc_train,acc_val,acc_test
0,0,0.857722,0.736836,0.711281
1,1,0.860661,0.734597,0.710053
2,2,0.861418,0.724236,0.696194


In [33]:
results_fi

Unnamed: 0,fold_0_Feature Id,fold_0_Importances,fold_1_Feature Id,fold_1_Importances,fold_2_Feature Id,fold_2_Importances
0,rel_ask_ex,12.688185,rel_bid_ex,10.082419,rel_ask_ex,11.34416
1,log1p_ask_size_ex,11.577071,rel_ask_ex,9.590845,log1p_bid_size_ex,10.966773
2,rel_bid_ex,10.145543,bid_ask_size_ratio_ex,9.150461,log1p_ask_size_ex,10.532778
3,log1p_bid_size_ex,10.110876,log1p_ask_size_ex,8.791152,bid_ask_size_ratio_ex,9.914168
4,bid_ask_size_ratio_ex,9.51841,log1p_bid_size_ex,8.758232,rel_bid_ex,8.399537
5,BEST_rel_bid,6.914585,rel_bid_size_ex,7.977102,rel_bid_size_ex,7.504351
6,rel_bid_size_ex,6.842888,bid_ask_ratio_ex,7.74828,bid_ask_ratio_ex,7.386279
7,BEST_rel_ask,6.140031,BEST_rel_ask,6.253999,log1p_TRADE_SIZE,6.301617
8,log1p_TRADE_SIZE,4.539013,BEST_rel_bid,5.620359,BEST_rel_ask,5.36086
9,rel_ask_size_ex,4.399959,log1p_TRADE_SIZE,5.494301,BEST_rel_bid,4.827735
