In [None]:
!pip install --user catboost==1.1
!pip install --user gcsfs
!pip install --user ipywidgets==8.0.2
!pip install --user  numpy==1.23.4
!pip install --user pandas==1.5.1
!pip install --user scikit-learn==1.1.3
!pip install --user fastparquet
!jupyter nbextension enable --py widgetsnbextension

In [None]:
from catboost import CatBoostClassifier, Pool, metrics

import numpy as np
import pandas as pd

import os
from pathlib import Path

from sklearn.metrics import accuracy_score

from typing import List, Optional


In [None]:
# Columns selected from the parquet splits for the "classical size" feature set.
# The order is significant: it fixes the column order of the frames built from
# this list. "buy_sell" is listed here too; the loading cell uses it as the
# label and drops it from the feature matrix.
features_classical_size = [
    "buy_sell",
    "TRADE_PRICE", "bid_ask_size_ratio_ex", "rel_bid_size_ex", "rel_ask_size_ex",
    "TRADE_SIZE", "bid_size_ex", "ask_size_ex",
    "rel_ask_ex", "rel_bid_ex", "BEST_rel_bid", "BEST_rel_ask",
    "bid_ask_ratio_ex",
    "chg_ex_lead", "chg_ex_lag", "chg_all_lead", "chg_all_lag",
    "ask_ex", "bid_ex", "BEST_ASK", "BEST_BID",
    "price_all_lag", "price_all_lead", "price_ex_lag", "price_ex_lead",
]


In [None]:
def _load_split(path, columns=features_classical_size, label="buy_sell"):
    """Read one parquet split and return ``(features, labels)``.

    Args:
        path: Location of the parquet file to read.
        columns: Column names to read from the file; must contain ``label``.
            The feature frame keeps the order given here.
        label: Name of the target column.

    Returns:
        A tuple ``(features, labels)``. ``features`` is a DataFrame with every
        entry of ``columns`` except ``label``. ``labels`` is the ``label``
        column with all negative values replaced by 0.
    """
    # Read only the columns that are used instead of the whole file.
    frame = pd.read_parquet(path, engine="pyarrow", columns=list(columns))
    # clip() returns a new Series, so the loaded frame is never written to
    # (the previous masked assignment mutated a Series taken from the frame).
    labels = frame[label].clip(lower=0)
    # Explicit selection guarantees the feature order follows `columns`.
    features = frame[[col for col in columns if col != label]]
    return features, labels


X_train, y_train = _load_split(
    "../artifacts/classical_size_features_log_normalized:v0/train_set_60.parquet"
)
X_val, y_val = _load_split(
    "../artifacts/classical_size_features_log_normalized:v0/val_set_20.parquet"
)

# No categorical features are declared for either pool.
train_pool = Pool(data=X_train, label=y_train, cat_features=[])
val_pool = Pool(data=X_val, label=y_val, cat_features=[])


## CatBoost Baseline 🐈‍⬛

In [None]:
# Keyword arguments forwarded unchanged to CatBoostClassifier.
# NOTE(review): the high-precision float values look like the output of a
# hyperparameter search — confirm their provenance.
params = dict(
    learning_rate=0.39676050770529875,
    depth=7,
    l2_leaf_reg=22,
    random_strength=1.6063676259174492e-09,
    bagging_temperature=0.9699098521619943,
    grow_policy="SymmetricTree",
    iterations=10000,
    task_type="GPU",
    border_count=254,
    eval_metric=metrics.Accuracy(),
    random_seed=42,
    logging_level="Silent",
    early_stopping_rounds=50,
)

# Train on the training pool, using the validation pool as the eval set.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=val_pool, plot=True)


In [None]:
# Score the fitted model on the validation pool and print the result.
# NOTE(review): for a CatBoostClassifier, score() is expected to be accuracy
# (hence the name `acc`) — confirm against the CatBoost documentation.
acc = model.score(val_pool)
print(acc)

In [None]:
# Evaluate the Accuracy metric of the fitted model on the validation data and
# show the plot. Reuses the existing val_pool (built from X_val / y_val)
# instead of constructing a second, identical Pool.
eval_metrics = model.eval_metrics(val_pool, [metrics.Accuracy()], plot=True)