In [55]:
import json
import os
import pickle
from pathlib import Path

import gcsfs
import numpy as np
import pandas as pd
import wandb
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score

In [31]:
# Column names of the "classical" feature group.
features_classical = [
    "TRADE_PRICE", "bid_ex", "ask_ex", "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead", "price_all_lag", "price_all_lead",
    "chg_ex_lead", "chg_ex_lag", "chg_all_lead", "chg_all_lag",
    "prox_ex", "prox_best",
]

# Column names of the "size" feature group.
features_size = [
    "bid_ask_size_ratio_ex", "rel_bid_size_ex", "rel_ask_size_ex",
    "TRADE_SIZE", "bid_size_ex", "ask_size_ex", "depth_ex",
]

# Columns requested from the parquet files: both feature groups followed by
# `buy_sell`, which the loading cell splits off as the target.
columns = features_classical + features_size + ["buy_sell"]

In [32]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

# labelled data; the unlabelled artifact uses the same name with
# "supervised" replaced by "unsupervised".
dataset = "fbv/thesis/ise_supervised_log_standardized:latest"
unlabelled_dataset = dataset.replace("supervised", "unsupervised")

run = wandb.init(project="thesis", entity="fbv")

artifact_labelled = run.use_artifact(dataset)
artifact_dir_labelled = artifact_labelled.download()

artifact_unlabelled = run.use_artifact(unlabelled_dataset)
artifact_dir_unlabelled = artifact_unlabelled.download()


def load_train_set(artifact_dir):
    """Read `train_set.parquet` from `artifact_dir` and split off the target.

    Only the columns listed in the module-level `columns` are read.

    Args:
        artifact_dir: directory a W&B artifact was downloaded to.

    Returns:
        Tuple `(features, target)`: the frame without the `buy_sell` column
        and the `buy_sell` column as a series.
    """
    frame = pd.read_parquet(Path(artifact_dir, "train_set.parquet"), columns=columns)
    # `pop` removes the column from the freshly read frame and returns it, so
    # no frame that an earlier cell displayed is mutated.
    target = frame.pop("buy_sell")
    return frame, target


# unlabelled data
x_train_unlabelled, y_train_unlabelled = load_train_set(artifact_dir_unlabelled)

# labelled data
x_train_labelled, y_train_labelled = load_train_set(artifact_dir_labelled)


[34m[1mwandb[0m: Downloading large artifact ise_supervised_log_standardized:latest, 5414.39MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact ise_unsupervised_log_standardized:latest, 1374.53MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:25.4


In [33]:
# Display the labelled feature frame (pandas truncates rows and columns).
x_train_labelled

Unnamed: 0_level_0,TRADE_PRICE,bid_ex,ask_ex,BEST_ASK,BEST_BID,price_ex_lag,price_ex_lead,price_all_lag,price_all_lead,chg_ex_lead,...,chg_all_lag,prox_ex,prox_best,bid_ask_size_ratio_ex,rel_bid_size_ex,rel_ask_size_ex,TRADE_SIZE,bid_size_ex,ask_size_ex,depth_ex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.075434,-0.087708,-0.069536,-0.078667,-0.103398,-0.002769,-0.050296,-0.115317,-0.103454,0.047950,...,0.051327,0.601985,0.556092,-0.056979,-0.080504,0.050436,0.478473,0.337472,-0.750214,0.077184
1,0.435948,-1.223989,-1.288634,0.394075,0.391067,0.512701,0.650744,0.473783,0.481490,-0.279372,...,-0.032141,0.027207,2.151443,-0.078490,-0.105558,-0.108600,0.478473,-2.212652,-2.475971,0.005799
2,1.419895,1.436811,1.424212,1.420280,1.427649,1.456726,1.539712,1.420566,1.490898,-0.279372,...,0.068020,-0.355982,-0.330215,-0.075206,-0.050857,-0.055588,1.894671,0.767981,0.759048,0.005799
3,-1.081623,-1.223989,-1.048194,-1.060732,-1.244545,-1.072063,-1.035663,-1.115611,-1.096692,0.009441,...,0.017940,0.716941,0.662449,-0.078490,-0.105558,-0.071615,0.478473,-2.212652,0.055485,-0.049509
4,-1.037590,-0.985846,-0.888270,-0.938194,-0.963346,-0.899164,-0.864847,-0.942207,-0.924511,-0.048321,...,-0.032141,-1.122351,-2.102823,-0.050867,-0.104091,-0.096643,0.824408,2.027528,0.920231,1.907473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29510315,-1.127530,-1.074832,-1.101197,-1.132246,-1.094749,-1.062727,-1.026440,-1.106247,-1.087394,-0.013664,...,-0.002092,-1.122351,-1.039256,-0.073058,0.222651,0.417444,1.758366,-0.236296,-0.607759,0.016731
29510316,-0.771314,-0.749411,-0.782200,-0.793812,-0.767935,-0.637977,-0.774127,-0.680256,-0.870226,0.040248,...,-0.038818,1.176762,1.087873,-0.076848,-0.072737,-0.092697,-1.095431,-0.960311,-0.750214,-0.000632
29510317,-1.054988,-1.020555,-1.039609,-1.052116,-1.040240,-0.939860,-0.990324,-0.983021,-1.059976,0.001740,...,-0.022125,-0.202704,-0.188403,-0.076375,-0.104593,-0.107998,-1.095431,0.833145,1.078677,-0.115106
29510318,-1.118191,-1.065592,-1.092181,-1.113920,-1.085470,-1.072063,-1.035663,-1.115611,-1.096692,-0.005962,...,0.004585,-1.122351,-1.039257,-0.073642,0.222651,0.360937,1.464354,-0.402614,-0.723844,0.012230


In [42]:
# Inspect the index of the unlabelled features: per the output it is
# float-valued, contains repeated values and is named `index_labelled`.
x_train_unlabelled.index

Float64Index([27248602.333333332, 27248602.666666668,         27248603.2,
                      27248603.4,         27248603.6,         27248603.8,
                      27248603.0, 27248600.666666668, 27248600.333333332,
                    27248599.625,
              ...
              29510317.833333332,         29510319.0,         29510319.0,
                      29510319.0,         29510319.0,         29510319.0,
                      29510319.0,         29510319.0,         29510319.0,
                      29510319.0],
             dtype='float64', name='index_labelled', length=13069819)

In [45]:
# The target series was taken from the same frame, so it carries the same
# float index as `x_train_unlabelled` (compare the two outputs).
y_train_unlabelled.index

Float64Index([27248602.333333332, 27248602.666666668,         27248603.2,
                      27248603.4,         27248603.6,         27248603.8,
                      27248603.0, 27248600.666666668, 27248600.333333332,
                    27248599.625,
              ...
              29510317.833333332,         29510319.0,         29510319.0,
                      29510319.0,         29510319.0,         29510319.0,
                      29510319.0,         29510319.0,         29510319.0,
                      29510319.0],
             dtype='float64', name='index_labelled', length=13069819)

In [35]:
# Peek at the first labels; the output shows int8 values 1 and -1.
y_train_labelled.head()

index
0    1
1    1
2   -1
3    1
4   -1
Name: buy_sell, dtype: int8

In [46]:
# Stack labelled rows first, then unlabelled rows. The labelled frames use an
# integer index while the unlabelled ones use a float index with repeated
# values (see the index outputs above), so the combined index is not unique.
# NOTE(review): no sorting or re-indexing happens here — confirm that row
# order / index alignment does not matter downstream.
x_train = pd.concat([x_train_labelled, x_train_unlabelled])
y_train = pd.concat([y_train_labelled, y_train_unlabelled])

In [13]:
# Keyword arguments forwarded unchanged to the CatBoost classifier.
# NOTE(review): `early_stopping_rounds` presumably needs an `eval_set` in
# `fit` to have any effect, and the later `clf.fit(...)` call passes none —
# confirm.
kwargs_cat = dict(
    iterations=2000,
    grow_policy="Lossguide",
    border_count=254,
    logging_level="Silent",
    task_type="GPU",
    random_seed=42,
    eval_metric="Accuracy",
    early_stopping_rounds=100,
)

# callback only works for CPU, thus removed. See: https://bit.ly/3FjiuFx
clf = CatBoostClassifier(**kwargs_cat)

In [14]:
# Fit on the labelled training data only; no evaluation set is passed.
clf.fit(x_train_labelled,y_train_labelled)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


<catboost.core.CatBoostClassifier at 0x1472e90c54c0>

In [15]:
# Class probabilities for the unlabelled rows from the classifier that was
# fitted on the labelled rows.
probas = clf.predict_proba(x_train_unlabelled)

In [16]:
# Inspect the predicted probabilities (one row per sample, two columns).
probas

array([[0.30712112, 0.69287888],
       [0.44133104, 0.55866896],
       [0.63892883, 0.36107117],
       ...,
       [0.91906896, 0.08093104],
       [0.92493636, 0.07506364],
       [0.89514812, 0.10485188]])

In [26]:
# Two toy frames with the same values: `foo` has an integer index, `bar` a
# float index. Used below to see how pandas concatenates and sorts them.
toy_values = [[1, 2], [3, 4]]
foo = pd.DataFrame(toy_values, index=[8, 9])
bar = pd.DataFrame(toy_values, index=[7.1, 10.1])

In [27]:
# Show the integer-indexed toy frame.
foo

Unnamed: 0,0,1
8,1,2
9,3,4


In [28]:
# Show the float-indexed toy frame.
bar

Unnamed: 0,0,1
7.1,1,2
10.1,3,4


In [29]:
# Concatenate the integer-indexed and float-indexed toy frames.
mix = pd.concat([foo, bar])

# Sorting interleaves the rows of both frames by index value; the output
# shows the integer labels rendered as floats (8.0, 9.0).
mix.sort_index()

Unnamed: 0,0,1
7.1,1,2
8.0,1,2
9.0,3,4
10.1,3,4


In [None]:


# https://github.com/KarelZe/thesis/blob/main/notebooks/
# 3.0a-mb-explanatory_data_analysis.ipynb
# Presumably (column name, cardinality) pairs — confirm against the notebook
# linked above.
features_categorical = [("bin_root", 8667), ("bin_option_type", 2), ("bin_issue_type", 6)]

# Column names of the "classical" feature group.
features_classical = [
    "TRADE_PRICE", "bid_ex", "ask_ex", "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead", "price_all_lag", "price_all_lead",
    "chg_ex_lead", "chg_ex_lag", "chg_all_lead", "chg_all_lag",
    "prox_ex", "prox_best",
]

# Column names of the "size" feature group.
features_size = [
    "bid_ask_size_ratio_ex", "rel_bid_size_ex", "rel_ask_size_ex",
    "TRADE_SIZE", "bid_size_ex", "ask_size_ex", "depth_ex",
]

# Both groups combined, classical features first.
features_classical_size = features_classical + features_size


In [None]:
# Project id exported for the rest of the session; presumably picked up by
# the Google Cloud client libraries — confirm.
os.environ.update(GCLOUD_PROJECT="flowing-mantis-239216")


In [None]:
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
run = wandb.init(project="thesis", entity="fbv")

# data set artifact -> local directory
dataset = "fbv/thesis/ise_log_standardized:v2"
data_dir = run.use_artifact(dataset).download()

# optuna study artifact -> local directory
study = "fbv/thesis/xl3n4thc.optuna:v99"
study_dir = run.use_artifact(study).download()

# model artifact -> local directory; the file name inside the artifact is the
# artifact name without any "<entity>/<project>/" prefix and ":<version>" suffix
model = "xl3n4thc_CatBoostClassifier_default.cbm:v9"
model_name = model.rsplit("/", 1)[-1].partition(":")[0]

artifact = run.use_artifact(model)
model_dir = artifact.download()


In [None]:
# Restore the trained classifier from the downloaded model artifact.
model = CatBoostClassifier()
model.load_model(fname=Path(model_dir, model_name))

# Copy the parameters of the loaded model; uncomment the override below to
# switch the device to CPU if no GPU is available.
params = model.get_params()
# params["task_type"] = "CPU"

# Fresh, unfitted classifier with the same parameters, used for retraining.
model_for_refit = CatBoostClassifier(**params)

## Accuracy with retraining🎯

In [None]:
# Read only the model features and the target instead of every column in the
# parquet files; the remaining columns were discarded right after loading.
columns_to_load = [*features_classical_size, "buy_sell"]

X_train = pd.read_parquet(
    Path(data_dir, "train_set_60.parquet"),
    engine="fastparquet",
    columns=columns_to_load,
)
y_train = X_train["buy_sell"]
# Re-select to drop the target and pin the feature column order.
X_train = X_train[features_classical_size]

X_val = pd.read_parquet(
    Path(data_dir, "val_set_20.parquet"),
    engine="fastparquet",
    columns=columns_to_load,
)
y_val = X_val["buy_sell"]
X_val = X_val[features_classical_size]

# Retraining uses training and validation rows together, training rows first.
X_retrain = pd.concat([X_train, X_val])
# Drop the per-split names once they are merged.
del X_train, X_val

y_retrain = pd.concat([y_train, y_val])
del y_train, y_val

In [None]:
n_samples = len(y_retrain)

# per-row sample weights, geometrically spaced from 0.001 (first row) to 1
# (last row)
weight = np.geomspace(0.001, 1, num=n_samples)
# keep ordering of data
timestamp = np.linspace(0, 1, n_samples)

# save to pool for faster memory access
retrain_pool = Pool(
    data=X_retrain,
    label=y_retrain,
    cat_features=None,
    weight=weight,
    timestamp=timestamp,
)

# refit the unfitted copy of the model on the weighted pool
model_for_refit.fit(retrain_pool, verbose=False)

In [None]:
# Read only the model features and the target instead of every column in the
# parquet file; the remaining columns were discarded right after loading.
X_test = pd.read_parquet(
    Path(data_dir, "test_set_20.parquet"),
    engine="fastparquet",
    columns=[*features_classical_size, "buy_sell"],
)
y_test = X_test["buy_sell"]
# Re-select to drop the target and pin the feature column order.
X_test = X_test[features_classical_size]

In [None]:
# Score the refitted model on the test split and print the result (reported
# as accuracy, per the section heading above).
acc = model_for_refit.score(X_test, y_test)
print(acc)

In [None]:
# without retraining: 0.7232624886732101
# with retraining (unweighted): 0.7294596725716052
# with retraining (weighted): 0.7393542370915156