In [2]:
import json
import os
import pickle
from pathlib import Path

import gcsfs
import google.auth
import numpy as np
import optuna
import pandas as pd
import wandb
from catboost import CatBoostClassifier, Pool
from google.colab import auth, output
from sklearn.metrics import accuracy_score


In [3]:
# Authenticate the Colab user, then open a Google Cloud Storage filesystem
# handle with the resulting default credentials.
auth.authenticate_user()
# google.auth.default() yields a pair; only its first item is used here.
credentials = google.auth.default()[0]
fs = gcsfs.GCSFileSystem(token=credentials, project="thesis")


In [4]:


# Feature-set definitions, see
# https://github.com/KarelZe/thesis/blob/main/notebooks/
# 3.0a-mb-explanatory_data_analysis.ipynb

# (column name, integer) pairs. The integer is presumably the number of
# distinct categories in that column -- TODO confirm against the linked notebook.
features_categorical = [("bin_root", 8667), ("bin_option_type", 2), ("bin_issue_type", 6)]

# fmt: off
features_classical = [
    "TRADE_PRICE", "bid_ex", "ask_ex", "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead", "price_all_lag", "price_all_lead",
    "chg_ex_lead", "chg_ex_lag", "chg_all_lead", "chg_all_lag",
    "prox_ex", "prox_best",
]

features_size = [
    "bid_ask_size_ratio_ex", "rel_bid_size_ex", "rel_ask_size_ex",
    "TRADE_SIZE", "bid_size_ex", "ask_size_ex", "depth_ex",
]
# fmt: on

# Classical columns followed by the size columns, built as a new list so the
# two source lists stay independent.
features_classical_size = features_classical + features_size


In [5]:
# Set the GCLOUD_PROJECT environment variable for this process.
GCLOUD_PROJECT_ID = "flowing-mantis-239216"
os.environ["GCLOUD_PROJECT"] = GCLOUD_PROJECT_ID


In [7]:
# Start a W&B run and download the dataset, study and model artifacts.
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
run = wandb.init(project="thesis", entity="fbv")

dataset = "fbv/thesis/ise_log_standardized:v1"
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

study = "fbv/thesis/xl3n4thc.optuna:v99"
artifact = run.use_artifact(study)
study_dir = artifact.download()

# Deliberately NOT named `model`: that name is bound to the CatBoostClassifier
# further down, and re-running this cell must not overwrite the loaded
# classifier with a plain string.
model_artifact = "xl3n4thc_CatBoostClassifier_default.cbm:v9"
# Strip any "entity/project/" prefix and the ":version" suffix to obtain the
# bare name that is later joined onto the download directory.
model_name = model_artifact.split("/")[-1].split(":")[0]

artifact = run.use_artifact(model_artifact)
model_dir = artifact.download()


[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact ise_log_standardized:v1, 3813.29MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.1
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [12]:
# Create an empty classifier and restore the downloaded model file into it.
model_path = Path(model_dir) / model_name
model = CatBoostClassifier()
model.load_model(fname=model_path)


<catboost.core.CatBoostClassifier at 0x7fa0ced2a370>

## Accuracy with retraining 🎯

In [None]:
# Build the retraining data: training and validation splits stacked in that
# order, with the label ("buy_sell") kept separate from the feature columns.

# Only the model inputs and the label are needed. Restricting the read to these
# columns avoids materialising every column of the multi-GB parquet files.
columns = [*features_classical_size, "buy_sell"]

train_frame = pd.read_parquet(
    Path(data_dir, "train_set_60.parquet"), engine="fastparquet", columns=columns
)
val_frame = pd.read_parquet(
    Path(data_dir, "val_set_20.parquet"), engine="fastparquet", columns=columns
)

# Explicit column selection pins the feature order to `features_classical_size`.
X_retrain = pd.concat(
    [train_frame[features_classical_size], val_frame[features_classical_size]]
)
y_retrain = pd.concat([train_frame["buy_sell"], val_frame["buy_sell"]])

# Free the per-split frames; only the concatenated data is used afterwards.
del train_frame, val_frame

assert len(X_retrain) == len(y_retrain), "features and labels are misaligned"

In [None]:
# Number of rows in the combined retraining data.
n_samples = len(y_retrain)

# Evenly spaced values in [0, 1], passed as timestamps to keep the ordering of
# the data.
timestamp = np.linspace(0, 1, n_samples)
# Geometrically spaced sample weights, from 0.001 for the first row up to 1 for
# the last row.
weight = np.geomspace(0.001, 1, num=n_samples)

# save to pool for faster memory access
retrain_pool = Pool(
    data=X_retrain,
    label=y_retrain,
    weight=weight,
    timestamp=timestamp,
    cat_features=None,
)

# Refit the loaded classifier on the weighted pool without training logs.
model.fit(retrain_pool, verbose=False)

In [None]:
# Load the test split, reading only the model inputs and the label instead of
# every column in the parquet file.
test_frame = pd.read_parquet(
    Path(data_dir, "test_set_20.parquet"),
    engine="fastparquet",
    columns=[*features_classical_size, "buy_sell"],
)
y_test = test_frame["buy_sell"]
# Explicit column selection pins the feature order to `features_classical_size`.
X_test = test_frame[features_classical_size]
del test_frame

In [None]:
# Score the classifier on the test features and labels, then print the result.
acc = model.score(X=X_test, y=y_test)
print(acc)

In [None]:
# Baseline for comparison: accuracy without retraining was 0.7232624886732101