In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import wandb
from catboost import CatBoostClassifier, Pool
from tqdm.auto import tqdm

sys.path.append("..")
from otc.features.build_features import (
    features_categorical,
    features_classical,
)

In [None]:
# Notebook-wide configuration. The constants are interpolated into the
# file/artefact `key` (all three) and the W&B dataset name (EXCHANGE, STRATEGY)
# in the following cell.
# The trailing comment on each line presumably shows the alternative value.
EXCHANGE = "ise"  # "cboe"
STRATEGY = "supervised"  # "transfer"
SUBSET = "test"  # "all"

In [None]:
# key used for files and artefacts
# NOTE: `key` is a plain notebook-level variable and is re-assigned by later
# cells, so its value depends on which cells have already been executed.
key = f"{EXCHANGE}_gbm_{STRATEGY}_{SUBSET}_viz"
# Name of the W&B dataset artefact (":latest" alias) that is passed to
# `run.use_artifact` below.
dataset = f"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest"

In [None]:
# Set the Google Cloud project name via the environment. Required to access
# files and artefacts (results are written to a `gs://` bucket further down).
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

In [None]:
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
# Start a W&B run in project "thesis" of entity "fbv". The same `run` object is
# used further down to log the result artefacts.
run = wandb.init(project="thesis", entity="fbv")

# Resolve the dataset artefact through the run and download its files;
# `data_dir` is the local directory the parquet splits are read from below.
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

In [None]:
def _read_split(file_name):
    """Read one parquet split from `data_dir` and split off the label.

    Returns `(frame, features, label)`: the full frame, the frame without the
    `buy_sell` column, and the `buy_sell` column itself.
    """
    frame = pd.read_parquet(Path(data_dir, file_name), engine="fastparquet")
    return frame, frame.drop(columns="buy_sell"), frame["buy_sell"]


def _make_pool(features, label, timestamp, **pool_kwargs):
    """Wrap the `features_classical` columns of `features` in a CatBoost `Pool`.

    Extra keyword arguments (e.g. `weight`) are forwarded to `Pool` unchanged.
    """
    return Pool(
        data=features.loc[:, features_classical],
        label=label,
        cat_features=cat_features_sub,
        timestamp=timestamp,
        **pool_kwargs,
    )


# Categorical features restricted to the classical feature set. Only the first
# element of each `features_categorical` entry is used (presumably the column
# name -- confirm against `otc.features.build_features`).
cat_features_sub = [
    spec[0] for spec in features_categorical if spec[0] in features_classical
]

# --- training split ---
train, X_train, y_train = _read_split("train_set.parquet")
# Evenly spaced pseudo-timestamps in [0, 1], one per row in stored order.
timestamp_train = np.linspace(0, 1, len(y_train))
# Geometrically increasing sample weights from 0.001 (first row) to 1 (last row).
weights_exp_train = np.geomspace(0.001, 1, num=len(y_train))

# Same data twice: once unweighted, once with the exponential weights.
train_pool_uni = _make_pool(X_train, y_train, timestamp_train)
train_pool_exp = _make_pool(
    X_train, y_train, timestamp_train, weight=weights_exp_train
)

# --- validation split (always unweighted) ---
val, X_val, y_val = _read_split("val_set.parquet")
timestamp_val = np.linspace(0, 1, len(y_val))

val_pool_uni = _make_pool(X_val, y_val, timestamp_val)

## Distribution Of Loss

In [None]:
# Keyword arguments shared by every classifier in this section.
kwargs_shared = {
    "logging_level": "Silent",
    "task_type": "GPU",
    "random_seed": 42,
    "eval_metric": "Accuracy",
}

# One configuration per boosting budget. Built directly instead of mutating
# dicts through a list comprehension that was only run for its side effect.
# Key order ("iterations" first) and precedence match the previous `update`.
settings = [
    {"iterations": n_iterations, **kwargs_shared}
    for n_iterations in (5, 100, 1000, 2000)
]

# Loop-invariant: map the labels to {0, 1} once.
# Assumes `buy_sell` is encoded as -1 / 1 -- TODO confirm.
y_val_mapped = (y_val + 1) // 2

results = []

for setting in tqdm(settings):
    clf = CatBoostClassifier(**setting)
    clf.fit(train_pool_uni, eval_set=val_pool_uni)

    proba_predictions = clf.predict_proba(val_pool_uni)
    # Clip away exact 0 / 1 before taking logs: a saturated probability would
    # otherwise give log(0) * 0 = NaN for that sample. Probabilities inside
    # [eps, 1 - eps] are left unchanged.
    eps = np.finfo(proba_predictions.dtype).eps
    positive_class_prob = np.clip(proba_predictions[:, 1], eps, 1 - eps)

    # Per-sample binary log loss on the validation set. Multiplying the array
    # by the Series keeps the index of `y_val`.
    result = -np.log(positive_class_prob) * y_val_mapped - np.log(
        1 - positive_class_prob
    ) * (1 - y_val_mapped)
    results.append(result)

In [None]:
# One column per boosting budget trained in the loop above, in the order the
# per-sample losses were appended to `results`.
iteration_keys = ["iter_5", "iter_100", "iter_1000", "iter_2000"]
assert len(results) == len(
    iteration_keys
), "expected exactly one loss series per iteration setting"

dfs = pd.concat(results, axis=1, keys=iteration_keys)
key = f"{EXCHANGE}_gbm_{STRATEGY}_{SUBSET}_viz_dist_loss"

output_path = f"gs://thesis-bucket-option-trade-classification/data/results/{key}-viz-dist-loss.parquet"
# Concatenating Series along axis=1 yields flat string column labels. Joining a
# plain string with "_" would split it into characters ("iter_5" ->
# "i_t_e_r___5"), so only tuple (MultiIndex) labels are flattened.
dfs.columns = [
    col if isinstance(col, str) else "_".join(col).rstrip("_")
    for col in dfs.columns.values
]
dfs.to_parquet(output_path)

# Log the artifact to save it as an output of this run
result_set = wandb.Artifact(name=key, type="results")
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

# The run is intentionally NOT finished here: the loss-curve section further
# down logs another artifact to the same `run` and closes it afterwards.

## CatBoost 🐈‍⬛

In [None]:
# Per-experiment overrides; each is merged with `kwargs_shared` below.
kwargs_depth = {"depth": 12}

kwargs_earl_stopping = {
    "early_stopping_rounds": 100,
}

kwargs_growth_strategy = {
    "grow_policy": "Lossguide",
}

kwargs_border_count = {
    "border_count": 254,
}

# Keyword arguments shared by every experiment in this section.
kwargs_shared = {
    "iterations": 2000,
    "logging_level": "Silent",
    "task_type": "GPU",
    "random_seed": 42,
    "eval_metric": "Accuracy",
}

# Single source of truth: (identifier, overrides, training pool).
# Keeping the three pieces in one row guarantees that `identifier`, `settings`
# and `pools` stay aligned when an experiment is added or removed.
experiments = [
    ("default", {}, train_pool_uni),
    ("depth", kwargs_depth, train_pool_uni),
    ("early_stopping", kwargs_earl_stopping, train_pool_uni),
    ("border_count", kwargs_border_count, train_pool_uni),
    ("grow_policy", kwargs_growth_strategy, train_pool_uni),
    # Default hyperparameters, but trained on the exponentially weighted pool.
    ("exp_weighting", {}, train_pool_exp),
]

identifier = [name for name, _, _ in experiments]
# complete config
# Build fresh dicts instead of updating the override dicts in place. Key order
# and precedence (shared values win) are the same as with `dict.update`.
settings = [{**overrides, **kwargs_shared} for _, overrides, _ in experiments]
# set pools
pools = [pool for _, _, pool in experiments]

In [None]:
# Sanity check: show the complete keyword arguments of every configuration
# before the (expensive) training loop runs.
print(settings)

In [None]:
# Train one classifier per configuration and keep its per-iteration metrics.
# `settings`, `pools` and `identifier` are parallel lists indexed by position.
results = []

for i in tqdm(range(len(settings))):
    clf = CatBoostClassifier(**settings[i])
    # Every model is evaluated on the same, unweighted validation pool.
    clf.fit(pools[i], eval_set=val_pool_uni)
    # Store as a single-entry dict: {identifier: evals_result}.
    results.append({identifier[i]: clf.get_evals_result()})

In [None]:
# Turn every evals result into a frame with one row per boosting iteration.
dfs = []

for result in results:
    # Each entry is a single-key dict {identifier: evals_result}.
    # NOTE: this rebinds the notebook-level `key`; after the loop it holds the
    # identifier of the last experiment.
    key = next(iter(result))
    evals = result[key]

    df = pd.DataFrame(
        {
            "learn_acc": evals["learn"]["Accuracy"],
            "learn_log": evals["learn"]["Logloss"],
            "val_acc": evals["validation"]["Accuracy"],
            "val_log": evals["validation"]["Logloss"],
        }
    )
    df.name = key
    dfs.append(df)

In [None]:
# Build the key from the notebook configuration. The `key` left over from
# earlier cells is just the last experiment identifier, so the output file and
# artifact name would not depend on EXCHANGE / STRATEGY / SUBSET and would
# overwrite each other across configurations.
key = f"{EXCHANGE}_gbm_{STRATEGY}_{SUBSET}_viz"

# Bind the combined frame to a new name so `dfs` stays a list of frames and
# the cell can be re-run.
losses = pd.concat(dfs, axis=1, keys=identifier)

output_path = f"gs://thesis-bucket-option-trade-classification/data/results/{key}-viz-losses.parquet"
# Flatten the (identifier, metric) MultiIndex to "<identifier>_<metric>".
losses.columns = ["_".join(col).rstrip("_") for col in losses.columns.values]
losses.to_parquet(output_path)

# A finished run cannot log artifacts. If an earlier cell already called
# `wandb.finish()`, open a new run with the same project / entity.
if wandb.run is None:
    run = wandb.init(project="thesis", entity="fbv")

# Log the artifact to save it as an output of this run
result_set = wandb.Artifact(name=key, type="results")
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

wandb.finish()