In [2]:
import json
import os
import pickle
from pathlib import Path

import gcsfs
import google.auth
import numpy as np
import optuna
import pandas as pd
import wandb
from catboost import CatBoostClassifier
from google.colab import auth, output
from sklearn.metrics import accuracy_score


In [3]:
# connect to google cloud storage
# Runs the Colab user authentication, then builds a gcsfs file system handle
# from the default credentials (the second value returned by
# `google.auth.default()` is discarded).
auth.authenticate_user()
credentials, _ = google.auth.default()
# NOTE(review): the project passed here ("thesis") differs from the
# GCLOUD_PROJECT value exported in a later cell -- confirm which is intended.
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)
# fs = gcsfs.GCSFileSystem(project="thesis")
# fs_prefix = "gs://"


In [4]:
# Feature groups, each a list of column names. Only `features_classical_size`
# is used in the visible cells (to select the model inputs from the test set).

# date-derived columns; sin / cos pairs per calendar component
# (presumably cyclical encodings -- TODO confirm against the preprocessing).
features_date = [
    "date_month_sin",
    "date_month_cos",
    "date_time_sin",
    "date_time_cos",
    "date_weekday_sin",
    "date_weekday_cos",
    "date_day_sin",
    "date_day_cos",
]

# option-related columns
features_option = [
    "STRK_PRC",
    "ttm",
    "bin_option_type",
    "bin_issue_type",
    "bin_root",
    "myn",
    "day_vol",
]

# https://github.com/KarelZe/thesis/blob/main/notebooks/
# 3.0a-mb-explanatory_data_analysis.ipynb
# (column name, integer) pairs -- presumably the number of distinct categories
# per column as determined in the notebook linked above; TODO confirm.
features_categorical = [
    ("bin_root", 8667),
    ("bin_option_type", 2),
    ("bin_issue_type", 6),
]

# price, quote and price-change columns
features_classical = [
    "TRADE_PRICE",
    "bid_ex",
    "ask_ex",
    "BEST_ASK",
    "BEST_BID",
    "price_ex_lag",
    "price_ex_lead",
    "price_all_lag",
    "price_all_lead",
    "chg_ex_lead",
    "chg_ex_lag",
    "chg_all_lead",
    "chg_all_lag",
    "prox_ex",
    "prox_best",
]

# trade-size and quoted-size columns
features_size = [
    "bid_ask_size_ratio_ex",
    "rel_bid_size_ex",
    "rel_ask_size_ex",
    "TRADE_SIZE",
    "bid_size_ex",
    "ask_size_ex",
    "depth_ex",
]

# classical + size columns; this is the column set the test set is reduced to
features_classical_size = [
    *features_classical,
    *features_size,
]

# every group above combined; not referenced in the visible cells
features_ml = [*features_classical_size, *features_date, *features_option]

# not referenced in the visible cells
# (presumably columns intentionally excluded from the models -- TODO confirm).
features_unused = [
    "price_rel_nbb",
    "price_rel_nbo",
    "date_year",
    "mid_ex",
    "mid_best",
    "spread_ex",
    "spread_best",
]


In [5]:
# Export the Google Cloud project id through the process environment.
# NOTE(review): presumably picked up by the Google client libraries used for
# the gs:// reads further down -- confirm.
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"


In [7]:
# Start a W&B run and fetch the three artifacts this notebook works with.
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
run = wandb.init(project="thesis", entity="fbv")

# artifact identifiers (name:version)
dataset = "fbv/thesis/ise_log_standardized:v1"
study = "fbv/thesis/xl3n4thc.optuna:v99"
model = "xl3n4thc_CatBoostClassifier_default.cbm:v9"

# file name of the model: text after the last "/" and before the first ":"
model_name = model.rpartition("/")[2].partition(":")[0]

# pre-processed data set
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

# pickled optuna study
artifact = run.use_artifact(study)
study_dir = artifact.download()

# trained model
artifact = run.use_artifact(model)
model_dir = artifact.download()


[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact ise_log_standardized:v1, 3813.29MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.1
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [8]:
# Read the test split from the downloaded data set artifact.
X_test = pd.read_parquet(Path(data_dir) / "test_set_20.parquet", engine="fastparquet")

# Take the target before the frame is narrowed to the model input columns.
# The label encoding is left as stored (the 0/1 conversion stays disabled).
y_test = X_test.loc[:, "buy_sell"]
# y_test[y_test<0] = 0
X_test = X_test.loc[:, features_classical_size]


## CatBoost Baseline 🐈‍⬛

## Visualize study

In [9]:
%%script false --no-raise-error
# Disabled cell: `%%script false` hands the body to `false`, so nothing below
# runs. Remove the magic line to execute it.
#
# Load the pickled optuna study from the directory the study artifact was
# downloaded to (instead of a hard-coded path) and close the file afterwards.
# NOTE: unpickling executes arbitrary code -- only load studies from trusted
# artifacts.
with open(Path(study_dir, "xl3n4thc.optuna"), "rb") as file:
    study = pickle.load(file)

# optuna.visualization.matplotlib.plot_optimization_history(study)
optuna.visualization.matplotlib.plot_param_importances(study)
optuna.visualization.matplotlib.plot_slice(study)
optuna.visualization.matplotlib.plot_contour(
    study, ["learning_rate", "depth", "bagging_temperature", "l2_leaf_reg"]
)

### Learning Curves Baseline 📉

In [10]:
%%script false --no-raise-error
# Disabled cell: `%%script false` hands the body to `false`, so nothing below
# runs. Remove the magic line to execute it.
#
# visualize learning curves
# The training log is stored next to the model as "<model stem>_training.json".
training_log = Path(model_dir, Path(model_name).stem + "_training.json")
with open(training_log, "r") as j:
    contents = json.load(j)

# per-iteration records; each carries the "learn" and "test" metric values
iterations = contents["iterations"]

# extract relevant keys
test_metrics = [d["name"] for d in contents["meta"]["test_metrics"]]
test_results = [d["test"] for d in iterations]
learn_metrics = [d["name"] for d in contents["meta"]["learn_metrics"]]
learn_results = [d["learn"] for d in iterations]

# one column per metric, prefixed by the split it was measured on
metrics_learn = pd.DataFrame(learn_results, columns=learn_metrics).add_prefix("learn_")
metrics_test = pd.DataFrame(test_results, columns=test_metrics).add_prefix("test_")

learning_metrics = pd.concat([metrics_learn, metrics_test], axis=1)

learning_metrics.head()

In [11]:
%%script false --no-raise-error
# Disabled cell (`%%script false`): would draw every column of
# `learning_metrics` as a line in a single 16x9 figure.
learning_metrics.plot(kind="line", figsize=(16,9))

### Accuracy Baseline 🎯

In [12]:
# Restore the trained classifier from the downloaded model artifact.
# Note: `model` held the artifact name string until here and is rebound to
# the classifier object; `model_name` keeps the file name.
model = CatBoostClassifier()
model.load_model(fname=Path(model_dir, model_name))


<catboost.core.CatBoostClassifier at 0x7fa0ced2a370>

In [13]:
# Score the restored model on the test set; this is the figure reported under
# the "Accuracy Baseline" heading.
acc = model.score(X_test, y_test)
print(acc)


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0.7232624886732101


### Robustness Baseline 🥊

In [14]:
# load default data to use unscaled version with all possible columns
X_print = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet",
    engine="fastparquet",
    columns=[
        "EXPIRATION",
        "QUOTE_DATETIME",
        "OPTION_TYPE",
        "TRADE_SIZE",
        "myn",
        "buy_sell",
        "issue_type",
    ],
)


In [15]:
# preview of the unscaled columns as loaded
X_print.head()


Unnamed: 0,EXPIRATION,QUOTE_DATETIME,OPTION_TYPE,TRADE_SIZE,myn,buy_sell,issue_type
39342171,2015-12-18,2015-11-06 09:30:00,P,2,0.921659,-1,%
39342172,2015-11-27,2015-11-06 09:30:00,C,1,1.001696,-1,0
39342173,2017-01-20,2015-11-06 09:30:00,C,1,0.886115,-1,0
39342174,2015-11-20,2015-11-06 09:30:00,C,1,1.009261,1,%
39342175,2015-12-18,2015-11-06 09:30:00,C,1,0.99188,-1,%


In [16]:
# Copy unscaled columns
X_print = X_print.copy()
# X_print["buy_sell"] = (X_print["buy_sell"] > 0).astype(int)

# add baseline results
X_print["rule"] = "Baseline"
X_print["buy_sell_predicted"] = model.predict(X_test)

# prepare columns for printing
X_print["ttm"] = (
    X_print["EXPIRATION"].dt.to_period("M")
    - X_print["QUOTE_DATETIME"].dt.to_period("M")
).apply(lambda x: x.n)
X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [-np.inf, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

bins_ttm = [-np.inf, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)


bins_myn = [-np.inf, 0.7, 0.9, 1.1, 1.3, np.inf]
myn_labels = [
    "mny <=0.7",
    "mny (0.7-0.9]",
    "mny (0.9-1.1]",
    "mny (1.1-1.3]",
    "mny > 1.3",
]
X_print["myn_binned"] = pd.cut(X_print["myn"], bins_myn, labels=myn_labels)

X_print["issue_type_binned"] = X_print["issue_type"].replace(
    {
        "0": "Stock options",
        "A": "Index options",
        "7": "Others",
        "F": "Others",
        "%": "Others",
        " ": "Others",
    }
)


# TODO: time from previous trade; same underlying or any?


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [17]:
# preview including the prediction and the binned helper columns
X_print.head()


Unnamed: 0,EXPIRATION,QUOTE_DATETIME,OPTION_TYPE,TRADE_SIZE,myn,buy_sell,issue_type,rule,buy_sell_predicted,ttm,year,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,issue_type_binned
39342171,2015-12-18,2015-11-06 09:30:00,P,2,0.921659,-1,%,Baseline,-1,1,2015,"(1,3]",2015,ttm <= 1 month,mny (0.9-1.1],Others
39342172,2015-11-27,2015-11-06 09:30:00,C,1,1.001696,-1,0,Baseline,-1,0,2015,"(0,1]",2015,ttm <= 1 month,mny (0.9-1.1],Stock options
39342173,2017-01-20,2015-11-06 09:30:00,C,1,0.886115,-1,0,Baseline,-1,14,2015,"(0,1]",2015,ttm > 12 month,mny (0.7-0.9],Stock options
39342174,2015-11-20,2015-11-06 09:30:00,C,1,1.009261,1,%,Baseline,1,0,2015,"(0,1]",2015,ttm <= 1 month,mny (0.9-1.1],Others
39342175,2015-12-18,2015-11-06 09:30:00,C,1,0.99188,-1,%,Baseline,-1,1,2015,"(0,1]",2015,ttm <= 1 month,mny (0.9-1.1],Others


In [18]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Check robustness of rules by calculating the accuracy for a given
    criterion and rules.

    Works on the notebook-level frame `X_print`. Trades without a prediction
    (NaN in `buy_sell_predicted`) receive a random class, drawn with equal
    probability from the label encoding used by `buy_sell` (-1 / 1). The fill
    is written back to `X_print`, so later calls score the same predictions.
    The draw uses NumPy's global random state; seed it for reproducible fills.

    Example:
        rule                Baseline
        TRADE_SIZE_binned
        (0,1]               0.710966
        (1,3]               0.717664
        (3,5]               0.715195
        (5,11]              0.699428
        >11                 0.688348

    Args:
        criterion (str, optional): criterion to check robustness for.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: DataFrame with accuracy of rules. Rule in columns and
        criterion values in rows, plus an `avg` column with the row mean.
    """

    # fill others randomly with equal weight for every class.
    # Classes are -1 / 1 like the labels; a 0 could never match a label.
    # Only missing rows are touched, so repeated calls leave the frame as is.
    missing = X_print["buy_sell_predicted"].isna()
    if missing.any():
        X_print.loc[missing, "buy_sell_predicted"] = np.random.choice(
            [-1, 1], size=int(missing.sum())
        )

    # calculate average over columns if multiple subsets are combined
    results = (
        X_print.groupby(["rule", criterion])[["buy_sell", "buy_sell_predicted"]]
        .apply(lambda x: accuracy_score(x["buy_sell"], x["buy_sell_predicted"]))
        .unstack(level=0)
        .assign(avg=lambda x: x.mean(axis=1))
    )
    return results


In [19]:
# same preview as in the previous display cell; only a function was defined in
# between, so the frame is unchanged
X_print.head()


Unnamed: 0,EXPIRATION,QUOTE_DATETIME,OPTION_TYPE,TRADE_SIZE,myn,buy_sell,issue_type,rule,buy_sell_predicted,ttm,year,TRADE_SIZE_binned,year_binned,ttm_binned,myn_binned,issue_type_binned
39342171,2015-12-18,2015-11-06 09:30:00,P,2,0.921659,-1,%,Baseline,-1,1,2015,"(1,3]",2015,ttm <= 1 month,mny (0.9-1.1],Others
39342172,2015-11-27,2015-11-06 09:30:00,C,1,1.001696,-1,0,Baseline,-1,0,2015,"(0,1]",2015,ttm <= 1 month,mny (0.9-1.1],Stock options
39342173,2017-01-20,2015-11-06 09:30:00,C,1,0.886115,-1,0,Baseline,-1,14,2015,"(0,1]",2015,ttm > 12 month,mny (0.7-0.9],Stock options
39342174,2015-11-20,2015-11-06 09:30:00,C,1,1.009261,1,%,Baseline,1,0,2015,"(0,1]",2015,ttm <= 1 month,mny (0.9-1.1],Others
39342175,2015-12-18,2015-11-06 09:30:00,C,1,0.99188,-1,%,Baseline,-1,1,2015,"(0,1]",2015,ttm <= 1 month,mny (0.9-1.1],Others


In [20]:
# accuracy per bin of the quote year
check_robustness("year_binned")


rule,Baseline,avg
year_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,0.690118,0.690118
2016,0.725174,0.725174
2017,0.729987,0.729987


In [21]:
# accuracy per OPTION_TYPE value
check_robustness("OPTION_TYPE")


rule,Baseline,avg
OPTION_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.718712,0.718712
P,0.72846,0.72846


In [22]:
# accuracy per issue type group (stock options / index options / others)
check_robustness("issue_type_binned")


rule,Baseline,avg
issue_type_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
Index options,0.574215,0.574215
Others,0.763456,0.763456
Stock options,0.709328,0.709328


In [23]:
# accuracy per trade size bin
check_robustness("TRADE_SIZE_binned")


rule,Baseline,avg
TRADE_SIZE_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0,1]",0.725341,0.725341
"(1,3]",0.729509,0.729509
"(3,5]",0.727483,0.727483
"(5,11]",0.717682,0.717682
>11,0.712972,0.712972


In [24]:
# accuracy per time-to-maturity bin (months between quote and expiration)
check_robustness("ttm_binned")


rule,Baseline,avg
ttm_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
ttm <= 1 month,0.727325,0.727325
ttm (1-2] month,0.728159,0.728159
ttm (2-3] month,0.718036,0.718036
ttm (3-6] month,0.712758,0.712758
ttm (6-12] month,0.712268,0.712268
ttm > 12 month,0.685955,0.685955


In [25]:
# accuracy per bin of the `myn` column
check_robustness("myn_binned")


rule,Baseline,avg
myn_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
mny <=0.7,0.718651,0.718651
mny (0.7-0.9],0.742584,0.742584
mny (0.9-1.1],0.729493,0.729493
mny (1.1-1.3],0.662484,0.662484
mny > 1.3,0.630423,0.630423
