<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/4.0b-mb-ml_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install catboost==1.1
!pip install gcsfs==2022.10.0
!pip install ipywidgets==8.0.2
!pip install numpy==1.23.4
!pip install pandas==1.5.1
!pip install scikit-learn==1.1.3
!pip install fastparquet
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement numpy==1.23.4 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0rc1, 1.13.0rc2, 1.13.0, 1.13.1, 1.13.3, 1.14.0rc1, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0rc1, 1.15.0rc2, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0rc1, 1.16.0rc2, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0rc1, 1.17.0rc2, 1.17.0, 1.17.1, 1.17.2, 1.

Run `pip install -r requirements.txt` first to install all dependencies.

In [4]:
from catboost import CatBoostClassifier

import gcsfs
import google.auth
from google.colab import auth, output

import numpy as np
import pandas as pd

import os

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm
import wandb

from typing import List, Optional

In [5]:
# Connect to Google Cloud Storage: authenticate the Colab user first, then
# pass the resulting default credentials on to gcsfs.
auth.authenticate_user()
credentials, _ = google.auth.default()

fs_prefix = "gs://"
fs = gcsfs.GCSFileSystem(token=credentials, project="thesis")

In [6]:
# Preprocessed test set. Used further down as the source of the `buy_sell`
# labels and of the raw columns that are binned for the robustness tables.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
test_orig = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet",
    engine="fastparquet",
)

In [7]:
# Column names of the feature set, grouped by origin.

# Trade- and quote-related columns.
features_trade = [
    "TRADE_SIZE", "TRADE_PRICE",
    "BEST_ASK", "BEST_BID",
    "price_ex_lag", "price_ex_lead",
    "price_all_lag", "price_all_lead",
    "bid_ex", "ask_ex",
    "bid_size_ex", "ask_size_ex",
    "midpoint_ex", "dis_mid_ex",
    "rel_bid_size_ex", "rel_ask_size_ex",
    "diff_ask_bid_size_ex",
]

# Date- and time-related columns.
features_date = [
    "date_month_sin", "date_month_cos",
    "date_time_sin", "date_time_cos",
    "date_year",
]

# Option-contract columns.
features_option = ["STRK_PRC", "ROOT", "time_to_maturity", "OPTION_TYPE"]

# Subset of the option columns that is categorical.
features_categorical = ["ROOT", "OPTION_TYPE"]

# Full list of columns read for the model; the order is trade, date, option.
features_ml = features_trade + features_date + features_option

# Test-set features for the model; only the columns listed in `features_ml`
# are read from the parquet file.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
X_test = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/fe_w_trade_size_data/test_set_20.parquet",
    engine="fastparquet",
    columns=features_ml,
)

## CatBoost Baseline 🐈‍⬛

In [8]:
# Create the local target directory for the downloaded model. `exist_ok=True`
# keeps the cell re-runnable: `os.mkdir` raises FileExistsError on a second run.
os.makedirs("models/", exist_ok=True)

In [9]:
# Download the serialized CatBoost model from the bucket into ./models/.
# Corresponding run: https://wandb.ai/fbv/thesis/runs/3dpde4cy
fs.get(
    "gs://thesis-bucket-option-trade-classification/models/3dpde4cy_CatBoostClassifier_default_trial_2.cbm",
    "./models/",
)

[None]

In [10]:
# Restore the downloaded model into a fresh classifier instance.
model = CatBoostClassifier()
model.load_model("./models/3dpde4cy_CatBoostClassifier_default_trial_2.cbm")

<catboost.core.CatBoostClassifier at 0x7f8f3152e750>

In [12]:
# Score the loaded model on the test features against the `buy_sell` labels.
acc = model.score(X_test, y=test_orig["buy_sell"])
print(acc)

0.6984044943729075


### Robustness Baseline 🥊

In [13]:
# Bin edges and display labels for the robustness break-downs. `pd.cut` is
# called with its default right-closed intervals, e.g. (2004, 2007] is
# labelled "2005-2007".
bins_tradesize = [-np.inf, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]

bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007", "2008-2010",
    "2011", "2012", "2013", "2014", "2015", "2016", "2017",
]

bins_ttm = [-np.inf, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]

# Work on a copy of the unscaled test set (`assign` returns a new frame) and
# append the columns used for printing. Keyword arguments are evaluated in
# order, so later columns may refer to earlier ones.
X_print = test_orig.assign(
    # baseline results
    # NOTE(review): assumes the rows of X_test are aligned with test_orig -- confirm
    rule="Baseline",
    buy_sell_predicted=model.predict(X_test),
    # difference between expiration month and quote month, in whole months
    ttm=lambda frame: (
        frame["EXPIRATION"].dt.to_period("M")
        - frame["QUOTE_DATETIME"].dt.to_period("M")
    ).apply(lambda offset: offset.n),
    year=lambda frame: frame["QUOTE_DATETIME"].dt.year,
    # binned versions used as grouping criteria
    TRADE_SIZE_binned=lambda frame: pd.cut(
        frame["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
    ),
    year_binned=lambda frame: pd.cut(frame["year"], bins_years, labels=year_labels),
    ttm_binned=lambda frame: pd.cut(frame["ttm"], bins_ttm, labels=ttm_labels),
)

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?


In [14]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Check robustness of rules by calculating the accuracy for a given
    criterion and rules.

    The function reads the notebook-level frame ``X_print``, which must
    provide the columns ``rule``, ``buy_sell``, ``buy_sell_predicted`` and the
    column named by ``criterion``.

    Side effect: missing values in ``X_print["buy_sell_predicted"]`` are
    replaced *in place* by a random draw from {-1, 1}. The draw uses NumPy's
    global random state, so call ``np.random.seed`` beforehand if reproducible
    numbers are required. Once filled, the values stay filled, so later calls
    evaluate the same predictions.

    Example:
        rule               Baseline
        TRADE_SIZE_binned
        (0,1]              0.710966
        (1,3]              0.717664
        (3,5]              0.715195
        (5,11]             0.699428
        >11                0.688348

    Args:
        criterion (str, optional): criterion to check robustness for.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: DataFrame with accuracy of rules. Rule in columns and
        criterion values in rows. The additional column ``avg`` holds the
        row-wise mean over all rule columns.
    """

    # fill others randomly with equal weight for every class. Only the missing
    # entries are touched; `isna` (unlike `np.isnan`) also copes with
    # non-numeric dtypes, and the vectorized draw avoids a Python-level call
    # per row.
    is_missing = X_print["buy_sell_predicted"].isna()
    if is_missing.any():
        X_print.loc[is_missing, "buy_sell_predicted"] = np.random.choice(
            [-1, 1], size=int(is_missing.sum())
        )

    # calculate average over columns if multiple subsets are combined
    results = (
        X_print.groupby(["rule", criterion])[["buy_sell", "buy_sell_predicted"]]
        .apply(lambda x: accuracy_score(x["buy_sell"], x["buy_sell_predicted"]))
        .unstack(level=0)
        .assign(avg=lambda x: x.mean(axis=1))
    )
    return results


In [15]:
# Accuracy broken down by binned trade year.
check_robustness(criterion="year_binned")


rule,Baseline,avg
year_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,0.679894,0.679894
2016,0.703426,0.703426
2017,0.692981,0.692981


In [16]:
# Accuracy broken down by the values of the OPTION_TYPE column.
check_robustness(criterion="OPTION_TYPE")


rule,Baseline,avg
OPTION_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.695186,0.695186
P,0.702081,0.702081


In [17]:
# Accuracy broken down by binned trade size.
check_robustness(criterion="TRADE_SIZE_binned")


rule,Baseline,avg
TRADE_SIZE_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0,1]",0.703917,0.703917
"(1,3]",0.70819,0.70819
"(3,5]",0.70598,0.70598
"(5,11]",0.687622,0.687622
>11,0.679055,0.679055


In [18]:
# Accuracy broken down by binned time to maturity (in months).
check_robustness(criterion="ttm_binned")


rule,Baseline,avg
ttm_binned,Unnamed: 1_level_1,Unnamed: 2_level_1
ttm <= 1 month,0.704267,0.704267
ttm (1-2] month,0.693796,0.693796
ttm (2-3] month,0.683712,0.683712
ttm (3-6] month,0.681653,0.681653
ttm (6-12] month,0.685003,0.685003
ttm > 12 month,0.682834,0.682834
