<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/3.0-mb-feature_engineering_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost==1.1
!pip install gcsfs==2022.10.0
!pip install ipywidgets==8.0.2
!pip install numpy==1.23.4
!pip install pandas==1.5.1
!pip install optuna==3.0.3
!pip install scikit-learn==1.1.3
!pip install seaborn==0.12.1
!pip install shap==0.41.0
!pip install wandb==0.13.4

In [None]:
import os
import random

from catboost import CatBoostClassifier, Pool

import gcsfs
import google.auth
from google.colab import auth, output

import numpy as np
import pandas as pd
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import seaborn as sns
import shap

from tqdm.notebook import tqdm
import wandb


In [None]:
# connect to weights and biases
run = wandb.init(project="thesis", job_type="baseline", entity="fbv")

In [None]:
# init shap
shap.initjs()

In [None]:
# connect to google cloud storage
# authenticate the colab user, then reuse the resulting default credentials
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)
# NOTE(review): `fs` and `fs_prefix` are not referenced in the visible part of
# this notebook; the parquet files are read via plain "gs://" urls.
fs_prefix = "gs://"


In [None]:
output.enable_custom_widget_manager()


In [None]:
# set fixed seed
def seed_everything(seed):
    """
    Seeds basic parameters for reproducibility of results.

    Sets ``PYTHONHASHSEED`` and seeds both Python's ``random`` module and
    NumPy's global random generator. The latter is required because the
    notebook draws from ``np.random.choice`` when filling unclassified trades.

    Args:
        seed (int): seed value. Must be accepted by ``np.random.seed``.
    """
    # exported for child processes; hash randomisation of the running
    # interpreter is fixed at start-up and is not changed by this assignment
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


seed = 42
seed_everything(seed)


In [None]:
# load the pre-split train (60 %), validation (20 %) and test (20 %) sets.
# Plain string literals are used, as the paths contain no placeholders.
train = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet"
)
val = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet"
)
test = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet"
)


In [None]:
train.head(10)


In [None]:
# optional: randomly sample a fraction of rows to speed up experiments
# (disabled; uncomment all four lines together to enable)
# frac = 0.02

# train = train.sample(frac=frac, random_state=seed)
# val = val.sample(frac=frac, random_state=seed)
# test = test.sample(frac=frac, random_state=seed)

# unify for common preprocessing; rows are stacked in train -> val -> test
# order and keep the index labels of their source frames (no ignore_index)
X = pd.concat([train, val, test])


In [None]:
# calculate days to maturity; `.dt.days` keeps only the whole-day component
# of the difference between expiration and quote timestamp
X["time_to_maturity"] = (X["EXPIRATION"] - X["QUOTE_DATETIME"]).dt.days

In [None]:
# apply cyclical (sin / cos) encoding to the month of the quote date, so that
# december and january end up next to each other
month = X["QUOTE_DATETIME"].dt.month
X["date_month_sin"] = np.sin(2 * np.pi * month / 12)
X["date_month_cos"] = np.cos(2 * np.pi * month / 12)

# apply cyclical encoding to the time of day
seconds_in_day = 24 * 60 * 60

# seconds elapsed since midnight; `total_seconds()` already yields a float
# series, hence no further `.dt` accessor is needed (or available)
seconds = (X["QUOTE_DATETIME"] - X["QUOTE_DATETIME"].dt.normalize()).dt.total_seconds()
X["date_time_sin"] = np.sin(2 * np.pi * seconds / seconds_in_day)
X["date_time_cos"] = np.cos(2 * np.pi * seconds / seconds_in_day)

# add year, min-max scaled with 2005 -> 0 and 2017 -> 1
X["date_year"] = (X["QUOTE_DATETIME"].dt.year - 2005) / (2017 - 2005)

# date features are already on a bounded scale and are excluded from
# standardization later on
date_columns = ["date_month_sin", "date_month_cos", "date_time_sin", "date_time_cos", "date_year"]


In [None]:
X.dtypes


In [None]:
# remove problematic features -> see notebook on adversarial validation
# (identifiers, raw timestamps and ticker symbols)
X = X.drop(
    ["SEQUENCE_NUMBER", "order_id", "optionid", "EXPIRATION", "QUOTE_DATETIME", "ROOT", "UNDERLYING_SYMBOL"],
    axis=1,
)


In [None]:
# split the label column off the feature matrix; `y` stays a one-column frame
target_columns = ["buy_sell"]
y = X[target_columns]
X = X.drop(columns=target_columns)


In [None]:
# pairwise correlation of the numeric features; non-numeric columns are
# excluded explicitly instead of relying on the deprecated implicit dropping
corr = X.corr(numeric_only=True)
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)


In [None]:
# manual deletion of columns that are highly correlated with other columns
# (selection is based on the correlation heatmap of the previous cell)
X = X.drop(
    ["day_vol", "BEST_ASK", "BEST_BID", "price_all_lead", "price_all_lag"],
    axis=1,
)

In [None]:
# Midspread: arithmetic mean of the ask and bid quote columns `ask_ex` / `bid_ex`
mid = 0.5 * (X["ask_ex"] + X["bid_ex"])
# the midspread itself is not added as a feature
# X["mid_ex"] = mid

In [None]:
# Signed distance of the trade price from the midspread: positive if the trade
# is priced above mid, negative if below. Despite the column name, no absolute
# value is taken.
X["abs_mid_ex"] = X["TRADE_PRICE"] - mid

In [None]:
# Trade size relative to the quoted bid / ask size.
# A quoted size of zero yields +/- inf (or nan for 0 / 0); infinite values are
# replaced by nan in a later cell.
X["rel_bid_size_ex"] = X["TRADE_SIZE"] / X["bid_size_ex"]
X["rel_ask_size_ex"] = X["TRADE_SIZE"] / X["ask_size_ex"]


In [None]:
# Price change of the trade relative to the `price_ex_lead` column
# NOTE(review): labelled "similar to tick rule" originally, but the tick rule
# cell further below compares against a *_lag column -- confirm which of
# lead / lag holds the previous trade price.
X["chg_ex_lead"] = X["TRADE_PRICE"] - X["price_ex_lead"]

# Price change of the trade relative to the `price_ex_lag` column
# (originally labelled "similar to reverse tick rule", see note above)
X["chg_ex_lag"] = X["TRADE_PRICE"] - X["price_ex_lag"]

In [None]:
# select categorical e. g., option type and strings e. g., ticker
cat_columns = X.select_dtypes(include=["category", "object"]).columns.tolist()
print(cat_columns)

# integer-encode categoricals similar to Borisov et al.; every distinct value
# of a column is replaced by the integer code assigned by `pd.factorize`
X[cat_columns] = X[cat_columns].apply(lambda x: pd.factorize(x)[0])


In [None]:
# treat inf as nan, e. g., infinite values resulting from divisions by zero
X.replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:
# Do not fill, let CatBoost Handle NaNs
# X = X.fillna(0)

In [None]:
# separate again for training scaling
# `X` / `y` were built by stacking train, val and test in this order, hence the
# sets are recovered by position. A label-based lookup would return rows of
# several sets, if the three frames share index labels.
# Explicit copies make sure, that scaling does not write into `X`.
n_train, n_val = len(train), len(val)

X_train = X.iloc[:n_train].copy()
X_val = X.iloc[n_train : n_train + n_val].copy()
X_test = X.iloc[n_train + n_val :].copy()

y_train = y.iloc[:n_train].copy()
y_val = y.iloc[n_train : n_train + n_val].copy()
y_test = y.iloc[n_train + n_val :].copy()


In [None]:
# Standardize all numeric columns, that are neither integer-encoded
# categoricals nor date encodings
not_scaled = set(cat_columns) | set(date_columns)
num_columns = [
    column
    for column in X_train.select_dtypes(include=[np.number]).columns
    if column not in not_scaled
]

# fit mean / variance on the training set only and re-use them for the
# validation and test set
scaler = StandardScaler()
scaler.fit(X_train[num_columns])

X_train[num_columns] = scaler.transform(X_train[num_columns])
X_val[num_columns] = scaler.transform(X_val[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])


## CatBoost Baseline 🐈‍⬛

### Hyperparameter Search Baseline🗃️

In [None]:
def objective(trial: optuna.Trial):
    """
    Optuna objective: fit a CatBoost classifier and score it on the validation set.

    Hyperparameters are suggested by the trial, the model is fitted on
    `X_train` / `y_train` and evaluated on `X_val` / `y_val`.
    See https://catboost.ai/en/docs/concepts/parameter-tuning for
    recommendations on tuning hyperparameters.

    Args:
        trial (optuna.Trial): current trial.

    Returns:
        float: accuracy on the validation set.
    """
    # the order of the suggestions is kept fixed
    tuned_params = {
        "iterations": trial.suggest_int("iterations", 10, 1000, log=False),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 1, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10, log=True),
        "depth": trial.suggest_int("depth", 1, 8, log=False),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["SymmetricTree", "Depthwise"]
        ),
    }
    # settings shared by all trials
    fixed_params = {
        "od_type": "Iter",
        "logging_level": "Silent",
        "task_type": "GPU",
        "cat_features": cat_columns,
    }

    classifier = CatBoostClassifier(**tuned_params, **fixed_params)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_val, prediction_type="Class")
    return accuracy_score(y_val, predictions)


In [None]:
# callback, that logs every optuna trial to the "thesis" project on weights and biases
wandb_kwargs = {"project": "thesis"}
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)


In [None]:
# Implement hyperparameter search
# maximize the value returned by `objective` over 200 trials, using a TPE
# sampler seeded with the notebook-wide seed
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=seed),
    study_name="baseline_gbm",
)
study.optimize(objective, n_trials=200, callbacks=[wandbc])


In [None]:
# visualize the search: objective value per trial, importance of the
# hyperparameters and pairwise contours of the selected hyperparameters
ax_history = optuna.visualization.matplotlib.plot_optimization_history(study)
ax_param_importance = optuna.visualization.matplotlib.plot_param_importances(study)
fig_contour = optuna.visualization.matplotlib.plot_contour(
    study, ["iterations", "depth", "grow_policy", "learning_rate"]
)


In [None]:
# summarize the search: number of trials, best trial, its objective value and
# its hyperparameters
print(f"Number of finished trials: {len(study.trials)}")

# NOTE: `trial` is re-used by the following cell to build the final model
trial = study.best_trial

print(f"Best trial: {trial}")

print(f"Value: {trial.value}")

print("Params:")
for key, value in trial.params.items():
    print(f"{key}: {value}")


In [None]:
# use CPU to plot learning curves
# see https://catboost.ai/en/docs/concepts/python-reference_catboost_fit
static_params = {"od_type": "Iter", "logging_level": "Silent", "task_type": "CPU","cat_features":cat_columns}

# combine with the hyperparameters of the best trial; on duplicate keys the
# values of the trial take precedence
params = {**static_params, **trial.params}
print(params)


In [None]:
# refit on the training set with the best hyperparameters found;
# `plot=True` requests catboost's training plot
model = CatBoostClassifier(**params)
model.fit(X_train, y_train, plot=True)


In [None]:
# use shap instead of feature importance to maintain consistency throughout the work
# Explain the scaled test set. The model was fitted on standardized features,
# hence the unscaled, unified frame `X` must not be passed to the explainer.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(Pool(X_test, y_test, cat_features=cat_columns))
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# score the final model on all three sets (reported as accuracy below)
acc_train = model.score(X_train, y_train)
acc_val = model.score(X_val, y_val)
acc_test = model.score(X_test, y_test)

print(f"Accuracy (train): {acc_train}, (val) {acc_val}, and (test) {acc_test}")


### Robustness Baseline🥊

In [None]:
# Copy unscaled columns
X_print = test.copy()

# add baseline results
X_print["rule"] = "Baseline"
# NOTE(review): placeholder. The constant 0 is assigned instead of the model's
# predictions; confirm whether `model.predict(X_test)` should be used here.
X_print["buy_sell_predicted"] = 0 # model.predict(X_test)

# prepare columns for printing
# time to maturity in (calendar) months
X_print["ttm"] = (
    X_print["EXPIRATION"].dt.to_period("M")
    - X_print["QUOTE_DATETIME"].dt.to_period("M")
).apply(lambda x: x.n)
X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

bins_tradesize = [0, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

bins_years = [2005, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
# bins are right-closed; `include_lowest` makes sure, that the year 2005 is
# part of the "2005-2007" bin instead of being left unbinned
X_print["year_binned"] = pd.cut(
    X_print["year"], bins_years, labels=year_labels, include_lowest=True
)

bins_ttm = [0, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
# `include_lowest` assigns options expiring in the month of the trade
# (ttm = 0) to the "ttm <= 1 month" bin
X_print["ttm_binned"] = pd.cut(
    X_print["ttm"], bins_ttm, labels=ttm_labels, include_lowest=True
)

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?

# TODO: Security type
# TODO: Moneyness
# TODO: time from previous trade; same underlying or any?


In [None]:
def check_robustness(criterion: str = "year_binned") -> pd.DataFrame:
    """
    Calculate the accuracy of the rule(s) stored in `X_print` per value of a criterion.

    Rows of `X_print` are grouped by rule and criterion. Within each group
    the accuracy of `buy_sell_predicted` with regard to `buy_sell` is
    calculated.

    Example:
        check_robustness("TRADE_SIZE_binned") returns one row per trade size
        bin e. g., "(0,1]" or ">11", and one column per rule e. g., "Baseline".

    Args:
        criterion (str, optional): column of `X_print` to group by.
        Defaults to "year_binned".

    Returns:
        pd.DataFrame: DataFrame with accuracies. Rules in columns and
        criterion values in rows.
    """

    def _group_accuracy(group: pd.DataFrame) -> float:
        # accuracy of the predictions within a single (rule, criterion) group
        return accuracy_score(group["buy_sell"], group["buy_sell_predicted"])

    grouped = X_print.groupby(["rule", criterion])[["buy_sell", "buy_sell_predicted"]]
    accuracies = grouped.apply(_group_accuracy)
    # move rule (first index level) from rows to columns
    return accuracies.unstack(level=0)


In [None]:
check_robustness("year_binned")


In [None]:
check_robustness("OPTION_TYPE")


In [None]:
check_robustness("TRADE_SIZE_binned")


In [None]:
check_robustness("ttm_binned")


## Classical rules

In [None]:
# tick rule
# FIXME: Discuss with Grauer et al what is used in table 9 ISE. Probably all lag?
# buy (1.0), if the trade price is above `price_all_lag`; sell (-1.0) otherwise.
# Trades at an unchanged price (and rows with a missing lag price) are thereby
# classified as sell. `tt` is re-used by the trade size + tick rule cell.
tt = np.where(X_print["TRADE_PRICE"] > X_print["price_all_lag"], 1.0, -1.0)
X_print["buy_sell_predicted"] = tt
X_print["rule"] = "tick rule"

In [None]:
check_robustness("year_binned")

In [None]:
# reverse tick rule
# compares the trade price with the subsequent price (`price_all_lead`) instead
# of the preceding one: buy (1.0), if the trade price is above the lead price;
# sell (-1.0) otherwise
X_print["buy_sell_predicted"] = np.where(X_print["TRADE_PRICE"] > X_print["price_all_lead"], 1.0, -1.0)
X_print["rule"] = "reverse tick rule"

In [None]:
check_robustness("year_binned")

In [None]:
# quote rule
# buy (1), if the trade price is above the midspread; sell (-1), if below;
# unclassified (nan) otherwise.
# NOTE: rebinds `mid`, which held the midspread of the unified frame `X` before.
mid = 0.5 * (X_print["ask_ex"] + X_print["bid_ex"])
qr = np.where(X_print["TRADE_PRICE"] > mid, 1, np.where(X_print["TRADE_PRICE"] < mid, -1, np.nan))
X_print["buy_sell_predicted"] = qr

# fill others random
# one draw from numpy's global random generator per unclassified trade
X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(
    lambda l: l if not np.isnan(l) else np.random.choice([-1, 1])
)

X_print["rule"] = "quote rule"

In [None]:
check_robustness("year_binned")

In [None]:
# trade size tick rule
# buy (1.0), if the trade size equals the quoted bid size; sell (-1.0), if it
# equals the quoted ask size; result of the tick rule (`tt`) otherwise.
# If the trade size matches both sizes, the bid condition takes precedence.
ts_eq_bid = X_print["TRADE_SIZE"] == X_print["bid_size_ex"]
ts_eq_ask = X_print["TRADE_SIZE"] == X_print["ask_size_ex"]

X_print["buy_sell_predicted"] = np.where(
    ts_eq_bid, 1.0, np.where(ts_eq_ask, -1.0, tt)
)
X_print["rule"] = "trade size + tick rule"

In [None]:
check_robustness("year_binned")

In [None]:
# trade size + quote rule
# trade size rule first; result of the quote rule (`qr`, may be nan) otherwise
X_print["buy_sell_predicted"] = np.where(
    ts_eq_bid, 1.0, np.where(ts_eq_ask, -1.0, qr)
)

# fill others random
# one draw from numpy's global random generator per unclassified trade
X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(
    lambda l: l if not np.isnan(l) else np.random.choice([-1, 1])
)

X_print["rule"] = "trade size + quote rule"

In [None]:
check_robustness("year_binned")

In [None]:
# depth rule p. 14
# only trades priced exactly at `mid` are classified: buy (1), if the quoted
# ask size exceeds the bid size; sell (-1), if the bid size exceeds the ask
# size. All other trades remain unclassified (nan).
# NOTE(review): relies on `mid` from the quote rule cell (midspread of
# `X_print`) -- requires the cells to be run in order.
dr = np.where(
    (X_print["TRADE_PRICE"] == mid) & (X_print["ask_size_ex"] > X_print["bid_size_ex"]),
    1,
    np.where(
        (X_print["TRADE_PRICE"] == mid) & (X_print["ask_size_ex"] < X_print["bid_size_ex"]), -1, np.nan
    ),
)
X_print["buy_sell_predicted"] = dr

X_print["rule"] = "depth rule"

In [None]:
# fill unclassified trades (nan) randomly with -1 or 1; one draw from numpy's
# global random generator per unclassified trade
X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(
    lambda l: l if not np.isnan(l) else np.random.choice([-1, 1])
)

In [None]:
check_robustness("year_binned")

In [None]:
# trade size + depth rule
# trade size rule first; result of the depth rule (`dr`, may be nan) otherwise
X_print["buy_sell_predicted"] = np.where(
    ts_eq_bid, 1.0, np.where(ts_eq_ask, -1.0, dr)
)

X_print["rule"] = "trade size + depth rule"

In [None]:
# fill unclassified trades (nan) randomly with -1 or 1; one draw from numpy's
# global random generator per unclassified trade
X_print["buy_sell_predicted"] = X_print["buy_sell_predicted"].map(
    lambda l: l if not np.isnan(l) else np.random.choice([-1, 1])
)

In [None]:
check_robustness("year_binned")

## Classical Rules Sklearn Implementation 📦

In [None]:
import warnings
import numpy as np
import scipy.sparse as sp

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import MultiOutputMixin
from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples
from sklearn.utils.validation import check_consistent_length
from sklearn.utils.validation import check_is_fitted, _check_sample_weight
from sklearn.utils.random import _random_choice_csc
from sklearn.utils.multiclass import class_distribution


class TRClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
    """Scaffold of a scikit-learn compatible classifier for classical rules.

    `fit` accepts the strategies "standard" and "tradesize" and records the
    class distribution of the targets.

    NOTE(review): work in progress. `predict` only contains branches for the
    strategies "most_frequent", "prior", "stratified", "uniform" and
    "constant", none of which passes the validation in `fit`. For dense
    targets `predict` therefore never assigns its return value. Neither
    strategy implements a classification rule yet, and `predict_proba` /
    `predict_log_proba` are stubs.
    """

    def __init__(self, *, strategy="standard", random_state=None, constant=None):
        # strategy: name of the rule; validated in `fit`
        # random_state: seed / generator used for random predictions
        # constant: only read by the (currently unreachable) "constant" branches
        self.strategy = strategy
        self.random_state = random_state
        self.constant = constant

    def fit(self, X, y, sample_weight=None):
        """Fit the baseline classifier.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.
        Returns
        -------
        self : object
            Returns the instance itself.
        Raises
        ------
        ValueError
            If `strategy` is not one of "standard" or "tradesize".
        """
        allowed_strategies = ("standard", "tradesize")

        if self.strategy not in allowed_strategies:
            raise ValueError(
                "Unknown strategy type: %s, expected one of %s."
                % (self.strategy, allowed_strategies)
            )

        self._strategy = self.strategy

        # NOTE(review): unreachable, "uniform" is rejected by the check above
        if self._strategy == "uniform" and sp.issparse(y):
            y = y.toarray()
            warnings.warn(
                "A local copy of the target data has been converted "
                "to a numpy array. Predicting on sparse target data "
                "with the uniform strategy would not save memory "
                "and would be slower.",
                UserWarning,
            )

        self.sparse_output_ = sp.issparse(y)

        if not self.sparse_output_:
            y = np.asarray(y)
            y = np.atleast_1d(y)

        # treat a 1d target as a single output column
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        check_consistent_length(X, y)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # NOTE(review): unreachable, "constant" is rejected by the check above
        if self._strategy == "constant":
            if self.constant is None:
                raise ValueError(
                    "Constant target value has to be specified "
                    "when the constant strategy is used."
                )
            else:
                constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))
                if constant.shape[0] != self.n_outputs_:
                    raise ValueError(
                        "Constant target value should have shape (%d, 1)."
                        % self.n_outputs_
                    )

        # classes, number of classes and class priors per output
        (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(
            y, sample_weight
        )

        # NOTE(review): unreachable, see above
        if self._strategy == "constant":
            for k in range(self.n_outputs_):
                if not any(constant[k][0] == c for c in self.classes_[k]):
                    # Checking in case of constant strategy if the constant
                    # provided by the user is in y.
                    err_msg = (
                        "The constant target value must be present in "
                        "the training data. You provided constant={}. "
                        "Possible values are: {}.".format(
                            self.constant, list(self.classes_[k])
                        )
                    )
                    raise ValueError(err_msg)

        # unwrap the per-output lists for single-output targets
        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]
            self.class_prior_ = self.class_prior_[0]

        return self

    def predict(self, X):
        """Perform classification on test vectors X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.
        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Predicted target values for X.
        Notes
        -----
        NOTE(review): none of the strategy branches below matches a strategy
        accepted by `fit`. For dense targets `y` is thus never assigned before
        it is used; for sparse targets the result comes from
        `_random_choice_csc` called with `class_prob=None`.
        """
        check_is_fitted(self)

        # numpy random_state expects Python int and not long as size argument
        # under Windows
        n_samples = _num_samples(X)
        rs = check_random_state(self.random_state)

        n_classes_ = self.n_classes_
        classes_ = self.classes_
        class_prior_ = self.class_prior_
        constant = self.constant
        if self.n_outputs_ == 1:
            # Get same type even for self.n_outputs_ == 1
            n_classes_ = [n_classes_]
            classes_ = [classes_]
            class_prior_ = [class_prior_]
            constant = [constant]
        # Compute probability only once
        if self._strategy == "stratified":
            proba = self.predict_proba(X)
            if self.n_outputs_ == 1:
                proba = [proba]

        if self.sparse_output_:
            class_prob = None
            if self._strategy in ("most_frequent", "prior"):
                classes_ = [np.array([cp.argmax()]) for cp in class_prior_]

            elif self._strategy == "stratified":
                class_prob = class_prior_

            elif self._strategy == "uniform":
                raise ValueError(
                    "Sparse target prediction is not "
                    "supported with the uniform strategy"
                )

            elif self._strategy == "constant":
                classes_ = [np.array([c]) for c in constant]

            y = _random_choice_csc(n_samples, classes_, class_prob, self.random_state)
        else:
            if self._strategy in ("most_frequent", "prior"):
                y = np.tile(
                    [
                        classes_[k][class_prior_[k].argmax()]
                        for k in range(self.n_outputs_)
                    ],
                    [n_samples, 1],
                )

            elif self._strategy == "stratified":
                y = np.vstack(
                    [
                        classes_[k][proba[k].argmax(axis=1)]
                        for k in range(self.n_outputs_)
                    ]
                ).T

            elif self._strategy == "uniform":
                ret = [
                    classes_[k][rs.randint(n_classes_[k], size=n_samples)]
                    for k in range(self.n_outputs_)
                ]
                y = np.vstack(ret).T

            elif self._strategy == "constant":
                y = np.tile(self.constant, (n_samples, 1))

            # NOTE(review): `y` is unassigned here, if no branch above matched
            if self.n_outputs_ == 1:
                y = np.ravel(y)

        return y

    def predict_proba(self, X):
        """Not implemented yet; returns None."""
        pass

    def predict_log_proba(self, X):
        """Not implemented yet; returns None."""
        pass

    def _more_tags(self):
        # estimator tags for scikit-learn, including the estimator checks
        # marked as expected failures
        return {
            "poor_score": True,
            "no_validation": True,
            "_xfail_checks": {
                "check_methods_subset_invariance": "fails for the predict method",
                "check_methods_sample_order_invariance": "fails for the predict method",
            },
        }

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.
        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.
        Parameters
        ----------
        X : None or array-like of shape (n_samples, n_features)
            Test samples. Passing None as test samples gives the same result
            as passing real test samples, since DummyClassifier
            operates independently of the sampled observations.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.
        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        # substitute a dummy feature matrix with one row per label
        if X is None:
            X = np.zeros(shape=(len(y), 1))
        return super().score(X, y, sample_weight)



