In [None]:
import os
from pathlib import Path
from typing import List, Optional

import gcsfs
import google.auth
import numpy as np
import pandas as pd
import torch
import wandb
from google.colab import auth, output
from sklearn.metrics import accuracy_score
from torch import nn
from tqdm.notebook import tqdm


In [None]:
# Connect to Google Cloud Storage.
# When an interactive login is required (e.g. on Colab), run these first:
# auth.authenticate_user()
# credentials, _ = google.auth.default()
fs_prefix = "gs://"

os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"
fs = gcsfs.GCSFileSystem(project="thesis")

In [None]:
# Feature columns fed to the model; the order here defines the column order
# of the model input.
features_classical_size = [
    "TRADE_PRICE",
    "bid_ask_size_ratio_ex",
    "rel_bid_size_ex",
    "rel_ask_size_ex",
    "TRADE_SIZE",
    "bid_size_ex",
    "ask_size_ex",
    "rel_ask_ex",
    "rel_bid_ex",
    "BEST_rel_bid",
    "BEST_rel_ask",
    "bid_ask_ratio_ex",
    "chg_ex_lead",
    "chg_ex_lag",
    "chg_all_lead",
    "chg_all_lag",
    "ask_ex",
    "bid_ex",
    "BEST_ASK",
    "BEST_BID",
    "price_all_lag",
    "price_all_lead",
    "price_ex_lag",
    "price_ex_lead",
]

In [None]:
# Reference run for the artifacts used below:
# https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze

run = wandb.init(entity="fbv", project="thesis")

# Dataset artifact: downloaded to a local directory.
dataset = "fbv/thesis/classical_size_features_log_normalized:v0"
data_dir = run.use_artifact(dataset).download()

# Model checkpoint artifact. Note that `model` holds the artifact *name* here;
# the same name is rebound to the network object in a later cell.
model = "fbv/thesis/3lfsbuby_TabTransformer_default_trial_82.pth:v0"
artifact = run.use_artifact(model)
model_dir = artifact.download()

In [None]:
# Read the test split from the downloaded dataset artifact, then separate the
# target column from the model features.
X_test = pd.read_parquet(
    Path(data_dir) / "test_set_20.parquet",
    engine="fastparquet",
)

y_test = X_test.loc[:, "buy_sell"]
X_test = X_test.loc[:, features_classical_size]


In [None]:
# Preview the first rows of the selected feature columns.
X_test.head()

## TabTransformer Baseline 🦾

In [None]:
import sys

sys.path.append("..")
from otc.models.tabtransformer import TabTransformer
from otc.data.dataset import TabDataset
from otc.data.dataloader import TabDataLoader

In [None]:
# Hyperparameters of the trial whose checkpoint is loaded below; see
# https://wandb.ai/fbv/thesis/runs/4fmccjm7/files/wandb-summary.json
# https://wandb.ai/fbv/thesis/artifacts/model/3lfsbuby_TabTransformer_default_trial_82.pth/3a1937a3e6ec748d45a3/metadata
params = {
    "dim": 32,
    "depth": 3,
    "heads": 2,
    "weight_decay": 0.00835620489462654,
    "lr": 0.0015514372468568292,
    "dropout": 0.1,
    "batch_size": 32768,
}

# NOTE: despite its name, `training_data` wraps the *test* split
# (X_test / y_test); the name is kept so other cells that use it keep working.
# The two empty lists mean no categorical features are declared.
training_data = TabDataset(X_test, y_test, [], [])

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# differentiate between continuous features only and mixed.
test_loader = TabDataLoader(
    training_data.x_cat,
    training_data.x_cont,
    training_data.y,
    batch_size=params["batch_size"],
    device=device,
)

# The architecture must match the checkpoint, hence the values from `params`.
model = TabTransformer(
    cat_cardinalities=[],
    num_continuous=len(features_classical_size),
    dim_out=1,
    mlp_act=nn.ReLU,
    dim=params["dim"],
    depth=params["depth"],
    heads=params["heads"],
    attn_dropout=params["dropout"],
    ff_dropout=params["dropout"],
    mlp_hidden_mults=(4, 2),
).to(device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine also loads on a CPU-only machine.
model.load_state_dict(
    torch.load(
        Path(model_dir, "3lfsbuby_TabTransformer_default_trial_82.pth"),
        map_location=device,
    )
)
model.eval()

y_pred, y_true = [], []

# Pure inference: no autograd graph is needed, which saves memory per batch.
with torch.no_grad():
    for x_cat, x_cont, targets in test_loader:
        # `logits` (not `output`) so the imported google.colab `output`
        # module is not overwritten by this loop.
        logits = model(x_cat, x_cont)

        # map between zero and one, sigmoid is otherwise included in loss already
        # https://stackoverflow.com/a/66910866/5755604
        # reshape(-1) instead of squeeze(): a batch holding a single row stays
        # 1-d and can still be concatenated below.
        probabilities = torch.sigmoid(logits.reshape(-1))
        y_pred.append(probabilities.cpu().numpy())
        y_true.append(targets.cpu().numpy())  # type: ignore

# round prediction to nearest int
y_pred = np.rint(np.concatenate(y_pred))
y_true = np.concatenate(y_true)

# map zeros back to -1
y_pred[y_pred == 0] = -1
y_true[y_true == 0] = -1

In [None]:
# Overall accuracy of the rounded predictions against the true labels
# (both arrays had their zeros mapped back to -1 in the previous cell).
accuracy_score(y_true, y_pred)

In [None]:
# Load the default (unscaled) test set, which carries all available columns,
# for the breakdowns below.
test_orig = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet",
    engine="fastparquet",
)

In [None]:
# Copy unscaled columns
X_print = test_orig.copy()

# add baseline results
# Predictions are attached positionally.
# NOTE(review): this assumes `test_orig` holds the same rows in the same order
# as the frame the predictions were computed on -- confirm.
X_print["rule"] = "Baseline"
X_print["buy_sell_predicted"] = y_pred

# prepare columns for printing
# Time to maturity in whole calendar months. This equals the difference of the
# two monthly periods, but is computed with vectorized integer arithmetic
# instead of a per-row Python lambda over period offsets.
X_print["ttm"] = (
    X_print["EXPIRATION"].dt.year - X_print["QUOTE_DATETIME"].dt.year
) * 12 + (X_print["EXPIRATION"].dt.month - X_print["QUOTE_DATETIME"].dt.month)
X_print["year"] = X_print["QUOTE_DATETIME"].dt.year

# pd.cut uses right-closed intervals, matching the "(a,b]" labels.
bins_tradesize = [-np.inf, 1, 3, 5, 11, np.inf]
trade_size_labels = ["(0,1]", "(1,3]", "(3,5]", "(5,11]", ">11"]
X_print["TRADE_SIZE_binned"] = pd.cut(
    X_print["TRADE_SIZE"], bins_tradesize, labels=trade_size_labels
)

# (2004, 2007] -> "2005-2007", (2007, 2010] -> "2008-2010", then single years.
bins_years = [2004, 2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
year_labels = [
    "2005-2007",
    "2008-2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
]
X_print["year_binned"] = pd.cut(X_print["year"], bins_years, labels=year_labels)

bins_ttm = [-np.inf, 1, 2, 3, 6, 12, np.inf]
ttm_labels = [
    "ttm <= 1 month",
    "ttm (1-2] month",
    "ttm (2-3] month",
    "ttm (3-6] month",
    "ttm (6-12] month",
    "ttm > 12 month",
]
X_print["ttm_binned"] = pd.cut(X_print["ttm"], bins_ttm, labels=ttm_labels)


# Bins over the "myn" column (presumably moneyness -- verify against the
# preprocessing code).
bins_myn = [-np.inf, 0.7, 0.9, 1.1, 1.3, np.inf]
myn_labels = [
    "mny <=0.7",
    "mny (0.7-0.9]",
    "mny (0.9-1.1]",
    "mny (1.1-1.3]",
    "mny > 1.3",
]
X_print["myn_binned"] = pd.cut(X_print["myn"], bins_myn, labels=myn_labels)

# Collapse the raw issue-type codes into three readable groups.
X_print["issue_type_binned"] = X_print["issue_type"].replace(
    {
        "0": 'Stock options',
        'A': 'Index options',
        '7': 'Others',
        'F': 'Others',
        '%': 'Others',
        ' ': 'Others',
    }
)


# TODO: time from previous trade; same underlying or any?


In [None]:
def check_robustness(
    criterion: str = "year_binned", data: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """
    Check robustness of rules by calculating the accuracy for a given
    criterion and rules.

    Example:
        rule               Baseline
        TRADE_SIZE_binned
        (0,1]              0.710966
        (1,3]              0.717664
        (3,5]              0.715195
        (5,11]             0.699428
        >11                0.688348

    Missing predictions (NaN in ``buy_sell_predicted``) are replaced by a
    random draw from {-1, 1} with equal probability. The fill is written back
    into the evaluated frame, so repeated calls for different criteria are
    scored on the same completed predictions.

    Args:
        criterion (str, optional): column to break the accuracy down by.
        Defaults to "year_binned".
        data (pd.DataFrame, optional): frame with the columns "rule",
        "buy_sell", "buy_sell_predicted" and ``criterion``. Defaults to None,
        in which case the notebook-level ``X_print`` frame is used.

    Returns:
        pd.DataFrame: DataFrame with accuracy of rules. Rule in columns and
        criterion values in rows, plus an "avg" column with the row mean.
    """
    frame = X_print if data is None else data

    # fill others randomly with equal weight for every class; one vectorized
    # draw instead of a Python call per row.
    missing = frame["buy_sell_predicted"].isna()
    if missing.any():
        frame.loc[missing, "buy_sell_predicted"] = np.random.choice(
            [-1, 1], size=int(missing.sum())
        )

    # Accuracy per group is the mean of the element-wise "prediction correct"
    # flag. observed=False keeps rows for unobserved categories of a
    # categorical criterion (shown as NaN).
    hits = frame["buy_sell"].eq(frame["buy_sell_predicted"])

    # calculate average over columns if multiple subsets are combined
    results = (
        hits.groupby([frame["rule"], frame[criterion]], observed=False)
        .mean()
        .unstack(level=0)
        .assign(avg=lambda x: x.mean(axis=1))
    )
    return results


In [None]:
# Accuracy by year bucket of the quote timestamp.
check_robustness("year_binned")


In [None]:
# Accuracy by the values of the OPTION_TYPE column.
check_robustness("OPTION_TYPE")


In [None]:
# Accuracy by grouped issue type (stock options / index options / others).
check_robustness("issue_type_binned")

In [None]:
# Accuracy by trade-size bucket.
check_robustness("TRADE_SIZE_binned")


In [None]:
# Accuracy by time-to-maturity bucket (months between quote and expiration).
check_robustness("ttm_binned")


In [None]:
# Accuracy by bucket of the "myn" column (presumably moneyness -- verify).
check_robustness("myn_binned")


## Shap attributions

In [None]:
import shap

In [None]:
class TabModel:
    """Adapter that gives a torch model an array-in / array-out ``predict``.

    It is used to hand the network to ``shap.KernelExplainer``, which calls
    the prediction function with plain arrays.
    """

    def __init__(self, model):
        """Store the model to wrap.

        Args:
            model: callable taking ``(x_cat, x_cont)`` batches and returning
            logits. It is used as passed; its train/eval mode is not changed.
        """
        self._model = model

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return sigmoid outputs of the wrapped model for every row of ``X``.

        Args:
            X (np.ndarray): 2-d array of feature rows; all columns are passed
            on as continuous features.

        Returns:
            np.ndarray: array of shape (len(X), 1) with values in [0, 1].
        """
        # TODO: infer correct cat columns
        X = pd.DataFrame(X)
        # TabDataset is constructed with targets; they are not used for
        # prediction, so a dummy series of matching length is supplied.
        y = pd.Series(range(len(X)))
        test_data = TabDataset(X, y, [], [])

        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")

        # differentiate between continuous features only and mixed.
        sample_loader = TabDataLoader(
            test_data.x_cat,
            test_data.x_cont,
            batch_size=params["batch_size"],
            device=device,
        )

        y_pred = []

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            for x_cat, x_cont in sample_loader:
                logits = self._model(x_cat, x_cont)

                # map between zero and one, sigmoid is otherwise included in loss already
                # https://stackoverflow.com/a/66910866/5755604
                # reshape(-1) keeps a single-row batch 1-d so that it can be
                # concatenated with the others.
                probabilities = torch.sigmoid(logits.reshape(-1))
                y_pred.append(probabilities.cpu().numpy())

        if not y_pred:
            return np.empty((0, 1))

        # Concatenate *all* batches; returning only the first one would drop
        # every row beyond `batch_size`.
        return np.concatenate(y_pred).reshape(-1, 1)

In [None]:
def get_probilistic_predictions(model, X):
    """Wrap ``model`` in a ``TabModel`` and return its predictions for ``X``.

    Args:
        model: network to wrap.
        X: feature rows handed on to ``TabModel.predict``.

    Returns:
        Whatever ``TabModel.predict`` returns for ``X``.
    """
    return TabModel(model).predict(X)
    

In [None]:
def f(x):
    """Prediction function handed to KernelSHAP: rows in, probabilities out."""
    return get_probilistic_predictions(model, x)


# Rows to explain. A fixed random_state keeps the explained rows, and thereby
# the attributions below, the same across kernel restarts.
X_sample = X_test.sample(n=50, random_state=42)

# Background set of 52 rows drawn from the test set.
kernelshap = shap.KernelExplainer(f, shap.sample(X_test, 52))
# nsamples = no. of feature coalitions
shap_values = kernelshap.shap_values(X_sample, nsamples=256)

shap_values

In [None]:
# Bring the KernelSHAP result into a (rows x features) DataFrame.
# Depending on the installed shap release, a model with one output column
# yields either a list holding one 2-d array or a single ndarray with a
# trailing output axis; both layouts are handled. The DataFrame check makes
# the cell safe to re-run, since it rebinds its own input name.
if not isinstance(shap_values, pd.DataFrame):
    if isinstance(shap_values, list):
        shap_matrix = shap_values[0]
    else:
        shap_matrix = np.asarray(shap_values)
        if shap_matrix.ndim == 3:
            # (rows, features, outputs) -> keep the single output
            shap_matrix = shap_matrix[..., 0]
    shap_values = pd.DataFrame(shap_matrix, columns=X_test.columns)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Min-max scale the absolute attributions, then draw them as a heatmap with
# one column per feature.
# NOTE(review): `shap_values` is a DataFrame at this point, so whether
# np.min / np.max reduce per column or over the whole frame may depend on the
# installed pandas version -- confirm which normalization is intended.
attrs_abs = np.abs(shap_values)
attrs_abs = attrs_abs - np.min(attrs_abs)
attrs_abs = attrs_abs / np.max(attrs_abs)

plt.ioff()
plt.matshow(attrs_abs)
plt.xticks(np.arange(X_test.shape[1]), X_test.columns, rotation=90)
plt.show()