In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
# import shap
import sklearn
# import torch
#from catboost import CatBoostClassifier, Pool
# from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
from torch import nn

sys.path.append("..")
from otc.models.classical_classifier import ClassicalClassifier
# from otc.models.fttransformer import FTTransformer
# from otc.models.tabtransformer import TabTransformer
# from otc.models.transformer_classifier import TransformerClassifier


from otc.features.build_features import (
    features_categorical,
    features_classical,
    features_classical_size,
    features_ml,
)
# shap.initjs()

import wandb
from tqdm.auto import tqdm

In [None]:
# --- reproducibility ---
SEED = 42

# --- dataset selection; set globally here ---
EXCHANGE, STRATEGY, SUBSET = "ise", "supervised", "test"

# --- model-dependent settings: change BOTH when switching the model ---
# MODE is part of the dataset artefact name; alternative value: "log_standardized"
MODE = "none"
# feature set whose columns are read from the parquet file
FEATURES = features_classical_size

In [None]:
# Key used for files and artefacts. It is composed from the globally
# configured EXCHANGE, STRATEGY and MODE, always refers to the `latest`
# alias, and is later passed to `run.use_artifact`.
dataset = f"fbv/thesis/{EXCHANGE}_{STRATEGY}_{MODE}:latest"

In [None]:
# Set the project name through the process environment. Required to access
# files and artefacts.
# NOTE(review): the project id is hard-coded; consider taking it from the
# caller's environment instead of overwriting it here.
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

In [None]:
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
# Start a Weights & Biases run in project "thesis" of entity "fbv".
run = wandb.init(project="thesis", entity="fbv")

# Fetch the dataset artefact through the run and download it. `data_dir` is
# whatever `artifact.download()` returns; it is used below as the directory
# that contains the parquet file.
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

In [None]:
# restrict the read to the configured feature set plus the label column
columns = list(FEATURES)
columns.append("buy_sell")

test_path = Path(data_dir) / "test_set.parquet"
data = pd.read_parquet(test_path, engine="fastparquet", columns=columns)

# separate the label from the features
X_test = data.drop(columns="buy_sell")
y_test = data["buy_sell"]

## Define dependency structure for permutation

In [None]:
def _ordered_union(*groups):
    """Merge feature groups into one list: first-seen order, duplicates dropped.

    A plain ``set`` union would yield a hash-dependent order, which makes the
    "/"-joined result keys differ between kernel sessions.
    """
    return list(dict.fromkeys(name for group in groups for name in group))


# Every list below is a group of columns that is permuted jointly.

# features classical
n1 = ["TRADE_PRICE", "bid_ex", "ask_ex", "prox_ex"]
n2 = ["TRADE_PRICE", "BEST_ASK", "BEST_BID", "prox_best"]
n3 = ["TRADE_PRICE", "price_ex_lag", "price_all_lag", "chg_ex_lag", "chg_all_lag"]
n4 = ["TRADE_PRICE", "price_ex_lead", "price_all_lead", "chg_ex_lead", "chg_all_lead"]

n12 = _ordered_union(n1, n2)
n34 = _ordered_union(n3, n4)
n_classical = _ordered_union(n12, n34)


# features size
n_size = [
    "TRADE_SIZE",
    "ask_size_ex",
    "bid_ask_size_ratio_ex",
    "bid_size_ex",
    "depth_ex",
    "rel_ask_size_ex",
    "rel_bid_size_ex",
]

# ml features
n6 = ["issue_type"]
n7 = ["option_type"]
n8 = ["root"]
n9 = ["day_vol"]
n10 = ["TRADE_PRICE", "myn", "STRK_PRC", "ttm"]

n_option = [*n6, *n7, *n8, *n9, *n10]

# Select the groups that match the configured feature set. Fail loudly for an
# unknown feature set instead of running into a NameError further down.
if FEATURES == features_classical:
    permutations = [n1, n2, n3, n4]
elif FEATURES == features_classical_size:
    permutations = [n1, n2, n3, n4, n_classical, n_size]
elif FEATURES == features_ml:
    permutations = [n1, n2, n3, n4, n_classical, n_size, n6, n7, n8, n9, n10, n_option]
else:
    raise ValueError(
        "FEATURES must be one of features_classical, features_classical_size "
        "or features_ml to derive the permutation groups."
    )

print(permutations)

In [None]:
# classical rule stack; `strategy="random"` together with the seed is passed
# on to the classifier as-is
clf = ClassicalClassifier(
    layers=[("trade_size", "ex"), ("rev_lr", "best")],
    random_state=SEED,
    strategy="random",
)
# fit is only used to set sklearn attributes, no leakage;
# the first five rows are enough for that
clf.fit(X=X_test.iloc[:5], y=y_test.iloc[:5])

In [None]:
n_repeats = 2

# accuracy on the unpermuted test set; stored as "base" for every group
base_acc = clf.score(X_test, y_test)

results = []
for permutation in tqdm(permutations):
    accuracies_iter = [base_acc]
    indices_iter = ["base"]
    # similar to Fisher et al., permute multiple times
    for i in tqdm(range(n_repeats)):

        # Generate the random permutation from a local, seeded generator.
        # `RandomState(i)` draws the same permutation as
        # `np.random.seed(i); np.random.permutation(...)` but leaves numpy's
        # global random state untouched.
        permuted_indices = np.random.RandomState(i).permutation(len(X_test))

        # Permute the relevant columns jointly (same row order for all of
        # them); the labels stay in their original order.
        X_test_perm = X_test.copy()
        X_test_perm[permutation] = X_test_perm[permutation].to_numpy()[permuted_indices]
        perm_acc = clf.score(X_test_perm, y_test)

        # store raw scores to estimate uncertainties etc. Calculate change later.
        accuracies_iter.append(perm_acc)
        indices_iter.append(f"iter-{i}")

    # one single-column frame per feature group, indexed by "base" / "iter-<i>"
    results.append(pd.DataFrame(accuracies_iter, index=indices_iter))

In [None]:
# label every result frame with the "/"-joined names of its permuted columns
keys = list(map("/".join, permutations))

# place the per-group frames side by side, then show one row per group
joint_results = pd.concat(results, keys=keys, axis=1)
joint_results.transpose()

In [None]:
# NOTE(review): `train_test_split`, `shap` and `CatBoostClassifier` are used
# here, but their imports are commented out in the first cell. Re-enable them
# before running this cell on a fresh kernel.
# NOTE(review): this rebinds `X_test` / `y_test`, which were loaded from the
# parquet file further up.
# Toy example: split the iris data set shipped with shap into train / test
# (test_size=0.2).
X_train, X_test, y_train, y_test = train_test_split(
    *shap.datasets.iris(), test_size=0.2, random_state=0
)
shap.initjs()

# Fit a CatBoost classifier with default arguments and print accuracy and
# predicted probabilities on the held-out split.
model = CatBoostClassifier()
model.fit(X_train, y_train)
print(accuracy_score(y_test, model.predict(X_test)))
print(model.predict_proba(X_test))


# shap values with kernel explainer
# `X_train` is handed to the explainer as its data argument.
explainer = shap.KernelExplainer(model.predict_proba, X_train)
shap_values = explainer.shap_values(X_test)
# NOTE(review): index 0 presumably selects the first class -- confirm.
shap.summary_plot(shap_values[0], X_test, plot_type="bar")


In [None]:
# shap values with tree explainer
# Relies on `model` and `X_test` still being bound by an earlier cell.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# NOTE(review): index 0 presumably selects the first class -- confirm.
shap.summary_plot(shap_values[0], X_test, plot_type="bar")


In [None]:
# see https://catboost.ai/en/docs/concepts/shap-values
# NOTE(review): `Pool` is only present as a commented-out import in the first cell.
shap_values = model.get_feature_importance(data=Pool(X_test, y_test), type="ShapValues")
# The indexing below treats the result as 3-dimensional and drops the last
# entry of the final axis.
# NOTE(review): presumably (observations, classes, features + 1) with the
# expected value stored last -- confirm against the CatBoost documentation.
shap.summary_plot(shap_values[:, 0, :-1], X_test, plot_type="bar")


In [None]:
# similar to random feature permutation
# https://catboost.ai/en/docs/concepts/fstr#regular-feature-importance
# The bare call is the last expression of the cell, so its result is displayed.
# NOTE(review): `Pool` is only present as a commented-out import in the first cell.
model.get_feature_importance(
    data=Pool(X_test, y_test), type="FeatureImportance", prettified=True
)


In [None]:
# random feature permutation sklearn
# NOTE(review): `permutation_importance` is only present as a commented-out
# import in the first cell; re-enable it before running this cell.
r = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=0)
# results are average; obviously not normalized to one.
# Features are listed from most to least important (descending mean).
for i in r.importances_mean.argsort()[::-1]:
    print(
        # Left-pad the name so that it is separated from the value; without
        # the padding name and mean run together in the output.
        f"{X_test.columns[i]:<20}"
        f"{r.importances_mean[i]:.3f}"
        f" +/- {r.importances_std[i]:.3f}"
    )


## Attention Maps for Transformers

We calculate the average attention map from all transformer blocks, as done in the Gorishniy paper (see [here](https://github.com/Yura52/tabular-dl-revisiting-models/issues/2)). This is different from the Borisov paper (see [here](https://github.com/kathrinse/TabSurvey/blob/main/models/basemodel_torch.py)).

In [None]:
import sys
from typing import List

import pandas as pd
import seaborn as sns
import scipy.stats
import torch
from torch import nn

sys.path.append("..")
from otc.models.tabtransformer import TabTransformer


In [None]:
# Toy problem: random inputs for a small TabTransformer.
num_features_cont = 5
num_features_cat = 3
num_unique_cat = (2, 2, 2)
batch_size = 64

assert len(num_unique_cat) == num_features_cat

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `torch.randint` excludes `high`. Draw every categorical column from
# [0, cardinality) so that all categories can occur; `randint(0, 1, ...)`
# would only ever produce zeros.
x_cat = torch.stack(
    [torch.randint(0, cardinality, (batch_size,)) for cardinality in num_unique_cat],
    dim=1,
).to(device)
x_cont = torch.randn(batch_size, num_features_cont).float().to(device)
# binary targets in {0, 1}
expected_outputs = torch.randint(0, 2, (batch_size, 1)).float().to(device)

model = TabTransformer(
    cat_cardinalities=num_unique_cat,
    num_continuous=num_features_cont,
    dim_out=1,
    mlp_act=nn.ReLU,
    dim=32,
    depth=2,
    heads=6,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(4, 2),
).to(device)


In [None]:
class SaveAttentionMaps:
    """
    Forward hook that collects attention maps.

    Register an instance with ``module.register_forward_hook``. On every call
    the hook reads ``output[1]["attention_probs"]`` from the hooked module's
    output and appends it to ``attention_maps``.

    Inspired by:
    https://github.com/Yura52/tabular-dl-revisiting-models/issues/2#issuecomment-1068123629

    Args:
        verbose: if ``True``, print the shape of every captured attention map.
            Defaults to ``False`` to keep the cell output free of debug noise.
    """

    def __init__(self, verbose: bool = False):
        self.verbose = verbose
        # one entry per hook call, in call order
        self.attention_maps: List[torch.Tensor] = []

    def __call__(self, _, __, output):
        """Store the attention probabilities found in ``output``.

        The first two arguments (module and its inputs) are ignored. Nothing
        is returned, so the module output is left unchanged.
        """
        attention_probs = output[1]["attention_probs"]
        if self.verbose:
            print(attention_probs.shape)
        # detach so that stored maps never keep an autograd graph alive
        self.attention_maps.append(attention_probs.detach())


In [None]:
# The following hook will save all attention maps from all attention modules.
hook = SaveAttentionMaps()
handles = [
    block.attention.fn.fn.register_forward_hook(hook)
    for block in model.transformer.blocks
]

# Apply the model to all objects. The hooks are removed afterwards so that
# re-running this cell does not stack additional hooks on the model.
model.eval()
try:
    with torch.inference_mode():
        model(x_cat.clone(), x_cont.clone())
finally:
    for handle in handles:
        handle.remove()

# Collect attention maps
n_objects = len(x_cat)
n_blocks = len(model.transformer.blocks)
n_heads = model.transformer.blocks[0].attention.fn.fn.n_heads

attention_maps = torch.cat(hook.attention_maps)

# Flatten everything except the two token axes, then verify that exactly one
# map per object, block and head was collected.
attention_maps = attention_maps.reshape(-1, num_features_cat, num_features_cat)
assert attention_maps.shape == (
    n_objects * n_blocks * n_heads,
    num_features_cat,
    num_features_cat,
)

# Calculate feature importance and ranks.
average_attention_map = attention_maps.mean(0)
# NOTE(review): the last row of the averaged map is used as importance --
# confirm which token this row corresponds to for TabTransformer.
feature_importance = average_attention_map[-1]

feature_importance = feature_importance.cpu().numpy()
# rank 1 = highest importance
feature_ranks = scipy.stats.rankdata(-feature_importance)
feature_indices_sorted_by_importance = feature_importance.argsort()[::-1]

print(feature_importance)
print(feature_ranks)
print(feature_indices_sorted_by_importance)


In [None]:
# One label per feature ("f1", "f2", ...), derived from the importance vector
# so that the plot keeps working when the number of features changes.
feature_names = [f"f{idx}" for idx in range(1, len(feature_importance) + 1)]

ax = sns.barplot(x=feature_importance, y=feature_names)
ax.set(xlim=(0, 1), xlabel="average attention", ylabel="feature");


In [None]:
from otc.models.activation import ReGLU
from otc.models.fttransformer import (
    CategoricalFeatureTokenizer,
    CLSToken,
    FeatureTokenizer,
    FTTransformer,
    MultiheadAttention,
    NumericalFeatureTokenizer,
    Transformer,
)

# Toy problem: random inputs for a small FTTransformer.
num_features_cont = 5
num_features_cat = 1
cat_cardinalities = [2]
batch_size = 64

assert len(cat_cardinalities) == num_features_cat

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `torch.randint` excludes `high`. Draw every categorical column from
# [0, cardinality) so that all categories can occur; `randint(0, 1, ...)`
# would only ever produce zeros.
x_cat = torch.stack(
    [torch.randint(0, cardinality, (batch_size,)) for cardinality in cat_cardinalities],
    dim=1,
).to(device)
x_cont = torch.randn(batch_size, num_features_cont).float().to(device)
# binary targets in {0, 1}
expected_outputs = torch.randint(0, 2, (batch_size, 1)).float().to(device)

# tokenizer for continuous and categorical features
params_feature_tokenizer = {
    "num_continous": num_features_cont,
    "cat_cardinalities": cat_cardinalities,
    "d_token": 96,
}
feature_tokenizer = FeatureTokenizer(**params_feature_tokenizer)

# transformer backbone; `d_token` has to match the tokenizer's `d_token`
params_transformer = {
    "d_token": 96,
    "n_blocks": 3,
    "attention_n_heads": 8,
    "attention_initialization": "kaiming",
    "ffn_activation": ReGLU,
    "attention_normalization": nn.LayerNorm,
    "ffn_normalization": nn.LayerNorm,
    "ffn_dropout": 0.1,
    "ffn_d_hidden": 96 * 2,
    "attention_dropout": 0.1,
    "residual_dropout": 0.1,
    "prenormalization": True,
    "first_prenormalization": False,
    "last_layer_query_idx": None,
    "n_tokens": None,
    "kv_compression_ratio": None,
    "kv_compression_sharing": None,
    "head_activation": nn.ReLU,
    "head_normalization": nn.LayerNorm,
    "d_out": 1,
}

transformer = Transformer(**params_transformer)

model = FTTransformer(feature_tokenizer, transformer).to(device)


In [None]:
# Prepare data and model.
n_objects = len(x_cat)  # number of rows in the batch
n_features = num_features_cont + num_features_cat

# The following hook will save all attention maps from all attention modules.
hook = SaveAttentionMaps()
handles = [
    block.attention.register_forward_hook(hook)
    for block in model.transformer.blocks
]

# Apply the model to all objects. The hooks are removed afterwards so that
# re-running this cell does not stack additional hooks on the model.
model.eval()
try:
    with torch.inference_mode():
        model(x_cat, x_cont)
finally:
    for handle in handles:
        handle.remove()

# Collect attention maps
n_blocks = len(model.transformer.blocks)
n_heads = model.transformer.blocks[0].attention.n_heads
n_tokens = n_features + 1  # all features plus the [CLS] token
attention_maps = torch.cat(hook.attention_maps)
assert attention_maps.shape == (n_objects * n_blocks * n_heads, n_tokens, n_tokens)

# Calculate feature importance and ranks.
average_attention_map = attention_maps.mean(0)
average_cls_attention_map = average_attention_map[-1]  # consider only the [CLS] token
feature_importance = average_cls_attention_map[:-1]  # drop the [CLS] token importance
assert feature_importance.shape == (n_features,)

feature_importance = feature_importance.cpu().numpy()
# rank 1 = highest importance
feature_ranks = scipy.stats.rankdata(-feature_importance)
feature_indices_sorted_by_importance = feature_importance.argsort()[::-1]

print(feature_importance)
print(feature_ranks)
print(feature_indices_sorted_by_importance)


In [None]:
# One label per feature ("f1", "f2", ...), derived from the importance vector
# so that the plot keeps working when the number of features changes.
feature_names = [f"f{idx}" for idx in range(1, len(feature_importance) + 1)]

ax = sns.barplot(x=feature_importance, y=feature_names)
ax.set(xlim=(0, 1), xlabel="average [CLS] attention", ylabel="feature");
