In [None]:
import seaborn as sns
import shap
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from scipy.stats import kendalltau, pearsonr
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Put the project-local `utils` directory on the import path (at position 1)
# so that the `qif` module imported below can be resolved.
# NOTE(review): this is an absolute, user-specific path; it will presumably not
# resolve on another machine — consider a relative or configurable location.
sys.path.insert(1, '/home/guilherme-resende/Desktop/mono2/utils')
import qif

In [None]:
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# Single seed for the notebook; it initializes NumPy's global (legacy) random state.
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)

In [None]:
# Load the synthetic dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/synthetic_dataset_display_0.csv")

In [None]:
# Show the last three rows as a quick look at the loaded frame.
df.tail(3)

Split the synthetic data into train and test sets

In [None]:
# Draw 80% of the rows for training. Passing an explicit random_state makes the
# split the same on every run of this cell; relying on NumPy's global state
# alone gives a different split each time the cell is re-executed, because the
# global generator advances after every draw.
train_idx = df.sample(frac=0.8, random_state=RANDOM_SEED).index
# Every row that was not sampled is held out for testing (original row order kept).
test_idx = df.index[~df.index.isin(train_idx)]

# `train_idx` / `test_idx` hold index *labels*, so rows are selected with `.loc`.
# `.iloc` is positional and only matches labels on a default 0..n-1 index.
X_train = df.loc[train_idx].drop(columns="target")
Y_train = df.loc[train_idx, "target"]

X_test = df.loc[test_idx].drop(columns="target")
Y_test = df.loc[test_idx, "target"]

In [None]:
# Independent copy of the held-out rows (all columns, including "target").
# `test_idx` holds index labels, hence `.loc`. The explicit copy matters because
# columns are written into this frame later, and writing into a non-copied
# selection of `df` triggers pandas' SettingWithCopyWarning.
df_test = df.loc[test_idx].copy()

In [None]:
# Seed the classifier explicitly with the notebook-wide seed so the model does
# not depend on library defaults for its random number generator.
xgb = XGBClassifier(random_state=RANDOM_SEED)

Fit the model and predict on the test set

In [None]:
xgb.fit(X_train, Y_train)
preds = xgb.predict(X_test)
# Scores for the class at index 1, taken as a flat 1-D vector. Slicing with
# `[:, 1]` (instead of `[:, 1:]`) avoids an (n, 1) column that would need
# reshaping before it can be used as a DataFrame column.
preds_proba = xgb.predict_proba(X_test)[:, 1]

# `assign` returns a new frame with the extra column, so nothing is written
# into a slice of `df` (which would raise SettingWithCopyWarning) and the cell
# can be re-run safely.
df_test = df_test.assign(preds_proba=preds_proba)

As expected, the model was always able to predict the correct label.

In [None]:
# Compute both test-set metrics first, then report them on a single line.
f1 = f1_score(Y_test, preds)
auc = roc_auc_score(Y_test, preds_proba)
print(f"F1_Score is {f1}, whereas AUC is {auc}")

### QIF

In [None]:
# Discretize the probabilities into integer percentage values (truncated).
# The column is rebuilt from the raw `preds_proba` scores rather than from its
# own current contents, so re-running this cell does not multiply by 100 a
# second time. `assign` returns a new frame, avoiding a write into a slice.
df_test = df_test.assign(preds_proba=(np.ravel(preds_proba) * 100).astype(int))

In [None]:
# Names of the model's input columns; each is scored against the predictions below.
feature_names = X_test.columns
# Leakage estimator built over the full test frame (features, target and
# the discretized prediction scores).
bayes_leakage = qif.BayesLeakage(df_test)

In [None]:
# For every feature, take the largest flow reported between that feature and
# the discretized predictions.
raw_leakage = np.array([
    max(bayes_leakage.compute_flows(x=name, y='preds_proba'))
    for name in feature_names
])

# Normalize so the per-feature values sum to 1.
qif_values = raw_leakage / raw_leakage.sum()

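**Note:** these QIF results look unsatisfactory and need further investigation. The normalized values are plotted below.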

In [None]:
figure, ax = plt.subplots(figsize=(9, 7))

# Bar positions and tick labels are derived from `feature_names` instead of a
# hard-coded f0..f4 list, so the plot stays correct if the feature set changes.
# Numeric positions are used so ticks do not rely on categorical-axis conversion.
positions = np.arange(len(feature_names))
tick_labels = [str(name).upper() for name in feature_names]

ax.barh(positions, qif_values)
ax.set_yticks(positions)
ax.set_yticklabels(tick_labels)
ax.set_title("Feature Importance", fontsize=16)
ax.set_ylabel("Features Names", fontsize=14)
ax.set_xlabel("Normalized QIF Values", fontsize=14)

plt.show()

### SHAP

In [None]:
# Build a tree-specific SHAP explainer around the fitted classifier.
explainer = shap.TreeExplainer(xgb)

# Explain the held-out rows, passed as a plain array rather than a DataFrame.
test_matrix = X_test.values
shap_values = explainer.shap_values(test_matrix)

In [None]:
# Preview only the first rows instead of echoing the whole array.
shap_values[:5]

In [None]:
figure, ax = plt.subplots(figsize=(9, 7))

# One box per column of the SHAP matrix. Tick positions and labels are derived
# from the data instead of a hard-coded five-feature list; boxplot places the
# boxes at positions 1..n by default.
n_features = np.shape(shap_values)[1]
tick_positions = np.arange(1, n_features + 1)
tick_labels = [str(name).upper() for name in X_test.columns]

ax.boxplot(shap_values)
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)
ax.set_title("Feature Importance Distribution", fontsize=16)
ax.set_xlabel("Features", fontsize=14)
ax.set_ylabel("SHAP Values", fontsize=14)
plt.show()

In [None]:
# Collapse the per-sample SHAP matrix into one mean value per feature.
# The ndim guard keeps this cell idempotent: without it, a second run would
# average the already-collapsed vector down to a single scalar.
# NOTE(review): this is the signed mean, so positive and negative contributions
# can cancel — confirm that this (rather than mean |SHAP|) is intended.
if np.ndim(shap_values) == 2:
    shap_values = shap_values.mean(axis=0)

In [None]:
# Keep only the correlation statistic (first element of each result).
# Indexing is used instead of unpacking into `_`, because assigning to `_`
# shadows IPython's built-in "last output" variable for the rest of the session.
k_coef = kendalltau(shap_values, qif_values)[0]
p_coef = pearsonr(shap_values, qif_values)[0]

In [None]:
# Report both agreement measures between the SHAP and QIF feature rankings.
for label, coefficient in (("Kendall:", k_coef), ("Pearson:", p_coef)):
    print(label, coefficient)