In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import shap
import itertools
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import yaml
import sys

from scipy.stats import kendalltau, pearsonr
from sklearn.metrics import jaccard_similarity_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

sys.path.insert(1, '/home/guilherme-resende/Desktop/mono2/utils')
import qif
from categorize import Categorize

In [None]:
# Apply seaborn's default styling to the matplotlib figures drawn later in the notebook.
sns.set()

In [None]:
# Seed NumPy's global random number generator.
# NOTE(review): the XGBClassifier and LogisticRegression estimators created below
# are not passed this seed explicitly.
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)

#### Load Data

In [None]:
# Key into datasets.yaml that selects the dataset's CSV path and column lists.
ds_name = "adult"

In [None]:
# Read the dataset registry.
# yaml.safe_load replaces the bare yaml.load(...) call: without an explicit Loader,
# yaml.load can build arbitrary Python objects on older PyYAML releases and raises
# TypeError on PyYAML >= 6. The context manager also closes the file handle, which
# the original open(...) call never did.
with open("datasets.yaml") as registry_file:
    datasets = yaml.safe_load(registry_file)

df = pd.read_csv(datasets[ds_name]["path"])

# Project-local preprocessing, configured with the column groups declared for this dataset.
pre_process = Categorize(
    df,
    binary_cols=datasets[ds_name]["binary_cols"],
    hierarchical_continuous_cols=datasets[ds_name]["hierarchical_continuous_cols"],
    non_hierarchical_cols=datasets[ds_name]["non_hierarchical_cols"]
)

df = pre_process.transform_data()

# The transformed frame carries a `set` column marking each row as "train" or "test";
# split on it and drop the marker from both halves.
df_train = df.loc[df.set == "train"].drop("set", axis=1)
df_test = df.loc[df.set == "test"].drop("set", axis=1)

# The combined frame is no longer needed once the two splits exist.
del df

In [None]:
# Plan: carve a validation split out of the training data during development;
# the final model is fit on the whole training split and scored on the test split.

# Training features and labels.
X = df_train.drop(columns="targets")
Y = df_train["targets"]

# Held-out features, with the labels extracted as a plain NumPy array.
X_test = df_test.drop(columns="targets")
Y_test = df_test["targets"].values

#### Train XGBoost

In [None]:
# Gradient-boosted tree classifier fitted on the full training split.
# Features are passed as a NumPy array (X.values) rather than as a DataFrame.
xgb = XGBClassifier(max_depth=8, n_estimators=128)
xgb.fit(X.values, Y)

In [None]:
# Hard class predictions for the test split.
preds = xgb.predict(X_test.values)
# The `1:` slice (instead of the index `1`) keeps every column from index 1 onward,
# so the result stays two-dimensional.
preds_proba = xgb.predict_proba(X_test.values)[:, 1:]
# Stored on df_test so the QIF computation further down can read it as the
# `preds_proba` column.
df_test["preds_proba"] = preds_proba

# Test-set metrics: F1 from the hard labels, AUC from the predicted probabilities.
print("Metrics:")
print("\tF1-Score ->", f1_score(Y_test, preds))
print("\tAUC ->", roc_auc_score(Y_test, preds_proba))

#### Get the SHAP scores

In [None]:
%%time

explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test.values)

shap_values = shap_values.mean(axis=0)

#### Get the QIF scores

In [None]:
# Discretize the probabilities: scale by 100 and cast to int (the cast drops the
# fractional part).
# NOTE(review): the column is overwritten in place, so running this cell a second
# time without re-running the prediction cell multiplies the values by 100 again.
df_test["preds_proba"] = (df_test["preds_proba"] * 100).astype(int)

In [None]:
# Leakage estimator from the project-local `qif` module, constructed over the whole
# test frame (which at this point also holds the `targets` and `preds_proba` columns).
bayes_leakage = qif.BayesLeakage(df_test)
# Column labels of X_test, i.e. the test frame without `targets`.
feature_names = X_test.columns

In [None]:
%%time

qif_values = []
for feature in feature_names:
    leakage = bayes_leakage.compute_flows(x=feature, y='preds_proba')
    qif_values.append(max(leakage))

qif_values = np.array(qif_values)
qif_values = qif_values / qif_values.sum() # Normalize

In [None]:
# Display the normalized QIF scores (ordered like feature_names).
qif_values

In [None]:
# Map each feature name to its normalized QIF score.
feat_coefs_qif = dict(zip(feature_names, qif_values))

#### Train a Linear Model

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the same
# training matrix and labels as the XGBoost model.
logit = LogisticRegression()
logit.fit(X.values, Y)

In [None]:
# Hard class predictions of the linear model for the test split.
# Note that `preds`, `preds_proba` and the df_test column are reused from the
# XGBoost section and are overwritten here.
preds = logit.predict(X_test.values)
# The `1:` slice keeps every column from index 1 onward, so the result stays two-dimensional.
preds_proba = logit.predict_proba(X_test.values)[:, 1:]
df_test["preds_proba"] = preds_proba

# Test-set metrics: F1 from the hard labels, AUC from the predicted probabilities.
print("Metrics:")
print("\tF1-Score ->", f1_score(Y_test, preds))
print("\tAUC ->", roc_auc_score(Y_test, preds_proba))

In [None]:
# Flatten the fitted coefficient array into a 1-D vector of weights.
logit_values = logit.coef_.ravel()

In [None]:
# Display the flattened logistic-regression coefficients.
logit_values

In [None]:
# Map each feature name to its logistic-regression coefficient.
feat_coefs_logit = dict(zip(feature_names, logit_values))

#### Comparison Between Importances

In [None]:
def _rank_by_magnitude(scores):
    """Return a new dict with the items of ``scores`` ordered by descending |value|."""
    ordered_items = sorted(
        scores.items(),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    return dict(ordered_items)


# Re-order both importance mappings from the most to the least influential feature.
feat_coefs_logit = _rank_by_magnitude(feat_coefs_logit)
feat_coefs_qif = _rank_by_magnitude(feat_coefs_qif)

In [None]:
# Jaccard similarity between the top-k feature *sets* of the two rankings:
# size of the intersection divided by size of the union.
#
# This is computed by hand instead of with sklearn's jaccard_similarity_score:
# that function treated the two name lists as per-sample labels and scored
# position-by-position agreement (so the same features in a different order
# counted as mismatches), and it has since been removed from scikit-learn.
logit_ranking = list(feat_coefs_logit)
qif_ranking = list(feat_coefs_qif)

jaccard_at_k = []
for k in range(2, len(logit_ranking)):
    top_logit = set(logit_ranking[:k])
    top_qif = set(qif_ranking[:k])
    # k >= 2, so the union is never empty and the division is safe.
    jaccard_at_k.append(len(top_logit & top_qif) / len(top_logit | top_qif))

jaccard_at_k = np.round(jaccard_at_k, 3)

In [None]:
# K values matching the entries of jaccard_at_k.
x = range(2, len(feat_coefs_logit))

figure, ax = plt.subplots(figsize=(12,7))

# Measured score per K.
ax.plot(x, jaccard_at_k, label="Real")
# Dashed reference line at K / number_of_features.
# NOTE(review): confirm this is the intended baseline; two identical rankings
# would score 1.0 at every K.
ax.plot(x, np.array(x)/len(feat_coefs_logit), linestyle="--", label="Ideal")

ax.set_title("Jaccard Score @ K", fontsize=16)
ax.set_xlabel("K", fontsize=14)
ax.set_ylabel("Jaccard Score", fontsize=14)
ax.legend(fontsize=12)
plt.show()

In [None]:
# For every K, count the features that appear in the top-K of both rankings.
intersection_at_k = [
    len(set(list(feat_coefs_logit)[:k]) & set(list(feat_coefs_qif)[:k]))
    for k in range(2, len(feat_coefs_logit))
]

figure, ax = plt.subplots(figsize=(12,7))

# Measured overlap versus the maximum possible overlap (K itself).
ax.plot(range(2, len(feat_coefs_logit)), intersection_at_k, label="Real")
ax.plot(
    range(2, len(feat_coefs_logit)),
    range(2, len(feat_coefs_logit)),
    linestyle="--",
    label="Ideal",
)

ax.set_title("Intersection @ K", fontsize=16)
ax.set_xlabel("K", fontsize=14)
ax.set_ylabel("Intersection Length", fontsize=14)
ax.legend(fontsize=12)
plt.show()

In [None]:
# Rank (Kendall tau) and linear (Pearson) agreement between the two importance vectors.
# The logistic coefficients are compared by magnitude: the QIF scores are normalized
# non-negative importances and the rankings above order features by abs(coef), so a
# strongly negative coefficient must count as important rather than as the lowest value.
logit_magnitudes = np.abs(logit_values)

k_coef, _ = kendalltau(qif_values, logit_magnitudes)
p_coef, _ = pearsonr(qif_values, logit_magnitudes)

In [None]:
# Report the correlation statistics between the QIF scores and the logistic coefficients.
print("Kendall:", k_coef)
print("Pearson:", p_coef)