In [1]:
import pandas as pd
import numpy as np
import shap
import itertools
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import yaml
import sys

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

sys.path.insert(1, '/home/guilherme-resende/Desktop/mono2/utils')
import qif
from categorize import Categorize

In [2]:
# Seed applied to NumPy's global random generator below.
RANDOM_SEED = 1

# Hyper-parameter grid explored during model selection.
MAX_DEPTHS = [2, 4, 8, 16]
N_ESTIMATORS = [16, 32, 64, 128, 256]

np.random.seed(RANDOM_SEED)

#### Load Data

In [3]:
# Key into datasets.yaml selecting which dataset this notebook processes;
# it is also used as the prefix of the output CSV file name.
ds_name = "adult"

In [4]:
# Read the dataset registry. `yaml.safe_load` replaces the bare `yaml.load`,
# which is deprecated without an explicit Loader (and rejected by PyYAML 6),
# and the context manager guarantees the file handle is closed.
with open("datasets.yaml") as config_file:
    datasets = yaml.safe_load(config_file)

df = pd.read_csv(datasets[ds_name]["path"])

# Column groups for the selected dataset are taken from the registry entry.
pre_process = Categorize(
    df,
    datasets[ds_name]["binary_cols"],
    datasets[ds_name]["hierarchical_continuous_cols"],
    datasets[ds_name]["non_hierarchical_cols"]
)

df = pre_process.transform_data()

# The "set" column marks the split each row belongs to; it is dropped once
# the rows have been routed to their frame.
df_train = df.loc[df.set == "train"].drop("set", axis=1)
df_test = df.loc[df.set == "test"].drop("set", axis=1)

# The combined frame is no longer needed after the split.
del df


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [5]:
# Model selection uses only the training split, which is further divided into
# training and validation folds. The final model is then fit on the whole
# training split and evaluated on the test split.

Y = df_train["target"].values
X = df_train.drop(columns="target")

#### Select the best parameters combination

In [None]:
# 5-fold cross-validation over every (max_depth, n_estimators) pair.
# The splitter is created without shuffling, so a single instance yields the
# same folds for each parameter combination.
data = []
kf = KFold(n_splits=5)

for max_depth in MAX_DEPTHS:
    for n_estimators in N_ESTIMATORS:
        for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
            X_fit, y_fit = X.iloc[train_idx].values, Y[train_idx]
            X_val, y_val = X.iloc[valid_idx].values, Y[valid_idx]

            model = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators)
            model.fit(X_fit, y_fit)

            # Hard labels feed the F1 score; the probability slice feeds the AUC.
            preds = model.predict(X_val)
            preds_proba = model.predict_proba(X_val)[:, 1:]

            f1 = f1_score(y_val, preds)
            auc = roc_auc_score(y_val, preds_proba)

            # One record per (parameter combination, fold).
            data.append([max_depth, n_estimators, fold, f1, auc])

result_columns = ["max_depth", "n_estimators", "fold", "f1_score", "auc"]
df_results = pd.DataFrame(data, columns=result_columns)

In [None]:
# Collapse the per-fold rows into one row per parameter combination, holding
# the mean F1-score and mean AUC across folds.
# NOTE: this rebinds `df_results`, so the cell cannot be re-run on its own
# output (the "f1_score" and "auc" columns no longer exist afterwards).
per_combination = df_results.groupby(["max_depth", "n_estimators"])
fold_means = per_combination.agg(
    mean_f1_score=("f1_score", "mean"),
    mean_auc=("auc", "mean"),
)
df_results = fold_means.reset_index()

In [None]:
# Heatmap of the cross-validated mean F1-score over the parameter grid.
f1_heatmap = go.Heatmap(
    x=df_results["max_depth"],
    y=df_results["n_estimators"],
    z=df_results["mean_f1_score"],
    colorbar=dict(title="Mean F1-Score"),
)
fig = go.Figure(data=f1_heatmap)

fig.update_layout(
    title="Mean F1-Score for Each Parameter Combination",
    xaxis_title="Maximal Depth",
    yaxis_title="Number of Estimators",
)

# Treat the parameter values as categorical labels rather than numeric positions.
fig.update_xaxes(type="category")
fig.update_yaxes(type="category")

fig.show()

In [None]:
# Heatmap of the cross-validated mean AUC over the parameter grid.
fig = go.Figure(
    data=go.Heatmap(
        x=df_results.max_depth,
        y=df_results.n_estimators,
        z=df_results.mean_auc,
        colorbar={
            # The plotted quantity is the mean AUC; the label previously read
            # "Mean F1-Score", copied from the F1 heatmap cell.
            "title": "Mean AUC"
        }
    )
)

fig.update_layout(
    title="Mean AUC for Each Parameter Combination",
    xaxis_title="Maximal Depth",
    yaxis_title="Number of Estimators",
)

# Treat the parameter values as categorical labels rather than numeric positions.
fig.update_xaxes(type='category')
fig.update_yaxes(type='category')

fig.show()

#### Train and Test the Best Model

In [None]:
# Split the test frame into labels and features.
# NOTE: a later cell adds a "preds_proba" column to df_test; re-running this
# cell after that point would pull that column into X_test.
Y_test = df_test["target"].values
X_test = df_test.drop(columns="target")

In [None]:
# Final model: fixed hyper-parameters, fit on the entire training split.
# NOTE(review): presumably these values were picked from the heatmaps above — confirm.
best_params = {"max_depth": 4, "n_estimators": 128}
model = XGBClassifier(**best_params)
model.fit(X.values, Y)

In [None]:
# Score the held-out test split with the final model.
X_test_values = X_test.values
preds = model.predict(X_test_values)
# `[:, 1:]` keeps every probability column from index 1 onward as a 2-D slice
# (presumably the single positive-class column of a binary target — confirm).
preds_proba = model.predict_proba(X_test_values)[:, 1:]

# Store the probabilities on the test frame, where the QIF step reads them.
df_test["preds_proba"] = preds_proba

In [None]:
# Report test-set performance of the final model.
print("Metrics:")

test_f1 = f1_score(Y_test, preds)
print("\tF1-Score ->", test_f1)

test_auc = roc_auc_score(Y_test, preds_proba)
print("\tAUC ->", test_auc)

#### Get the SHAP scores

In [None]:
# Explain the final model on the test features and reduce the SHAP output
# along axis 0 (assumed to index the test rows — TODO confirm), leaving one
# score per feature.
# NOTE(review): the mean is taken over signed values, so positive and negative
# contributions cancel; confirm this is intended rather than a mean of
# absolute values.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test.values).mean(axis=0)

#### Get the QIF scores

In [None]:
# Instantiate qif.BayesLeakage on the test frame, which at this point holds
# the feature columns, "target", and the "preds_proba" column added after
# prediction.
# NOTE(review): qif is a project-local module; how it treats the "target"
# column is not visible from this notebook.
bayes_leakage = qif.BayesLeakage(df_test)

In [None]:
# Feature columns only (X_test excludes "target"). This ordering is shared by
# the SHAP vector, the QIF loop, and the columns of the saved results frame.
feature_names = X_test.columns

In [None]:
# For each feature, compute the flow from that feature to the model's
# predicted probability and keep the first element of the returned result.
qif_values = np.array([
    bayes_leakage.compute_flows(x=feature, y='preds_proba')[0]
    for feature in feature_names
])

# Normalize so the scores sum to 1.
qif_values = qif_values / qif_values.sum()

#### Save Coefficients

In [None]:
# One row per attribution method, one column per feature, plus a trailing
# "method" label column; written under the dataset-specific file name.
output_path = f"../data/results/{ds_name}_coeficients.csv"

df = pd.DataFrame([shap_values, qif_values], columns=feature_names).assign(
    method=["SHAP", "QIF"]
)

df.to_csv(output_path)