In [None]:
import pandas as pd
import numpy as np
import shap
import itertools
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [None]:
# Seed value for this notebook's random number generation.
RANDOM_SEED = 1

# Hyper-parameter grid explored during model selection:
# every max depth is paired with every estimator count.
MAX_DEPTHS = [2, 4, 8, 16]
N_ESTIMATORS = [16, 32, 64, 128, 256]

# Seeds NumPy's global (legacy) generator only.
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Load the combined dataset; its `set` column is used below to separate
# training rows from test rows.
df = pd.read_csv(filepath_or_buffer="../data/adult_whole_data.csv")

In [None]:
# Partition the rows by the `set` marker column, then drop the marker itself
# from each split.
df_train = df.loc[df["set"] == "train"].drop(columns="set")
df_test = df.loc[df["set"] == "test"].drop(columns="set")

# Drop the reference to the combined frame now that both splits exist.
del df

### Select the Best Parameters

In [None]:
# Hyper-parameter search using 5-fold cross-validation on the training split
# only. The final model is later refit on the entire training split and
# evaluated on the test split.
Y = df_train.target.values
X = df_train.drop("target", axis=1)

# The splitter does not depend on the hyper-parameters, so build it once
# instead of once per parameter combination.
# NOTE(review): folds are contiguous blocks (no shuffling). If the CSV rows are
# ordered in any meaningful way, consider
# KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).
kf = KFold(n_splits=5)

# One record per (max_depth, n_estimators, fold).
data = []
for max_depth, n_estimators in itertools.product(MAX_DEPTHS, N_ESTIMATORS):
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
        X_fit, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        Y_fit, Y_valid = Y[train_idx], Y[valid_idx]

        # Pass the notebook seed explicitly so the estimator's randomness is
        # pinned by the notebook rather than by the library default.
        model = XGBClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            random_state=RANDOM_SEED,
        )
        model.fit(X_fit, Y_fit)

        preds = model.predict(X_valid)
        # Column 1 as a 1-D vector, the documented input shape for binary
        # roc_auc_score.
        preds_proba = model.predict_proba(X_valid)[:, 1]

        f1 = f1_score(Y_valid, preds)
        auc = roc_auc_score(Y_valid, preds_proba)

        data.append([max_depth, n_estimators, fold, f1, auc])

df_results = pd.DataFrame(
    data,
    columns=["max_depth", "n_estimators", "fold", "f1_score", "auc"],
)

In [None]:
# Average the per-fold scores so there is one row per
# (max_depth, n_estimators) combination.
# NOTE: this rebinds `df_results` to the aggregated frame, which no longer has
# the `f1_score` / `auc` columns, so this cell cannot be re-run on its own
# without first re-running the cross-validation cell.
per_combo = df_results.groupby(["max_depth", "n_estimators"])
df_results = per_combo.agg(
    mean_f1_score=pd.NamedAgg(column="f1_score", aggfunc="mean"),
    mean_auc=pd.NamedAgg(column="auc", aggfunc="mean"),
).reset_index()

In [None]:
# Display the cross-validation summary table: a bare last expression is
# rendered by the notebook as the cell output.
df_results

In [None]:
# Heatmap of the mean F1 score held in `df_results`: max depth on the x axis,
# number of estimators on the y axis.
f1_heatmap = go.Heatmap(
    x=df_results["max_depth"],
    y=df_results["n_estimators"],
    z=df_results["mean_f1_score"],
    colorbar=dict(title="Mean F1-Score"),
)

fig = go.Figure(data=f1_heatmap)
fig.update_layout(
    title="Mean F1-Score for Each Parameter Combination",
    xaxis_title="Maximal Depth",
    yaxis_title="Number of Estimators",
)

# Treat the grid values as discrete categories rather than a numeric scale.
fig.update_xaxes(type="category")
fig.update_yaxes(type="category")

fig.show()

In [None]:
# Heatmap of the mean ROC AUC held in `df_results`: max depth on the x axis,
# number of estimators on the y axis.
fig = go.Figure(
    data=go.Heatmap(
        x=df_results.max_depth,
        y=df_results.n_estimators,
        z=df_results.mean_auc,
        colorbar={
            # Label the colour scale with the metric actually plotted in `z`
            # (this previously read "Mean F1-Score").
            "title": "Mean AUC"
        }
    )
)

fig.update_layout(
    title="Mean AUC for Each Parameter Combination",
    xaxis_title="Maximal Depth",
    yaxis_title="Number of Estimators",
)

# Treat the grid values as discrete categories rather than a numeric scale.
fig.update_xaxes(type='category')
fig.update_yaxes(type='category')

fig.show()

### Train and Test the Best Model

In [None]:
# Separate the test split into its label array and its feature frame.
Y_test = df_test["target"].values
X_test = df_test.drop(columns=["target"])

In [None]:
# Training features for the final fit: every training column except the label.
X = df_train.drop(columns=["target"])

In [None]:
# Final model, fitted on the whole training split.
# NOTE(review): max_depth=4 / n_estimators=128 were presumably picked from the
# cross-validation heatmaps above; confirm when the grid changes.
# `Y` is the training label array created in the cross-validation cell.
model = XGBClassifier(n_estimators=128, max_depth=4).fit(X, Y)

# `1:` keeps column 1 onward as a 2-D slice (one column for a two-class model).
preds_proba = model.predict_proba(X_test)[:, 1:]
preds = model.predict(X_test)

In [None]:
# (F1, ROC AUC) of the final model on the test split, shown as the cell output.
(
    f1_score(y_true=Y_test, y_pred=preds),
    roc_auc_score(y_true=Y_test, y_score=preds_proba),
)

In [None]:
# F1 score of the final model on each `sex` group of the test split.
# NOTE(review): the printed labels assume sex == 1 encodes male and sex == 0
# encodes female; confirm against the preprocessing that produced the CSV.
df_male = df_test.loc[df_test["sex"] == 1]
df_female = df_test.loc[df_test["sex"] == 0]

male_f1 = f1_score(
    y_true=df_male["target"],
    y_pred=model.predict(df_male.drop(columns="target")),
)
female_f1 = f1_score(
    y_true=df_female["target"],
    y_pred=model.predict(df_female.drop(columns="target")),
)

print(f"Performance on Male Group: {male_f1}")
print(f"Performance on Female Group: {female_f1}")

### Get the SHAP scores

In [None]:
# Build a SHAP tree explainer around the current `model` and compute SHAP
# values for the test features.
# NOTE(review): the type/shape returned by `shap_values` can differ between
# shap versions; a later cell wraps it in a DataFrame using X_test's columns,
# which presumes a 2-D (rows x features) array. Confirm for the pinned version.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

In [None]:
# Summary plot of the SHAP values alongside the test features they were
# computed from.
shap.summary_plot(shap_values, X_test)

In [None]:
# Persist the SHAP values with the test feature names as column headers.
# NOTE(review): the frame gets a fresh positional index (written as the first
# CSV column), not X_test's index; confirm that downstream readers expect this.
shap_frame = pd.DataFrame(data=shap_values, columns=X_test.columns)
shap_frame.to_csv("../data/shap_test_adult.csv")