In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
import shap
from functools import partial
from itertools import combinations
import math
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [3]:
import decision_infovalue
import importlib
importlib.reload(decision_infovalue)

<module 'decision_infovalue' from '/Users/guoziyang/Documents/Documents - GuoZiYang的MacBook Pro/info-value-toolkit/decision_infovalue/__init__.py'>

# Data preparation

In [4]:
data, metadata = decision_infovalue.get_dataset("recidivism")
x_columns = ["sex", "age", "race", "juv_fel_count", "juv_misd_count", "priors_count", 
                        "c_charge_degree", "juv_other_count", "decile_score"]
columns = x_columns + ["two_year_recid"]
data_selected = data[columns]

# Handle missing values by dropping rows with missing data in the selected columns
# data_selected = data_selected.dropna()

# Encode categorical features using LabelEncoder
categorical_columns = ["sex", "race", "c_charge_degree"]
encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    data_selected.loc[:, col] = encoders[col].fit_transform(data_selected.loc[:, col])

X = data_selected[x_columns].astype("float64")
y = data_selected["two_year_recid"].astype(int)

# Human prediction

In [5]:
human_pred = data["predicted_decision"] > 50
h_accuracy = accuracy_score(y, human_pred)
h_report = classification_report(y, human_pred)


print(f"Human Accuracy: {h_accuracy:.4f}")
print("Human Classification Report:")
print(h_report)

Human Accuracy: 0.5662
Human Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.51      0.56      4914
           1       0.53      0.63      0.58      4355

    accuracy                           0.57      9269
   macro avg       0.57      0.57      0.57      9269
weighted avg       0.57      0.57      0.57      9269



# Train an XGB model to classify recidivism

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"# of training data: {len(X_train)}, testing data: {len(X_test)}")

# of training data: 6488, testing data: 2781


In [7]:

def train_xgboost_model(X_train, X_test, y_train, y_test):
    
    model = XGBClassifier(
        eval_metric="logloss",
        random_state=42
    )
    model.fit(X_train, y_train)

    # Make predictions and evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC: {auc:.4f}")
    return model


In [8]:
model = train_xgboost_model(X_train, X_test, y_train, y_test)

Accuracy: 0.9669
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1453
           1       0.97      0.96      0.97      1328

    accuracy                           0.97      2781
   macro avg       0.97      0.97      0.97      2781
weighted avg       0.97      0.97      0.97      2781

AUC: 0.9945


# SHAP explanation

In [9]:
# # Decide the order you want for the plot
feature_names_ordered_for_plot = ["priors_count", "age", "decile_score", "sex", "race", 
                                    "c_charge_degree", "juv_other_count", "juv_misd_count", "juv_fel_count"]

In [10]:
explainer = shap.PermutationExplainer(model.predict, X_train)
shap_values = explainer(X_test)

# Order the features_df and the shap-values
features_df_ordered_for_plot = X_test[feature_names_ordered_for_plot]
shap_values_ordered_for_plot = pd.DataFrame(shap_values.values, columns=X_test.columns)[feature_names_ordered_for_plot].to_numpy()

# # Plot
shap.summary_plot(
    shap_values_ordered_for_plot,
    features_df_ordered_for_plot,
    sort=False,
    show=False
)

# # shap.summary_plot(shap_values, X_test) #, show = False)
plt.savefig("notebooks/recidivism-example/explanations/SHAP_summary.pdf", bbox_inches="tight")
plt.clf()
for i in range(20):
    shap.waterfall_plot(shap_values[i], max_display=10, show = False)
    plt.savefig(f"notebooks/recidivism-example/explanations/SHAP_instance{i}.pdf", bbox_inches="tight")
    plt.clf()

PermutationExplainer explainer: 2782it [01:45, 24.05it/s]                          


<Figure size 800x600 with 0 Axes>

# Information-based explanation

In [118]:
data.loc[:, "ai_pred"] = model.predict_proba(X)[:, 1]
data.loc[:, "human_pred"] = data["predicted_decision"]/100
data

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,out_custody,priors_count.1,start,end,event,two_year_recid,individual_id,predicted_decision,ai_pred,human_pred
0,22,darrious davis,darrious,davis,2013-12-22,Male,1990-06-22,25,25 - 45,African-American,...,2015-05-31,3,0,463,0,0,22,42,0.136077,0.42
1,22,darrious davis,darrious,davis,2013-12-22,Male,1990-06-22,25,25 - 45,African-American,...,2015-05-31,3,0,463,0,0,22,48,0.136077,0.48
2,22,darrious davis,darrious,davis,2013-12-22,Male,1990-06-22,25,25 - 45,African-American,...,2015-05-31,3,0,463,0,0,22,98,0.136077,0.98
3,22,darrious davis,darrious,davis,2013-12-22,Male,1990-06-22,25,25 - 45,African-American,...,2015-05-31,3,0,463,0,0,22,35,0.136077,0.35
4,22,darrious davis,darrious,davis,2013-12-22,Male,1990-06-22,25,25 - 45,African-American,...,2015-05-31,3,0,463,0,0,22,95,0.136077,0.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9264,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,2014-02-02,0,1,790,0,0,10997,78,0.329590,0.78
9265,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,2014-02-02,0,1,790,0,0,10997,42,0.329590,0.42
9266,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,2014-02-02,0,1,790,0,0,10997,19,0.329590,0.19
9267,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,2014-02-02,0,1,790,0,0,10997,19,0.329590,0.19


In [124]:


info_val = decision_infovalue.DecisionInfoModel(data, "two_year_recid", 
                                     signals=["ai_pred", "human_pred"], 
                                     scoring_rule="brier",
                                     fit_test_ratio=0.8,
                                     overfit_tolerance=0.1)
def iliv_model(X, base_signal):
    y_prob = model.predict(X)
    return np.array([info_val.instanse_complement_info_value("ai_pred", base_signal, "ai_pred", p, base_signals=["human_pred"]) for p in y_prob])

In [129]:
from tqdm import tqdm

explainers = [shap.PermutationExplainer(partial(iliv_model, base_signal = model.predict_proba(X_test[i:i+1])[0, 1]), X_train) for i in range(len(X_test))]
shap_values = [explainers[i](X_test[i:i+1]) for i in tqdm(range(len(X_test)))]

features_df_ordered_for_plot = X_test[feature_names_ordered_for_plot]
shap_values_ordered_for_plot = pd.DataFrame(np.concatenate([shap_value.values for shap_value in shap_values], axis=0), columns=X_test.columns)[feature_names_ordered_for_plot].to_numpy()

# Plot
shap.summary_plot(
    shap_values_ordered_for_plot,
    features_df_ordered_for_plot,
    sort=False,
    show=False
)

plt.savefig("notebooks/recidivism-example/information_explanations/InfoModel_summary.pdf", bbox_inches="tight")
plt.clf()
for i in range(20):
    shap.waterfall_plot(shap_values[i][0], max_display=10, show = False)
    plt.savefig(f"notebooks/recidivism-example/information_explanations/InfoModel_instance{i}.pdf", bbox_inches="tight")
    plt.clf()

  2%|▏         | 55/2781 [16:17<13:27:35, 17.78s/it]


KeyboardInterrupt: 