## Import

In [21]:
# Data Librairies
import pandas as pd
import numpy as np

# Data Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
import xgboost as xgb
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
# Data Vizualization Librairies
import seaborn as sns

# Warnings
import warnings
warnings.filterwarnings("ignore")

## Fonctions utilitaires

In [22]:
def getAgeCat(age: int) -> str:
    cat: str = ""

    if(age < 35):
        cat = "34 & -"
    elif (age >=35 and age < 45):
        cat = "35-44"
    elif (age >= 45 and age < 55):
        cat = "45-54"
    elif (age >= 55 and age < 65):
        cat = "55-64"
    else:
        cat = "65 & +"

    return cat

### Load Dataset

In [23]:
df = pd.read_csv("./Data/data_participant.csv", index_col=None)
df.describe()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default-payment-next-month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,24000.0
mean,15000.5,167484.322667,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.252333
std,8660.398374,129747.661567,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.434361
min,1.0,10000.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,1.0
max,30000.0,1000000.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [24]:
# Create "Remaining Amount" Column
df["RMN_AMT2"] = df["BILL_AMT2"] - df["PAY_AMT1"]
df["RMN_AMT3"] = df["BILL_AMT3"] - df["PAY_AMT2"]
df["RMN_AMT4"] = df["BILL_AMT4"] - df["PAY_AMT3"]
df["RMN_AMT5"] = df["BILL_AMT5"] - df["PAY_AMT4"]
df["RMN_AMT6"] = df["BILL_AMT6"] - df["PAY_AMT5"]

# Create "Sufficient Payment"
df["SUFFICIENT_PAYMENT_AMT2"] = np.where(df["RMN_AMT2"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT3"] = np.where(df["RMN_AMT3"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT4"] = np.where(df["RMN_AMT4"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT5"] = np.where(df["RMN_AMT5"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT6"] = np.where(df["RMN_AMT6"] <= 0, 1, 0)

# Add weight to most recent months
df["SUFFICIENT_PAYMENT_AMT2"] = df["SUFFICIENT_PAYMENT_AMT2"]*1.4
df["SUFFICIENT_PAYMENT_AMT3"] = df["SUFFICIENT_PAYMENT_AMT3"]*1.3
df["SUFFICIENT_PAYMENT_AMT4"] = df["SUFFICIENT_PAYMENT_AMT4"]*1.2
df["SUFFICIENT_PAYMENT_AMT5"] = df["SUFFICIENT_PAYMENT_AMT5"]*1.1
df["SUFFICIENT_PAYMENT_AMT6"] = df["SUFFICIENT_PAYMENT_AMT6"]*1

df.drop(['RMN_AMT2', 'RMN_AMT3', 'RMN_AMT4', 'RMN_AMT5', 'RMN_AMT6', 'AGE'], axis=1, inplace=True)

In [25]:
df.describe(exclude=np.number) # Describe Categorical Values

Unnamed: 0,SEX,EDUCATION,MARRIAGE
count,30000,30000,30000
unique,2,5,4
top,female,university,single
freq,18112,14030,15964


In [26]:
cats = df.select_dtypes(exclude=np.number).columns.tolist()
for cat in cats:
    df[cat] = df[cat].astype('category')

In [27]:
# Creating Target and Explicative Datasets
df_training = df[(~df['default-payment-next-month'].isna())]
df_prediction = df[(df['default-payment-next-month'].isna())]

X, y = df_training.drop(['default-payment-next-month'], axis=1), df_training['default-payment-next-month']

## Creating Train and Test Datasets

In [28]:
X.rename(columns={'default-payment-next-month': 'score'})
# Standard Sklearn Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
sensitive_train, sensitive_test = X_train[['SEX', 'MARRIAGE', 'EDUCATION']], X_test[['SEX', 'MARRIAGE', 'EDUCATION']]

# XGBoost Matrix
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [29]:
# Defining Hyperparameters
params = {
    "objective": "binary:logistic",
    # 'max_depth': 4,
    'learning_rate': 0.1
    # "num_class": 2
    # "tree_method": "gpu_hist"
}

In [30]:
# Defining and Training Model
n = 100
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
)

In [31]:
# RMSE Validation
preds = model.predict(dtest_reg)
rmse = mean_squared_error(y_test, preds, squared=False)

preds_xgb = np.where(preds < 0.5, 0, 1)
print(accuracy_score(y_test, preds_xgb))
print(f1_score(y_test, preds_xgb))

print(f"RMSE of the base model: {rmse:.3f}")

0.796
0.4506283662477558
RMSE of the base model: 0.393


In [32]:
preds_xgb = pd.Series(preds_xgb)
y_test_di1 = pd.DataFrame(y_test)
y_test_di1["pred"] = preds_xgb.values
df_out_group = pd.concat([X_test[['SEX', 'MARRIAGE', 'EDUCATION']], y_test_di1], axis=1)
df_out_group.rename(columns={'pred': 'score', 'default-payment-next-month': 'label_value'}, inplace=True)

df_out_bias = pd.concat([X_test, y_test_di1], axis=1)
df_out_bias.rename(columns={'pred': 'score', 'default-payment-next-month': 'label_value'}, inplace=True) 

In [33]:
g = Group()

df_out_group.dtypes
df_out_group['SEX'] = df_out_group['SEX'].astype('str')
df_out_group['MARRIAGE'] = df_out_group['MARRIAGE'].astype('str')
df_out_group['EDUCATION'] = df_out_group['EDUCATION'].astype('str')
xtab, _ = g.get_crosstabs(df_out_group)

b = Bias()

# Calculate DI
ref_groups_dict = {
    'SEX': 'female',
    'SEX': 'male',
    'MARRIAGE': 'married',
    'MARRIAGE': 'others',
    'MARRIAGE': 'single',
    'MARRIAGE': 'unknown',
    'EDUCATION': 'graduate school',
    'EDUCATION': 'high school',
    'EDUCATION': 'others',
    'EDUCATION': 'university',
    'EDUCATION': 'unknown',
}

bdf1 = b.get_disparity_predefined_groups(xtab, original_df=df_out_group, ref_groups_dict=ref_groups_dict)

# Calculate Disparate Impact (DI) manually and create a custom column
bdf1['disparity_index'] = 1 - abs(bdf1['fpr'] - 1)
bdf1['weight_adjustment'] = 1 - bdf1['disparity_index']

# Define the sensitive attribute categories
# Calculate the aggregate Normalized DI (simple average)
aggregate_normalized_di = bdf1['disparity_index'].sum() / len(bdf1['disparity_index'])

# Print the aggregate Normalized DI
print(f'Aggregate Normalized DI: {aggregate_normalized_di:.2f}')

get_disparity_predefined_group()
Aggregate Normalized DI: 0.04


In [34]:
# Distributing Weight in Dataset
X2 = X.copy()

X2['sex_weight'] = ''
w = bdf1.query('attribute_name == "SEX" and attribute_value == "female"')['weight_adjustment'].values[0]
X2['sex_weight'] = np.where(X2['SEX'] == "female", w, X2['sex_weight'])
w = bdf1.query('attribute_name == "SEX" and attribute_value == "male"')['weight_adjustment'].values[0]
X2['sex_weight'] = np.where(X2['SEX'] == "male", w, X2['sex_weight'])

X2['education_weight'] = ''
w = bdf1.query('attribute_name == "EDUCATION" and attribute_value == "graduate school"')['weight_adjustment'].values[0]
X2['education_weight'] = np.where(X2['EDUCATION'] == "graduate school", w, X2['education_weight'])
w = bdf1.query('attribute_name == "EDUCATION" and attribute_value == "high school"')['weight_adjustment'].values[0]
X2['education_weight'] = np.where(X2['EDUCATION'] == "high school", w, X2['education_weight'])
X2['education_weight'] = np.where(X2['EDUCATION'] == "graduate school", w, X2['education_weight'])
w = bdf1.query('attribute_name == "EDUCATION" and attribute_value == "others"')['weight_adjustment'].values[0]
X2['education_weight'] = np.where(X2['EDUCATION'] == "others", w, X2['education_weight'])
w = bdf1.query('attribute_name == "EDUCATION" and attribute_value == "university"')['weight_adjustment'].values[0]
X2['education_weight'] = np.where(X2['EDUCATION'] == "university", w, X2['education_weight'])
w = bdf1.query('attribute_name == "EDUCATION" and attribute_value == "unknown"')['weight_adjustment'].values[0]
X2['education_weight'] = np.where(X2['EDUCATION'] == "unknown", w, X2['education_weight'])

X2['marriage_weight'] = ''
w = bdf1.query('attribute_name == "MARRIAGE" and attribute_value == "married"')['weight_adjustment'].values[0]
X2['marriage_weight'] = np.where(X2['MARRIAGE'] == "married", w, X2['marriage_weight'])
w = bdf1.query('attribute_name == "MARRIAGE" and attribute_value == "single"')['weight_adjustment'].values[0]
X2['marriage_weight'] = np.where(X2['MARRIAGE'] == "single", w, X2['marriage_weight'])
w = bdf1.query('attribute_name == "MARRIAGE" and attribute_value == "others"')['weight_adjustment'].values[0]
X2['marriage_weight'] = np.where(X2['MARRIAGE'] == "others", w, X2['marriage_weight'])
w = bdf1.query('attribute_name == "MARRIAGE" and attribute_value == "unknown"')['weight_adjustment'].values[0]
X2['marriage_weight'] = np.where(X2['MARRIAGE'] == "unknown", w, X2['marriage_weight'])

X2['composite_weight'] = X2['sex_weight'] * X2['education_weight'] * X2['marriage_weight']
X2.drop(['sex_weight', 'education_weight', 'marriage_weight'], axis=1, inplace=True)
X2['composite_weight'] = X2['composite_weight'].astype('float64')

In [35]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, random_state=17)
sensitive_train2, sensitive_test2 = X_train2[['SEX', 'MARRIAGE', 'EDUCATION']], X_test2[['SEX', 'MARRIAGE', 'EDUCATION']]
weight_train = X_train2['composite_weight']
weight_test = X_test2['composite_weight']
X_train2.drop('composite_weight', axis=1, inplace=True)
X_test2.drop('composite_weight', axis=1, inplace=True)


# XGBoost Matrix
dtrain_reg2 = xgb.DMatrix(X_train2, y_train2, enable_categorical=True, weight=weight_train)
dtest_reg2 = xgb.DMatrix(X_test2, y_test2, enable_categorical=True, weight=weight_test)

In [41]:
# Defining Hyperparameters
params = {
    'Objective': 'reg:tweedie',
    # 'max_depth': 4,
    'learning_rate': 0.1,
    "num_class": 2
    # "tree_method": "gpu_hist"
}

In [42]:
# Defining and Training Model
n = 100
model2 = xgb.train(
    params=params,
    dtrain=dtrain_reg2,
    num_boost_round=n
)

In [44]:
# RMSE Validation
preds = model2.predict(dtest_reg2)
rmse = mean_squared_error(y_test2, preds, squared=False)

preds_xgb = np.where(preds < 0.5, 0, 1)
print(accuracy_score(y_test2, preds_xgb))
print(f1_score(y_test2, preds_xgb))

print(f"RMSE of the base model: {rmse:.3f}")

0.7928333333333333
0.44484144707458695
RMSE of the base model: 0.455


In [39]:
preds_xgb = pd.Series(preds_xgb)
y_test_di1 = pd.DataFrame(y_test2)
y_test_di1["pred"] = preds_xgb.values
df_out_group = pd.concat([X_test2[['SEX', 'MARRIAGE', 'EDUCATION']], y_test_di1], axis=1)
df_out_group.rename(columns={'pred': 'score', 'default-payment-next-month': 'label_value'}, inplace=True)

df_out_bias = pd.concat([X_test2, y_test_di1], axis=1)
df_out_bias.rename(columns={'pred': 'score', 'default-payment-next-month': 'label_value'}, inplace=True) 

g = Group()

df_out_group.dtypes
df_out_group['SEX'] = df_out_group['SEX'].astype('str')
df_out_group['MARRIAGE'] = df_out_group['MARRIAGE'].astype('str')
df_out_group['EDUCATION'] = df_out_group['EDUCATION'].astype('str')
xtab, _ = g.get_crosstabs(df_out_group)

b = Bias()

# Calculate DI
ref_groups_dict = {
    'SEX': 'female',
    'SEX': 'male',
    'MARRIAGE': 'married',
    'MARRIAGE': 'others',
    'MARRIAGE': 'single',
    'MARRIAGE': 'unknown',
    'EDUCATION': 'graduate school',
    'EDUCATION': 'high school',
    'EDUCATION': 'others',
    'EDUCATION': 'university',
    'EDUCATION': 'unknown',
}

bdf2 = b.get_disparity_predefined_groups(xtab, original_df=df_out_group, ref_groups_dict=ref_groups_dict)

# Calculate Disparate Impact (DI) manually and create a custom column
bdf2['disparity_index'] = 1 - abs(bdf1['fpr'] - 1)
bdf2['weight_adjustment'] = 1 - bdf1['disparity_index']

# Define the sensitive attribute categories
# Calculate the aggregate Normalized DI (simple average)
aggregate_normalized_di2 = bdf2['disparity_index'].sum() / len(bdf1['disparity_index'])

# Print the aggregate Normalized DI

print(f'Aggregate Normalized DI: {aggregate_normalized_di:.2f}')
print(f'Aggregate Normalized DI: {aggregate_normalized_di2:.2f}')
print(aggregate_normalized_di / aggregate_normalized_di2)

get_disparity_predefined_group()
Aggregate Normalized DI: 0.04
Aggregate Normalized DI: 0.04
1.0


In [40]:
X_final = df_prediction.drop('default-payment-next-month', axis=1)
dfinal_reg = xgb.DMatrix(X_final, enable_categorical=True)
final = model2.predict(dfinal_reg)

final = np.where(final < 0.5, 0, 1)
final = pd.DataFrame(final)



final.to_csv('./Data/final.csv')