## Import

In [63]:
# Data Librairies
import pandas as pd
import numpy as np

# Data Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
import xgboost as xgb

# Data Vizualization Librairies
import seaborn as sns

# Warnings
import warnings
warnings.filterwarnings("ignore")

## Fonctions utilitaires

In [43]:
def getAgeCat(age: int) -> str:
    cat: str = ""

    if(age < 35):
        cat = "34 & -"
    elif (age >=35 and age < 45):
        cat = "35-44"
    elif (age >= 45 and age < 55):
        cat = "45-54"
    elif (age >= 55 and age < 65):
        cat = "55-64"
    else:
        cat = "65 & +"

    return cat

### Load Dataset

In [44]:
df = pd.read_csv("./Data/data_participant.csv", index_col=None)
df.describe()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default-payment-next-month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,24000.0
mean,15000.5,167484.322667,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.252333
std,8660.398374,129747.661567,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.434361
min,1.0,10000.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,1.0
max,30000.0,1000000.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [45]:
# Create "Remaining Amount" Column
df["RMN_AMT2"] = df["BILL_AMT2"] - df["PAY_AMT1"]
df["RMN_AMT3"] = df["BILL_AMT3"] - df["PAY_AMT2"]
df["RMN_AMT4"] = df["BILL_AMT4"] - df["PAY_AMT3"]
df["RMN_AMT5"] = df["BILL_AMT5"] - df["PAY_AMT4"]
df["RMN_AMT6"] = df["BILL_AMT6"] - df["PAY_AMT5"]

# Create "Sufficient Payment"
df["SUFFICIENT_PAYMENT_AMT2"] = np.where(df["RMN_AMT2"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT3"] = np.where(df["RMN_AMT3"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT4"] = np.where(df["RMN_AMT4"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT5"] = np.where(df["RMN_AMT5"] <= 0, 1, 0)
df["SUFFICIENT_PAYMENT_AMT6"] = np.where(df["RMN_AMT6"] <= 0, 1, 0)

# Add weight to most recent months
df["SUFFICIENT_PAYMENT_AMT2"] = df["SUFFICIENT_PAYMENT_AMT2"]*1.4
df["SUFFICIENT_PAYMENT_AMT3"] = df["SUFFICIENT_PAYMENT_AMT3"]*1.3
df["SUFFICIENT_PAYMENT_AMT4"] = df["SUFFICIENT_PAYMENT_AMT4"]*1.2
df["SUFFICIENT_PAYMENT_AMT5"] = df["SUFFICIENT_PAYMENT_AMT5"]*1.1
df["SUFFICIENT_PAYMENT_AMT6"] = df["SUFFICIENT_PAYMENT_AMT6"]*1

df.drop(['RMN_AMT2', 'RMN_AMT3', 'RMN_AMT4', 'RMN_AMT5', 'RMN_AMT6'], axis=1)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default-payment-next-month,SUFFICIENT_PAYMENT_AMT2,SUFFICIENT_PAYMENT_AMT3,SUFFICIENT_PAYMENT_AMT4,SUFFICIENT_PAYMENT_AMT5,SUFFICIENT_PAYMENT_AMT6
0,15065,240000,male,graduate school,married,46,-2,-2,-2,-2,...,11705,9473,1405,164705,1.0,1.4,1.3,1.2,1.1,1
1,29264,80000,male,high school,single,23,-1,2,-1,-1,...,125,0,0,0,0.0,0.0,1.3,1.2,1.1,1
2,26266,30000,female,university,married,46,1,2,0,0,...,1200,500,737,0,0.0,0.0,0.0,0.0,0.0,0
3,2371,210000,male,university,single,28,0,0,0,0,...,6500,6500,6300,7900,0.0,0.0,0.0,0.0,0.0,0
4,20112,230000,female,graduate school,married,36,-2,-2,-2,-2,...,2427,282,282,1886,0.0,1.4,1.3,1.2,1.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,12269,110000,female,university,single,25,2,0,0,0,...,3000,5000,10000,3000,,0.0,0.0,0.0,0.0,0
29996,22101,50000,female,university,single,28,0,0,0,0,...,1446,1500,1500,1073,,0.0,0.0,0.0,0.0,0
29997,1380,80000,male,high school,single,26,1,2,2,2,...,2800,0,7000,8,,0.0,0.0,0.0,0.0,0
29998,8396,360000,female,graduate school,single,29,0,0,0,0,...,41,22,9865,14847,,0.0,0.0,0.0,0.0,1


In [46]:
df.describe(exclude=np.number) # Describe Categorical Values

Unnamed: 0,SEX,EDUCATION,MARRIAGE
count,30000,30000,30000
unique,2,5,4
top,female,university,single
freq,18112,14030,15964


In [47]:
cats = df.select_dtypes(exclude=np.number).columns.tolist()
for cat in cats:
    df[cat] = df[cat].astype('category')

In [48]:
# Finding Protected Groups
df['group'] = df.apply(lambda x: x['SEX'] + '-' + x['EDUCATION'] + '-' + x['MARRIAGE'], axis=1)
noDefault = df[df['default-payment-next-month'] == 0].groupby(['group'])['default-payment-next-month'].count().rename("noDefault")
default = df[df['default-payment-next-month'] == 1].groupby(['group'])['default-payment-next-month'].count().rename("default")
df_groups = pd.concat([noDefault, default], axis=1).fillna(0)
df_groups["total"] = df_groups.apply(lambda x : x["noDefault"] + x["default"], axis=1)
df_groups["propDefault"] = df_groups.apply(lambda x : x["default"]/x["total"], axis=1)
propDefault_mean = df_groups["propDefault"].mean()
propDefault_median = df_groups["propDefault"].median()

df_groups['protected'] = np.where(df_groups['propDefault'] > propDefault_median, 1, 0)
df_groups

Unnamed: 0_level_0,noDefault,default,total,propDefault,protected
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female-graduate school-married,1259,359.0,1618.0,0.221879,1
female-graduate school-others,14,1.0,15.0,0.066667,0
female-graduate school-single,2773,531.0,3304.0,0.160714,0
female-graduate school-unknown,3,0.0,3.0,0.0,0
female-high school-married,981,481.0,1462.0,0.329001,1
female-high school-others,37,17.0,54.0,0.314815,1
female-high school-single,600,199.0,799.0,0.249061,1
female-high school-unknown,23,5.0,28.0,0.178571,0
female-others-married,26,2.0,28.0,0.071429,0
female-others-others,1,0.0,1.0,0.0,0


In [49]:
# Tagging Protected Group
df = df.merge(df_groups['protected'], on='group')
df.set_index(['ID'], inplace=True)
df.drop(['group'], axis=1, inplace=True)

In [50]:
# Creating Target and Explicative Datasets
df_training = df[(~df['default-payment-next-month'].isna())]
df_prediction = df[(df['default-payment-next-month'].isna())]

X, y = df_training.drop(['default-payment-next-month'], axis=1), df_training['default-payment-next-month']

## Creating Train and Test Datasets

In [51]:
# Standard Sklearn Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
# XGBoost Matrix
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [60]:
# Defining Hyperparameters
params = {
    "objective": "binary:logistic",
    # "num_class": 2
    # "tree_method": "gpu_hist"
}

In [61]:
# Defining and Training Model
n = 50
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
)

In [62]:
# RMSE Validation
preds = model.predict(dtest_reg)
rmse = mean_squared_error(y_test, preds, squared=False)

preds_xgb = np.where(preds < 0.5, 0, 1)
print(accuracy_score(y_test, preds_xgb))
print(f1_score(y_test, preds_xgb))

print(f"RMSE of the base model: {rmse:.3f}")

0.781
0.42720139494333037
RMSE of the base model: 0.402


In [55]:
# F1_Score Validation

In [56]:
# Faireness Validation