## Import

In [13]:
# Data Libraries
import pandas as pd
import numpy as np

# Data Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score
import xgboost as xgb


# Data Visualization Libraries
import seaborn as sns

# Warnings
# Globally silence every warning category for the rest of the notebook.
# NOTE(review): this also hides DeprecationWarning/FutureWarning messages
# emitted by the libraries used below (e.g. removed keyword arguments);
# consider narrowing the filter to specific categories or modules.
import warnings
warnings.filterwarnings("ignore")

### Load Dataset

In [14]:
# Load the participant CSV; index_col=None means no CSV column is promoted
# to the index, so every column (including ID) stays a regular column.
df = pd.read_csv(
    filepath_or_buffer="./Data/data_participant.csv",
    index_col=None,
)
# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default-payment-next-month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,24000.0
mean,15000.5,167484.322667,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.252333
std,8660.398374,129747.661567,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.434361
min,1.0,10000.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,1.0
max,30000.0,1000000.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [15]:
# Summarise only the non-numeric columns (count / unique / top / freq).
df.describe(exclude=[np.number])

Unnamed: 0,SEX,EDUCATION,MARRIAGE
count,30000,30000,30000
unique,2,5,4
top,female,university,single
freq,18112,14030,15964


In [16]:
# Every non-numeric column is cast to the pandas 'category' dtype; the
# DMatrix construction further down is called with enable_categorical=True.
cats = df.select_dtypes(exclude=np.number).columns.tolist()
df = df.astype({col: 'category' for col in cats})
# Display the resulting dtypes to confirm the conversion.
df.dtypes

ID                               int64
LIMIT_BAL                        int64
SEX                           category
EDUCATION                     category
MARRIAGE                      category
AGE                              int64
PAY_0                            int64
PAY_2                            int64
PAY_3                            int64
PAY_4                            int64
PAY_5                            int64
PAY_6                            int64
BILL_AMT1                        int64
BILL_AMT2                        int64
BILL_AMT3                        int64
BILL_AMT4                        int64
BILL_AMT5                        int64
BILL_AMT6                        int64
PAY_AMT1                         int64
PAY_AMT2                         int64
PAY_AMT3                         int64
PAY_AMT4                         int64
PAY_AMT5                         int64
PAY_AMT6                         int64
default-payment-next-month     float64
dtype: object

In [17]:
# Split the frame on whether the target is known:
#   - rows with a non-null target go to df_training
#   - rows with a null target go to df_prediction
df_training = df.loc[df['default-payment-next-month'].notna()]
df_prediction = df.loc[df['default-payment-next-month'].isna()]

# Features: everything except the row identifier and the target itself.
X = df_training.drop(columns=['ID', 'default-payment-next-month'])
# Target column.
y = df_training['default-payment-next-month']

## Creating Train and Test Datasets

In [18]:
# Hold out 30% of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)
# Wrap both partitions in XGBoost DMatrix objects; enable_categorical=True
# lets XGBoost consume the pandas 'category' columns directly.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [19]:
# Booster hyperparameters: only the learning objective is set here
# (logistic loss for a binary target); everything else is left at the
# XGBoost defaults.
params = dict(
    objective="binary:logistic",
    # tree_method="gpu_hist",  # disabled; presumably meant for GPU training
)

In [20]:
# Number of boosting rounds used for the baseline model.
n = 100
# Fit the booster on the training DMatrix with the hyperparameters above.
model = xgb.train(params, dtrain_reg, num_boost_round=n)

In [21]:
# RMSE Validation
# Score the held-out DMatrix with the trained booster. The params cell sets
# the "binary:logistic" objective, so per the XGBoost docs these predictions
# are probabilities and the RMSE compares them against the 0/1 labels.
preds = model.predict(dtest_reg)
# Take the square root explicitly instead of passing squared=False: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises TypeError. np.sqrt(MSE) yields the same value on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 0.405


In [22]:
# F1_Score Validation

In [23]:
# Fairness Validation