In [1]:
import sys
# Install the notebook's third-party dependencies by running pip as a module of
# the interpreter behind this kernel (sys.executable).
# NOTE(review): no versions are pinned, so a re-run may pull different releases.
!{sys.executable} -m pip install altair
!{sys.executable} -m pip install altair_data_server
!{sys.executable} -m pip install prince
!{sys.executable} -m pip install xgboost
!{sys.executable} -m pip install tensorflow
!{sys.executable} -m pip install keras
!{sys.executable} -m pip install scikeras

import pandas as pd
import numpy as np
import altair as alt
from xgboost import XGBClassifier
# Switch Altair to its 'data_server' data transformer
# (presumably the one provided by the altair_data_server package installed above).
alt.data_transformers.enable('data_server')
# Project-local helper module; its get_classifier_summary is used below to
# report each classifier's results.
import utils

# Seed reused by the train/test splits and the models in the cells below.
RANDOM_SEED = 42



In [2]:
# Read the raw spreadsheet: the second sheet row (header=1) supplies the column
# names and the first column becomes the index.
# The verbose target column is shortened to 'default', and PAY_0 is renamed to
# PAY_1 so the payment-status columns are numbered PAY_1..PAY_6.
column_renames = {'default payment next month': 'default', 'PAY_0': 'PAY_1'}
default_df = pd.read_excel(
    'assets/default of credit card clients.xls',
    header=1,
    index_col=0,
).rename(columns=column_renames)
default_df.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

In [4]:
# Data transformation: derive categorical features from the raw frame and
# one-hot encode them. Works on a copy, so default_df itself is left untouched.
pay_status_cols = [f'PAY_{month}' for month in range(1, 7)]
pay_due_cols = [f'{status_col}_DUE' for status_col in pay_status_cols]

def_cat_df = default_df.copy()

# Bucket age by decade (e.g. 24 -> 2, 37 -> 3).
def_cat_df['AGE_GROUP'] = np.floor(def_cat_df['AGE'] / 10).astype(int)

# Collapse each monthly payment status into a YES/NO flag (YES when status <= 0).
for status_col, due_col in zip(pay_status_cols, pay_due_cols):
    def_cat_df[due_col] = np.where(def_cat_df[status_col] <= 0, 'YES', 'NO')
def_cat_df = pd.get_dummies(def_cat_df, columns=pay_due_cols, drop_first=True)

# The raw status columns and AGE are now represented by the derived features.
def_cat_df = def_cat_df.drop(columns=pay_status_cols + ['AGE'])

# Label SEX before encoding so the surviving dummy column gets a readable name.
def_cat_df['SEX'] = np.where(def_cat_df['SEX'] == 1, 'MALE', 'FEMALE')
def_cat_df = pd.get_dummies(def_cat_df, columns=['SEX'], drop_first=True)

# Full one-hot encoding (no dropped level) for the remaining categoricals.
def_cat_df = pd.get_dummies(def_cat_df, columns=['EDUCATION', 'MARRIAGE', 'AGE_GROUP'])
def_cat_df.head()

Unnamed: 0_level_0,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,...,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3,AGE_GROUP_2,AGE_GROUP_3,AGE_GROUP_4,AGE_GROUP_5,AGE_GROUP_6,AGE_GROUP_7
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,3913,3102,689,0,0,0,0,689,0,...,0,1,0,0,1,0,0,0,0,0
2,120000,2682,1725,2682,3272,3455,3261,0,1000,1000,...,0,0,1,0,1,0,0,0,0,0
3,90000,29239,14027,13559,14331,14948,15549,1518,1500,1000,...,0,0,1,0,0,1,0,0,0,0
4,50000,46990,48233,49291,28314,28959,29547,2000,2019,1200,...,0,1,0,0,0,1,0,0,0,0
5,50000,8617,5670,35835,20940,19146,19131,2000,36681,10000,...,0,1,0,0,0,0,0,1,0,0


## Random Forest Classifier experiment

In [5]:
# Split the engineered frame into features/target, hold out a test set and fit
# a random forest; only the seed is set, everything else uses library defaults.
y = def_cat_df['default']
X = def_cat_df.drop(columns=['default'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

rf_clf = RandomForestClassifier(random_state=RANDOM_SEED)
rf_clf.fit(X_train, y_train)

utils.get_classifier_summary(
    clsf_type='Random Forest Classifier',
    clsf=rf_clf,
    X_test=X_test,
    y_test=y_test,
)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5470,403
Negative (actual),1052,575


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.59,0.35,0.44,0.75


## Gradient Boosting Classifier experiment (all features)

In [6]:
# Same split as the previous experiment (same seed), this time fitting a
# gradient boosting classifier with trees of depth 8 on all features.
y = def_cat_df['default']
X = def_cat_df.drop(columns=['default'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

gbc = GradientBoostingClassifier(max_depth=8, random_state=RANDOM_SEED)
gbc.fit(X_train, y_train)

utils.get_classifier_summary(
    clsf_type='Gradient Boosting Classifier',
    clsf=gbc,
    X_test=X_test,
    y_test=y_test,
)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5466,407
Negative (actual),1017,610


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.6,0.37,0.46,0.76


## Gradient Boosting Classifier experiment (high importance features only)

In [7]:
# Retrain the gradient boosting model without the features considered
# low-importance: the sex, marriage, education and age-group dummy columns.
low_imp_feat = [
    'SEX_MALE',
    'MARRIAGE_0', 'MARRIAGE_1', 'MARRIAGE_2', 'MARRIAGE_3',
    'EDUCATION_0', 'EDUCATION_1', 'EDUCATION_2', 'EDUCATION_3',
    'EDUCATION_4', 'EDUCATION_5', 'EDUCATION_6',
    'AGE_GROUP_2', 'AGE_GROUP_3', 'AGE_GROUP_4',
    'AGE_GROUP_5', 'AGE_GROUP_6', 'AGE_GROUP_7',
]

y = def_cat_df['default']
X = def_cat_df.drop(columns=['default', *low_imp_feat])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

gbc_high_imp = GradientBoostingClassifier(max_depth=8, random_state=RANDOM_SEED)
gbc_high_imp.fit(X_train, y_train)

utils.get_classifier_summary(
    clsf_type='Gradient Boosting Classifier (high importance features)',
    clsf=gbc_high_imp,
    X_test=X_test,
    y_test=y_test,
)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5471,402
Negative (actual),1024,603


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.6,0.37,0.46,0.76


## XGBoost Classifier experiment

In [8]:
# Cast every column (including the dummy columns) to int before handing the
# frame to XGBoost, then split with the shared seed.
def_cat_df = def_cat_df.astype(int)
y = def_cat_df['default']
X = def_cat_df.drop(columns=['default'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

xgbc = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    random_state=RANDOM_SEED,
    gamma=0.25,
)
xgbc.fit(X_train, y_train)

utils.get_classifier_summary('XGBoost Classifier', xgbc, X_test=X_test, y_test=y_test)



Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5479,394
Negative (actual),1027,600


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.6,0.37,0.46,0.76


## Neural Network classifier experiment

In [12]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential

# Continuous monetary columns that get standardised; every other feature is a
# 0/1 dummy column and is passed through unscaled.
numeric_cols = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
                'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
                'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

def_cat_df = def_cat_df.astype(int)
X = def_cat_df.drop(columns=['default'])
y = def_cat_df['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

# Fit the scaler on the training split only. Fitting on the full data set would
# leak the test set's mean/variance into the training features and make the
# held-out evaluation optimistic.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

# Rebuild each split as a NumPy array: scaled numeric columns first, followed by
# the untouched dummy columns. Train and test use the same column order.
X_train_num_scaled = scaler.transform(X_train[numeric_cols])
X_train = np.concatenate([X_train_num_scaled, X_train.drop(columns=numeric_cols).to_numpy()], axis=1)
X_test_num_scaled = scaler.transform(X_test[numeric_cols])
X_test = np.concatenate([X_test_num_scaled, X_test.drop(columns=numeric_cols).to_numpy()], axis=1)

def create_model():
    """Build and compile the feed-forward network used for the NN experiment.

    Architecture: a 32-unit ReLU hidden layer sized to the number of columns in
    the notebook-level ``X_train``, 20% dropout (seeded with ``RANDOM_SEED``),
    and a single sigmoid output unit. Compiled with binary cross-entropy loss,
    the Adam optimizer and accuracy as the tracked metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    n_features = X_train.shape[1]
    network = Sequential([
        Dense(32, input_dim=n_features, activation='relu'),
        Dropout(0.2, seed=RANDOM_SEED),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Wrap the Keras network in the scikit-learn style classifier and train it for
# 20 epochs; the held-out split is passed as validation data for the fit.
# NOTE(review): the recorded output echoes the constructor line as a warning,
# presumably the keras.wrappers.scikit_learn deprecation - confirm and consider
# migrating to SciKeras.
model = KerasClassifier(
    build_fn=create_model,
    epochs=20,
    batch_size=32,
    verbose=1,
)
model.fit(X_train, y_train, validation_data=(X_test, y_test))

  model = KerasClassifier(build_fn=create_model, epochs=20, batch_size=32, verbose=1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x14940bf9ee0>