In [1]:
import sys
# !{sys.executable} -m pip install altair
# !{sys.executable} -m pip install altair_data_server
# !{sys.executable} -m pip install prince
# !{sys.executable} -m pip install xgboost
# !{sys.executable} -m pip install tensorflow
# !{sys.executable} -m pip install keras
# !{sys.executable} -m pip install scikeras

import pandas as pd
import numpy as np
import altair as alt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
#alt.data_transformers.enable('data_server')
alt.data_transformers.enable('json') 
import utils

RANDOM_SEED = 42

In [2]:
default_df = pd.read_excel('assets/default of credit card clients.xls', header=1, index_col=0)
default_df.rename(columns={'default payment next month': 'default', 'PAY_0': 'PAY_1'}, inplace=True)
default_df.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

In [4]:
# data transformation
def_cat_df = default_df.copy()
def_cat_df['AGE_GROUP'] = (def_cat_df.AGE / 10).apply(np.floor).astype(int)
def_cat_df['PAY_1_DUE'] = np.where(def_cat_df['PAY_1'] <= 0, 'YES', 'NO')
def_cat_df['PAY_2_DUE'] = np.where(def_cat_df['PAY_2'] <= 0, 'YES', 'NO')
def_cat_df['PAY_3_DUE'] = np.where(def_cat_df['PAY_3'] <= 0, 'YES', 'NO')
def_cat_df['PAY_4_DUE'] = np.where(def_cat_df['PAY_4'] <= 0, 'YES', 'NO')
def_cat_df['PAY_5_DUE'] = np.where(def_cat_df['PAY_5'] <= 0, 'YES', 'NO')
def_cat_df['PAY_6_DUE'] = np.where(def_cat_df['PAY_6'] <= 0, 'YES', 'NO')
def_cat_df = pd.get_dummies(def_cat_df, columns=['PAY_1_DUE', 'PAY_2_DUE', 'PAY_3_DUE', 'PAY_4_DUE', 'PAY_5_DUE', 'PAY_6_DUE'], drop_first=True)
def_cat_df.drop(columns=['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'AGE'], inplace=True)
def_cat_df['SEX'] = np.where(def_cat_df['SEX'] == 1, 'MALE', 'FEMALE')
def_cat_df = pd.get_dummies(def_cat_df, columns=['SEX'], drop_first=True)
def_cat_df = pd.get_dummies(def_cat_df, columns=['EDUCATION', 'MARRIAGE', 'AGE_GROUP'])
def_cat_df.head()

Unnamed: 0_level_0,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,...,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3,AGE_GROUP_2,AGE_GROUP_3,AGE_GROUP_4,AGE_GROUP_5,AGE_GROUP_6,AGE_GROUP_7
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,3913,3102,689,0,0,0,0,689,0,...,0,1,0,0,1,0,0,0,0,0
2,120000,2682,1725,2682,3272,3455,3261,0,1000,1000,...,0,0,1,0,1,0,0,0,0,0
3,90000,29239,14027,13559,14331,14948,15549,1518,1500,1000,...,0,0,1,0,0,1,0,0,0,0
4,50000,46990,48233,49291,28314,28959,29547,2000,2019,1200,...,0,1,0,0,0,1,0,0,0,0
5,50000,8617,5670,35835,20940,19146,19131,2000,36681,10000,...,0,1,0,0,0,0,0,1,0,0


### Add in additional variables
- Indicator whether a user make all the payment continuously on all month derived from all Pay_x variables.

In [5]:
def_cat_df_added=def_cat_df.copy()
def_cat_df_added['Continuouse Indicator']=np.where((def_cat_df_added.PAY_AMT1>0) & (def_cat_df_added.PAY_AMT2>0)&(def_cat_df_added.PAY_AMT3>0)&(def_cat_df_added.PAY_AMT4>0)&(def_cat_df_added.PAY_AMT5>0)&(def_cat_df_added.PAY_AMT6>0), 
   "YES", 
   "NO")

def_cat_df_added = pd.get_dummies(def_cat_df_added, columns=['Continuouse Indicator'], drop_first=True)

## Random Forest Classifier experiment

In [6]:
RANDOM_SEED = 42
X = def_cat_df.drop(columns=['default'])
y = def_cat_df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)

utils.get_classifier_summary(clsf_type='Random Forest Classifier', clsf=rf_clf, X_test=X_test, y_test=y_test)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5470,403
Negative (actual),1052,575


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.59,0.35,0.44,0.75


Let's add the newly added variable and see the performance.

In [7]:
RANDOM_SEED = 42
X = def_cat_df_added.drop(columns=['default'])
y = def_cat_df_added['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)

utils.get_classifier_summary(clsf_type='Random Forest Classifier', clsf=rf_clf, X_test=X_test, y_test=y_test)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5478,395
Negative (actual),1047,580


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.59,0.36,0.45,0.76


## Let's use grid-search together with cross validation to find the best parameters for Random Forest Classifier.

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
#First, let's split the training data and testing data into equally two parts, and one used for in grid-search, and one used for cross validation to avoid data leakage.
RANDOM_SEED = 42
X = def_cat_df_added.drop(columns=['default'])
y = def_cat_df_added['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=RANDOM_SEED)



#Let's create the list of hyper-parameters that we would like to tune

# Number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 100, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
               #,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}


rf_clf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rd_gridcv= GridSearchCV(rf_clf, random_grid, scoring = 'f1', cv = 5,n_jobs=-1)
#rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = RANDOM_SEED, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rd_gridcv.fit(X_train, y_train)
print("Best parameters set found on development set:")
print(rd_gridcv.best_params_)

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, rd_gridcv.predict(X_test)
print(classification_report(y_true, y_pred))
print()
print("The model f1 score is:")
f1_score(y_true, y_pred)

Best parameters set found on development set:
{'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 100}
The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.84      0.93      0.88     11725
           1       0.61      0.36      0.45      3275

    accuracy                           0.81     15000
   macro avg       0.72      0.65      0.67     15000
weighted avg       0.79      0.81      0.79     15000


The model f1 score is:


0.45198389879240947

## Gradient Boosting Classifier experiment (all features)

In [9]:
X = def_cat_df.drop(columns=['default'])
y = def_cat_df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

gbc = GradientBoostingClassifier(max_depth=8, random_state=RANDOM_SEED).fit(X_train, y_train)

utils.get_classifier_summary(clsf_type='Gradient Boosting Classifier', clsf=gbc, X_test=X_test, y_test=y_test)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5465,408
Negative (actual),1017,610


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.6,0.37,0.46,0.76


## Gradient Boosting Classifier experiment (all features +new created feature)

In [10]:
X = def_cat_df_added.drop(columns=['default'])
y = def_cat_df_added['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

gbc = GradientBoostingClassifier(max_depth=8, random_state=RANDOM_SEED).fit(X_train, y_train)

utils.get_classifier_summary(clsf_type='Gradient Boosting Classifier', clsf=gbc, X_test=X_test, y_test=y_test)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5474,399
Negative (actual),1028,599


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.6,0.37,0.46,0.76


## Gradient Boosting Classifier experiment (high importance features only)

In [11]:
## Let's now remove features with low importance and re-run model training on features with higher importance

low_imp_feat = ['SEX_MALE', 'MARRIAGE_0', 'MARRIAGE_1', 'MARRIAGE_2', 'MARRIAGE_3',
                    'EDUCATION_0', 'EDUCATION_1', 'EDUCATION_2', 'EDUCATION_3', 'EDUCATION_4',
                    'EDUCATION_5', 'EDUCATION_6', 'AGE_GROUP_2', 'AGE_GROUP_3', 'AGE_GROUP_4', 'AGE_GROUP_5', 'AGE_GROUP_6', 'AGE_GROUP_7']

X = def_cat_df.drop(columns=['default'] + low_imp_feat)
y = def_cat_df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

gbc_high_imp = GradientBoostingClassifier(max_depth=8, random_state=RANDOM_SEED).fit(X_train, y_train)

utils.get_classifier_summary(clsf_type='Gradient Boosting Classifier (high importance features)', clsf=gbc_high_imp, X_test=X_test, y_test=y_test)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5470,403
Negative (actual),1025,602


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.6,0.37,0.46,0.76


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
#First, let's split the training data and testing data into equally two parts, and one used for in grid-search, and one used for cross validation to avoid data leakage.
RANDOM_SEED = 42
X = def_cat_df_added.drop(columns=['default'])
y = def_cat_df_added['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=RANDOM_SEED)



#Let's create the list of hyper-parameters that we would like to tune

# Number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start = 5, stop = 50, num = 3)]
# Number of features to consider at every split
learning_rate= [0.01, 0.05,  0.1]
subsample=[0.5, 0.7, 1.0]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 15, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
# min_samples_split = [2, 4, 8]
# Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# Create the random grid
grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'subsample':subsample,
               'learning_rate':learning_rate}
               
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,



gb_clf = GradientBoostingClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
gb_gridcv= GridSearchCV(gb_clf, grid, scoring = 'f1', cv = 5,n_jobs=-1)
# Fit the random search model
gb_gridcv.fit(X_train, y_train)
print("Best parameters set found on development set:")
print(gb_gridcv.best_params_)

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, gb_gridcv.predict(X_test)
print(classification_report(y_true, y_pred))
print()
print("The model f1 score is:")
f1_score(y_true, y_pred)

Best parameters set found on development set:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.7}
The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.84      0.94      0.89     11725
           1       0.63      0.37      0.47      3275

    accuracy                           0.82     15000
   macro avg       0.74      0.66      0.68     15000
weighted avg       0.80      0.82      0.80     15000


The model f1 score is:


0.4704977897366903

## XGBoost Classifier experiment

In [13]:
def_cat_df = def_cat_df.astype(int)
X = def_cat_df.drop(columns=['default'])
y = def_cat_df['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)
xgbc = XGBClassifier(objective='binary:logistic', use_label_encoder=False, random_state=RANDOM_SEED, gamma=0.25).fit(X_train, y_train)

utils.get_classifier_summary('XGBoost Classifier', xgbc, X_test=X_test, y_test=y_test)



Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),5479,394
Negative (actual),1027,600


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.6,0.37,0.46,0.76


## Neural Network classifier experiment

In [14]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential

numeric_cols = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
                'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
                'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

def_cat_df = def_cat_df.astype(int)
X = def_cat_df.drop(columns=['default'])
y = def_cat_df['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

scaler = StandardScaler()
# scaler.fit(X)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)
scaler.fit(X[numeric_cols])
X_train_num_scaled = scaler.transform(X_train[numeric_cols])
X_train = np.concatenate([X_train_num_scaled, X_train.drop(columns=numeric_cols).to_numpy()], axis=1)
X_test_num_scaled = scaler.transform(X_test[numeric_cols])
X_test = np.concatenate([X_test_num_scaled, X_test.drop(columns=numeric_cols).to_numpy()], axis=1)

def create_model():
    model = Sequential()
    model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.2, seed=RANDOM_SEED))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model, epochs=20, batch_size=32, verbose=1)
model.fit(X_train, y_train, validation_data=(X_test, y_test))

  model = KerasClassifier(build_fn=create_model, epochs=20, batch_size=32, verbose=1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2b7e87faf760>

## Testing models and feature (scaling)

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
temp_df = default_df.copy()
temp_X_df = temp_df.drop('default',axis=1)
temp_y_df = temp_df[['default']]

In [21]:
train_df, val_df, test_df = np.split(temp_df.sample(frac=1,random_state=RANDOM_SEED), [int(.6*len(temp_df)), int(.8*len(temp_df))])

X_train, y_train = train_df.drop('default', axis=1), train_df['default']
X_val, y_val = val_df.drop('default', axis=1), val_df['default']
X_test, y_test = test_df.drop('default', axis=1), test_df['default']


In [22]:
scaler = StandardScaler().fit(X_train)
scaled_X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
scaled_X_val = pd.DataFrame(scaler.transform(X_val), index=X_val.index, columns=X_val.columns)
scaled_X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [23]:
rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)

utils.get_classifier_summary(clsf_type='Random Forest Classifier', clsf=rf_clf, X_test=X_val, y_test=y_val)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),4366,244
Negative (actual),874,516


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.81,0.68,0.37,0.48,0.77


In [24]:
gb_clf = GradientBoostingClassifier(max_depth=8, random_state=RANDOM_SEED).fit(X_train, y_train)

utils.get_classifier_summary(clsf_type='Gradient Boosting Classifier', clsf=gb_clf, X_test=X_val, y_test=y_val)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),4374,236
Negative (actual),867,523


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.82,0.69,0.38,0.49,0.77


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

In [52]:
lr = LogisticRegression(random_state=RANDOM_SEED,solver='lbfgs', max_iter=3000).fit(X_train,y_train)
gnb = GaussianNB().fit(X_train,y_train)
svc = LinearSVC(random_state=RANDOM_SEED, max_iter=3000,dual=False).fit(X_train,y_train)

scaled_lr = LogisticRegression(random_state=RANDOM_SEED,solver='lbfgs', max_iter=3000).fit(scaled_X_train,y_train)
scaled_gnb = GaussianNB().fit(scaled_X_train,y_train)
scaled_svc = LinearSVC(random_state=RANDOM_SEED, max_iter=3000,dual=False).fit(scaled_X_train,y_train)

In [53]:
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score, \
    precision_recall_curve

def get_classifier_summary(clsf_type, clsf, X_test, y_test):

    y_predicted = clsf.predict(X_test)
    confusion = confusion_matrix(y_test, y_predicted)
    confusion_df = pd.DataFrame(data=confusion, columns=['Positive (predicted)', 'Negative (predicted)'],
                                index=['Positive (actual)', 'Negative (actual)'])
    print('Confusion matrix')
    display(confusion_df)

    y_pred_proba = clsf.predict_proba(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    metrics_df = pd.DataFrame(data={'Accuracy': [accuracy_score(y_test, y_predicted)],
                                    'Precision': [precision_score(y_test, y_predicted)],
                                    'Recall': [recall_score(y_test, y_predicted)],
                                    'F1 score': [f1_score(y_test, y_predicted)],
                                    'ROC AUC': [roc_auc]})

    print('Classifier metrics')
    display(metrics_df.round(2))

    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba[:, 1])
    plot_metrics(fpr=fpr, tpr=tpr, prec=precision, rec=recall)

def plot_metrics(fpr, tpr, prec, rec):
    roc_df = pd.DataFrame(data={'fpr': fpr, 'tpr': tpr})
    prec_rec_df = pd.DataFrame(data={'prec': prec, 'rec': rec})
    line_df = pd.DataFrame({'fpr': [0, 1], 'tpr': [0, 1]})

    roc_ch = alt.Chart(data=roc_df, title='ROC curve').mark_line().encode(
        x=alt.X('fpr:Q', title='False Positive Rate'),
        y=alt.Y('tpr:Q', title='True Positive Rate')
    ).properties(width=250, height=300)

    line_ch = alt.Chart(data=line_df).mark_line(strokeWidth=0.5, strokeDash=[5, 5]).encode(
        x=alt.X('fpr:Q'),
        y=alt.Y('tpr:Q')
    )

    prec_rec_ch = alt.Chart(data=prec_rec_df, title='Precision-Recall curve').mark_line().encode(
        x=alt.X('prec:Q', title='Precision'),
        y=alt.Y('rec:Q', title='Recall')
    ).properties(width=250, height=300)

    ((roc_ch + line_ch) | prec_rec_ch).display()

In [54]:
get_classifier_summary(clsf_type='Logistic Regression', clsf=lr, X_test=X_val, y_test=y_val)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),4609,1
Negative (actual),1390,0


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.77,0.0,0.0,0.0,0.63


In [48]:
get_classifier_summary(clsf_type='Gaussian NB', clsf=gnb, X_test=X_val, y_test=y_val)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),1280,3330
Negative (actual),213,1177


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.41,0.26,0.85,0.4,0.66


In [49]:
get_classifier_summary(clsf_type='Scaled Logistic Regression', clsf=lr, X_test=scaled_X_val, y_test=y_val)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),2110,2500
Negative (actual),620,770


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.48,0.24,0.55,0.33,0.53


In [50]:
get_classifier_summary(clsf_type='Scaled Gaussian NB', clsf=gnb, X_test=scaled_X_val, y_test=y_val)

Confusion matrix


Unnamed: 0,Positive (predicted),Negative (predicted)
Positive (actual),0,4610
Negative (actual),0,1390


Classifier metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 score,ROC AUC
0,0.23,0.23,1.0,0.38,0.67
