In [1]:
!pip install pandas
!pip install scikit-learn
!pip install imblearn
!pip install tensorflow
!pip install xgboost
!pip install keras_tuner
!pip install shap
!pip install scikeras



In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import shap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc, roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from keras_tuner import RandomSearch
from tensorflow.keras.layers import Input, concatenate, Dense, Dropout, BatchNormalization, LSTM, Bidirectional
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier

In [37]:
""" Standard Preprocessing
"""
CreditRisk =pd.read_csv('credit_risk_dataset.csv')

# Filter age and employment length
crData = CreditRisk[(CreditRisk['person_age'] <= 70) & (CreditRisk['person_emp_length'] < 47)].copy()

# Fill missing values and drop 'loan_grade' column
crData.loc[:, 'loan_int_rate'] = crData['loan_int_rate'].fillna(crData['loan_int_rate'].median())
crDataCopy = crData.drop('loan_grade', axis=1)

display(crDataCopy.shape)
crDataCopy.head()

(31671, 11)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2


## Categorical Features Treament

In [38]:
crdataCat_tmnt = crDataCopy.copy()
person_home_ownership = pd.get_dummies(crdataCat_tmnt['person_home_ownership'], drop_first=True).astype(int)
loan_intent = pd.get_dummies(crdataCat_tmnt['loan_intent'], drop_first=True).astype(int)

# Convert default_on_file to binary
crdataCat_tmnt['cb_person_default_on_file_binary'] = np.where(crdataCat_tmnt['cb_person_default_on_file'] == 'Y', 1, 0)

# Data scaling
numeric_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
                   'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(crdataCat_tmnt[numeric_columns]), 
                         columns=numeric_columns, index=crdataCat_tmnt.index)

# Combine scaled and categorical data
scaled_data_combined = pd.concat([scaled_df, person_home_ownership, loan_intent], axis=1)
scaled_data_combined['cb_person_default_on_file'] = crdataCat_tmnt['cb_person_default_on_file_binary']
scaled_data_combined['loan_status'] = crdataCat_tmnt['loan_status']

# Separate features and target
target = scaled_data_combined['loan_status']
features = scaled_data_combined.drop('loan_status', axis=1)

features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file
1,-1.090587,-1.078051,0.054432,-1.367192,0.034115,-0.655113,-0.939656,0,1,0,1,0,0,0,0,0
2,-0.441211,-1.078051,-0.938456,-0.65681,0.597575,3.767461,-0.692664,0,0,0,0,0,1,0,0,0
3,-0.765899,-0.018803,-0.19379,4.000141,1.366226,3.391072,-0.939656,0,0,1,0,0,1,0,0,0
4,-0.603555,-0.229137,0.799097,4.000141,1.053554,3.579267,-0.445671,0,0,1,0,0,1,0,0,1
5,-1.090587,-1.072366,-0.690234,-1.130398,-1.268682,0.756347,-0.939656,0,1,0,0,0,0,0,1,0


In [39]:
smote= SMOTE()
balanced_features, balanced_target = smote.fit_resample(features, target)
print ("Shape of Balanced target:", balanced_target.shape)
print("Class distribution:")
print(pd.Series(balanced_target).value_counts())

Shape of Balanced target: (49692,)
Class distribution:
loan_status
0    24846
1    24846
Name: count, dtype: int64


## Traditional Models Training

In [40]:
x_train, x_test, y_train, y_test = train_test_split (balanced_features, balanced_target, test_size=0.20, random_state=42)# Split the data


# Define pipelines for each model
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(tree_method='exact', random_state=42))
])



# Fit pipelines
rf_pipeline.fit(x_train, y_train)
xgb_pipeline.fit(x_train, y_train)


rf_prediction = rf_pipeline.predict(x_test)
xgb_prediction = xgb_pipeline.predict(x_test)

# Get cross-validated predictions for training data
rf_cv_pred = cross_val_predict(rf_pipeline, x_train, y_train, cv=5, method='predict_proba')[:, 1]
xgb_cv_pred = cross_val_predict(xgb_pipeline, x_train, y_train, cv=5, method='predict_proba')[:, 1]

# Implement Wide & Deep Network with RF and XGBoost predictions as input
def build_wide_and_deep_model(input_dim):
    # Original features input
    feature_input = Input(shape=(input_dim,), name='feature_input')
    
    # RF and XGBoost predictions input
    model_preds_input = Input(shape=(2,), name='model_preds_input')
    
    # Wide part
    wide_output = Dense(1, activation='sigmoid')(model_preds_input)

    # Deep part
    deep = Dense(64, activation='relu')(feature_input)
    deep = Dense(32, activation='relu')(deep)
    deep = Dense(16, activation='relu')(deep)
    deep_output = Dense(1, activation='sigmoid')(deep)

    # Combine wide and deep
    combined_output = concatenate([wide_output, deep_output])
    final_output = Dense(1, activation='sigmoid')(combined_output)

    model = Model(inputs=[feature_input, model_preds_input], outputs=final_output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Prepare data for Wide & Deep Network
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Combine original features with RF and XGBoost predictions
x_train_combined = [x_train_scaled, np.column_stack((rf_cv_pred, xgb_cv_pred))]
x_test_combined = [x_test_scaled, np.column_stack((rf_pipeline.predict_proba(x_test)[:, 1], 
                                                   xgb_pipeline.predict_proba(x_test)[:, 1]))]

# Build and train Wide & Deep Network
wd_model = build_wide_and_deep_model(x_train.shape[1])
wd_model.fit(x_train_combined, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# Make predictions with Wide & Deep Network
wd_prediction = (wd_model.predict(x_test_combined) > 0.5).astype(int).flatten()

[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [41]:
""" Model Comparison"""
def get_metrics(y_true, y_pred):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred)
    }

models = {
    'Random Forest': rf_prediction,
    'XGBoost': xgb_prediction,
    'Wide & Deep': wd_prediction
}

comparison = pd.DataFrame({name: get_metrics(y_test, pred) for name, pred in models.items()}).T
print('Model comparison:')
print(comparison)


# Cross-validation
cv_scores = {}
for name, pipeline in [('Random Forest', rf_pipeline), ('XGBoost', xgb_pipeline)]:
    scores = cross_val_score(pipeline, balanced_features, balanced_target, cv=5, scoring='accuracy')
    cv_scores[name] = scores.mean()

print('\nCross-validation scores:')
print(pd.Series(cv_scores))


Model comparison:
               Accuracy  f1_score    Recall  Precision
Random Forest  0.937217  0.934973  0.907362   0.964316
XGBoost        0.947278  0.945073  0.911812   0.980853
Wide & Deep    0.951102  0.949824  0.930421   0.970055

Cross-validation scores:
Random Forest    0.838927
XGBoost          0.847702
dtype: float64


In [None]:
features_imp=pd.concat([features_imp_rf,features_imp_xgb],axis=1)
features_imp