In [1]:
!pip install pandas
!pip install scikit-learn
!pip install imblearn
!pip install tensorflow
!pip install xgboost

Collecting pandas
  Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.2.2 tzdata-2024.1
Collecting scikit-learn
  Using cached scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Using cached scipy

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
import joblib

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


2024-07-31 15:10:49.506714: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-31 15:10:49.508775: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-31 15:10:49.512065: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-31 15:10:49.522942: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 15:10:49.541247: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

In [3]:
""" Standard Preprocessing
"""
CreditRisk =pd.read_csv('credit_risk_dataset.csv')

# Filter age and employment length
crData = CreditRisk[(CreditRisk['person_age'] <= 70) & (CreditRisk['person_emp_length'] < 47)].copy()

# Fill missing values and drop 'loan_grade' column
crData.loc[:, 'loan_int_rate'] = crData['loan_int_rate'].fillna(crData['loan_int_rate'].median())
crDataCopy = crData.drop('loan_grade', axis=1)

display(crDataCopy.shape)
crDataCopy.head()

(31671, 11)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2


## Categorical Features Treament

In [4]:
crdataCat_tmnt = crDataCopy.copy()
person_home_ownership = pd.get_dummies(crdataCat_tmnt['person_home_ownership'], drop_first=True).astype(int)
loan_intent = pd.get_dummies(crdataCat_tmnt['loan_intent'], drop_first=True).astype(int)

# Convert default_on_file to binary
crdataCat_tmnt['cb_person_default_on_file_binary'] = np.where(crdataCat_tmnt['cb_person_default_on_file'] == 'Y', 1, 0)

# Data scaling
numeric_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
                   'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(crdataCat_tmnt[numeric_columns]), 
                         columns=numeric_columns, index=crdataCat_tmnt.index)

# Combine scaled and categorical data
scaled_data_combined = pd.concat([scaled_df, person_home_ownership, loan_intent], axis=1)
scaled_data_combined['cb_person_default_on_file'] = crdataCat_tmnt['cb_person_default_on_file_binary']
scaled_data_combined['loan_status'] = crdataCat_tmnt['loan_status']

# Separate features and target
target = scaled_data_combined['loan_status']
features = scaled_data_combined.drop('loan_status', axis=1)

features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file
1,-1.090587,-1.078051,0.054432,-1.367192,0.034115,-0.655113,-0.939656,0,1,0,1,0,0,0,0,0
2,-0.441211,-1.078051,-0.938456,-0.65681,0.597575,3.767461,-0.692664,0,0,0,0,0,1,0,0,0
3,-0.765899,-0.018803,-0.19379,4.000141,1.366226,3.391072,-0.939656,0,0,1,0,0,1,0,0,0
4,-0.603555,-0.229137,0.799097,4.000141,1.053554,3.579267,-0.445671,0,0,1,0,0,1,0,0,1
5,-1.090587,-1.072366,-0.690234,-1.130398,-1.268682,0.756347,-0.939656,0,1,0,0,0,0,0,1,0


## SMOTE - Synthetic Minority Over-Sampling technique

In [5]:
smote= SMOTE()
balanced_features, balanced_target = smote.fit_resample(features, target)
print ("Shape of Balanced target:", balanced_target.shape)
print("Class distribution:")
print(pd.Series(balanced_target).value_counts())

Shape of Balanced target: (49692,)
Class distribution:
loan_status
0    24846
1    24846
Name: count, dtype: int64


## Traditional Models Training

In [None]:
x_train, x_test, y_train, y_test = train_test_split (balanced_features, balanced_target, test_size=0.20, random_state=42)# Split the data

"""RF Model"""
rf= RandomForestClassifier ()
rf.fit (x_train, y_train)
print(rf.score(x_train, y_train))
rf_prediction = rf.predict(x_test)
features_imp_rf = pd.DataFrame ({'features' :balanced_features.columns,'rf_imp' : rf.feature_importances_})


""" XG Boost"""
xgb_model = XGBClassifier(tree_method = 'exact')
#model.fit(x,y.values.ravel())
xgb_model.fit(x_train,y_train.values.ravel())
print (xgb_model.score(x_train,y_train.values.ravel()))
xgb_prediction = xgb_model.predict(x_test)
features_imp_xgb = pd.DataFrame ({'features' :balanced_features.columns,'xgb_imp' : xgb_model.feature_importances_})


"""RNN Model"""
# Reshape input data for RNN
timesteps = 1  # Adjust this if want to consider temporal aspects
features_per_timestep = x_train.shape[1]
x_train_rnn = x_train.values.reshape((x_train.shape[0], timesteps, features_per_timestep))
x_test_rnn = x_test.values.reshape((x_test.shape[0], timesteps, features_per_timestep))

def create_rnn_model(input_dim, timesteps):
    model = Sequential([
        Bidirectional(LSTM(32, return_sequences=True), input_shape=(timesteps, input_dim)),
        Bidirectional(LSTM(16)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

rnn_model = create_rnn_model(features_per_timestep, timesteps)
history = rnn_model.fit(x_train_rnn, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=0)
rnn_prediction = (rnn_model.predict(x_test_rnn) > 0.5).astype(int)
#features_imp_rnn = pd.DataFrame ({'features' :balanced_features.columns,'rnn_imp' : rnn_model.feature_importances_})

"""DNN Model"""
def create_dnn_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(8, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
input_dim = x_train.shape[1]
dnn_model = create_dnn_model(input_dim)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = dnn_model.fit(x_train, y_train,validation_split=0.2,epochs=50, batch_size=32,callbacks=[early_stopping],verbose=0)
dnn_predictions = (dnn_model.predict(x_test) > 0.5).astype(int)


1.0
0.96513470681458


  super().__init__(**kwargs)


In [11]:
""" Model Comparison"""
def get_metrics(y_true, y_pred):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred)
    }

models = {
    'Random Forest': rf_prediction,
    'XGBoost': xgb_prediction,
    'RNN': rnn_prediction.flatten(),
    'DNN' : dnn_predictions.flatten()
}

comparison = pd.DataFrame({name: get_metrics(y_test, pred) for name, pred in models.items()}).T

print('Model comparison:')
print(comparison)

# If you need confusion matrices:
#confusion_matrices = {name: confusion_matrix(y_test, pred) for name, pred in models.items()}


Model comparison:
               Accuracy  f1_score    Recall  Precision
Random Forest  0.938827  0.936785  0.911206   0.963843
XGBoost        0.947379  0.945195  0.912217   0.980648
RNN            0.857430  0.852779  0.830097   0.876736
DNN            0.841131  0.830852  0.784385   0.883170


In [7]:
features_imp=pd.concat([features_imp_rf,features_imp_xgb],axis=1)
features_imp

Unnamed: 0,features,rf_imp,features.1,xgb_imp
0,person_age,0.061375,person_age,0.036954
1,person_income,0.150112,person_income,0.039096
2,person_emp_length,0.07516,person_emp_length,0.055949
3,loan_amnt,0.081652,loan_amnt,0.009795
4,loan_int_rate,0.203139,loan_int_rate,0.073065
5,loan_percent_income,0.207453,loan_percent_income,0.124535
6,cb_person_cred_hist_length,0.060456,cb_person_cred_hist_length,0.070651
7,OTHER,0.000455,OTHER,0.009498
8,OWN,0.020408,OWN,0.182591
9,RENT,0.046241,RENT,0.106168
