In [None]:
!pip install pandas
!pip install scikit-learn
!pip install imblearn
!pip install tensorflow
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
import joblib

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
""" Standard Preprocessing
"""
CreditRisk = pd.read_csv('credit_risk_dataset.csv')

# Keep applicants aged 70 or younger with under 47 years of employment.
# Any row whose person_emp_length is missing also fails the `< 47` comparison
# and is therefore dropped here.
age_in_range = CreditRisk['person_age'].le(70)
emp_length_in_range = CreditRisk['person_emp_length'].lt(47)
crData = CreditRisk.loc[age_in_range & emp_length_in_range].copy()

# Impute missing interest rates with the median of the filtered rows,
# then discard the 'loan_grade' column.
median_int_rate = crData['loan_int_rate'].median()
crData['loan_int_rate'] = crData['loan_int_rate'].fillna(median_int_rate)
crDataCopy = crData.drop(columns='loan_grade')

display(crDataCopy.shape)
crDataCopy.head()

## Categorical Features Treatment

In [None]:
# Work on a copy so the frame produced by the preprocessing cell stays untouched.
crdataCat_tmnt = crDataCopy.copy()

# One-hot encode the nominal columns as 0/1 integers; drop_first removes the
# first level of each column so it acts as the baseline category.
home_dummies = pd.get_dummies(crdataCat_tmnt['person_home_ownership'], drop_first=True)
person_home_ownership = home_dummies.astype(int)
intent_dummies = pd.get_dummies(crdataCat_tmnt['loan_intent'], drop_first=True)
loan_intent = intent_dummies.astype(int)

# Encode the default-on-file flag: 'Y' -> 1, anything else -> 0.
has_default_on_file = crdataCat_tmnt['cb_person_default_on_file'] == 'Y'
crdataCat_tmnt['cb_person_default_on_file_binary'] = np.where(has_default_on_file, 1, 0)

# Standardise the numeric columns.
# NOTE(review): the scaler is fitted on every row here, i.e. before the
# train/test split performed later in the notebook.
numeric_columns = [
    'person_age', 'person_income', 'person_emp_length', 'loan_amnt',
    'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(crdataCat_tmnt[numeric_columns])
scaled_df = pd.DataFrame(scaled_values, columns=numeric_columns, index=crdataCat_tmnt.index)

# Assemble the modelling table: scaled numerics, dummy columns, binary flag, target.
encoded_parts = [scaled_df, person_home_ownership, loan_intent]
scaled_data_combined = pd.concat(encoded_parts, axis=1).assign(
    cb_person_default_on_file=crdataCat_tmnt['cb_person_default_on_file_binary'],
    loan_status=crdataCat_tmnt['loan_status'],
)

# Split into the target column and the feature matrix.
target = scaled_data_combined['loan_status']
features = scaled_data_combined.drop(columns='loan_status')

features.head()

## SMOTE - Synthetic Minority Over-sampling Technique

In [None]:
# Oversample the minority class with synthetic rows so both classes are equally
# represented. A fixed random_state makes the synthetic rows (and every score
# computed from them) repeatable across kernel restarts.
# NOTE(review): this resamples the full dataset; any model evaluated on a split
# of `balanced_*` is scored partly on synthetic rows.
smote = SMOTE(random_state=42)
balanced_features, balanced_target = smote.fit_resample(features, target)
print("Shape of Balanced target:", balanced_target.shape)
print("Class distribution:")
print(pd.Series(balanced_target).value_counts())

## Model Training (Traditional and Neural Network Models)

In [None]:
# Split the ORIGINAL (imbalanced) data first so the test fold holds only real
# observations; `stratify` keeps the class ratio the same in both folds.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=42, stratify=target
)
# Oversample the minority class in the training fold only. Resampling before the
# split lets synthetic points interpolated from test rows leak into training and
# inflates every metric computed on x_test / y_test.
x_train, y_train = SMOTE(random_state=42).fit_resample(x_train, y_train)
# SMOTE appends its synthetic rows at the end, and Keras' `validation_split`
# takes the tail of the arrays, so shuffle to keep that validation slice mixed.
shuffled_order = np.random.default_rng(42).permutation(len(x_train))
x_train, y_train = x_train.iloc[shuffled_order], y_train.iloc[shuffled_order]

"""RF Model"""
# Random forest baseline.
# A fixed random_state makes the fitted forest, and therefore the score,
# predictions and importances below, repeatable across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
print(rf.score(x_train, y_train))  # score on the training data itself
rf_prediction = rf.predict(x_test)
# Label each importance with the columns of the frame the model was fitted on.
features_imp_rf = pd.DataFrame({'features': x_train.columns, 'rf_imp': rf.feature_importances_})


""" XG Boost"""
# Gradient-boosted trees, fitted on the flattened training labels.
xgb_model = XGBClassifier(tree_method='exact')
y_train_flat = y_train.values.ravel()
xgb_model.fit(x_train, y_train_flat)
print(xgb_model.score(x_train, y_train_flat))  # score on the training data itself
xgb_prediction = xgb_model.predict(x_test)
features_imp_xgb = pd.DataFrame({
    'features': balanced_features.columns,
    'xgb_imp': xgb_model.feature_importances_,
})


"""RNN Model"""
# Reshape the 2-D feature tables into the 3-D layout
# (samples, timesteps, features_per_timestep) fed to the LSTM layers.
timesteps = 1  # Adjust this if want to consider temporal aspects
features_per_timestep = x_train.shape[1]
rnn_array_shape = (-1, timesteps, features_per_timestep)  # -1: one entry per row
x_train_rnn = x_train.values.reshape(rnn_array_shape)
x_test_rnn = x_test.values.reshape(rnn_array_shape)

def create_rnn_model(input_dim, timesteps):
    """Build and compile a stacked bidirectional-LSTM binary classifier.

    Args:
        input_dim: Number of features per timestep.
        timesteps: Number of timesteps in each input sequence.

    Returns:
        A compiled Sequential model: two bidirectional LSTM layers (32 then 16
        units), a Dense(8) ReLU layer and a single sigmoid output unit, compiled
        with the 'adam' optimizer, binary cross-entropy loss and accuracy metric.
    """
    network = Sequential()
    # The first recurrent layer returns the full sequence so the second can consume it.
    network.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=(timesteps, input_dim)))
    network.add(Bidirectional(LSTM(16)))
    network.add(Dense(8, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Train the RNN, holding out 20% of the training rows for validation.
# NOTE(review): `history` is reassigned when the DNN is fitted further down this cell.
rnn_model = create_rnn_model(features_per_timestep, timesteps)
history = rnn_model.fit(
    x_train_rnn,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0,
)
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
rnn_probabilities = rnn_model.predict(x_test_rnn)
rnn_prediction = (rnn_probabilities > 0.5).astype(int)

def create_dnn_model(input_dim):
    """Build and compile a fully connected binary classifier.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled Sequential model with Dense layers of 64, 32, 16 and 8 ReLU
        units, each followed by batch normalisation (and by 30% dropout after
        the first three), ending in a single sigmoid unit. Compiled with Adam
        (learning rate 0.001), binary cross-entropy loss and accuracy metric.
    """
    network = Sequential()

    # Input block: declares the expected feature width.
    network.add(Dense(64, activation='relu', input_shape=(input_dim,)))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Progressively narrower hidden blocks with the same Dense/BN/Dropout pattern.
    for units in (32, 16):
        network.add(Dense(units, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

    # Last hidden block has no dropout before the output unit.
    network.add(Dense(8, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network
# Train the dense network with early stopping on the validation loss.
input_dim = x_train.shape[1]
dnn_model = create_dnn_model(input_dim)
# Stop after 10 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = dnn_model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
dnn_probabilities = dnn_model.predict(x_test)
dnn_predictions = (dnn_probabilities > 0.5).astype(int)


In [None]:
# Put both models' importances side by side with a single 'features' column.
# Both tables list the features in the same order, so the XGBoost copy of the
# name column is dropped instead of being repeated in the result.
features_imp = pd.concat(
    [features_imp_rf, features_imp_xgb.drop(columns='features')],
    axis=1,
)
features_imp

In [None]:
""" Model Comparison"""
def get_metrics(y_true, y_pred):
    """Score one set of predicted labels against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as y_true.

    Returns:
        dict with keys 'Accuracy', 'f1_score', 'Recall' and 'Precision', each
        holding the value returned by the matching scikit-learn metric function.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('f1_score', f1_score),
        ('Recall', recall_score),
        ('Precision', precision_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Test-set predictions per model; the Keras outputs are flattened from (n, 1) to 1-D.
models = {
    'Random Forest': rf_prediction,
    'XGBoost': xgb_prediction,
    'RNN': rnn_prediction.reshape(-1),
    'DNN': dnn_predictions.reshape(-1),
}

# Score every model against the same held-out labels.
metrics_by_model = {}
for model_name, predictions in models.items():
    metrics_by_model[model_name] = get_metrics(y_test, predictions)

# One row per model, one column per metric.
comparison = pd.DataFrame(metrics_by_model).transpose()

print('Model comparison:')
print(comparison)

# If you need confusion matrices:
#confusion_matrices = {name: confusion_matrix(y_test, pred) for name, pred in models.items()}
