In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
import joblib


In [2]:
""" Standard Preprocessing
"""
CreditRisk =pd.read_csv('credit_risk_dataset.csv')

# Filter age and employment length
crData = CreditRisk[(CreditRisk['person_age'] <= 70) & (CreditRisk['person_emp_length'] < 47)].copy()

# Fill missing values and drop 'loan_grade' column
crData.loc[:, 'loan_int_rate'] = crData['loan_int_rate'].fillna(crData['loan_int_rate'].median())
crDataCopy = crData.drop('loan_grade', axis=1)

display(crDataCopy.shape)
crDataCopy.head()

(31671, 11)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2


## Categorical Features Treament

In [3]:
crdataCat_tmnt = crDataCopy.copy()
person_home_ownership = pd.get_dummies(crdataCat_tmnt['person_home_ownership'], drop_first=True).astype(int)
loan_intent = pd.get_dummies(crdataCat_tmnt['loan_intent'], drop_first=True).astype(int)

# Convert default_on_file to binary
crdataCat_tmnt['cb_person_default_on_file_binary'] = np.where(crdataCat_tmnt['cb_person_default_on_file'] == 'Y', 1, 0)

# Data scaling
numeric_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
                   'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(crdataCat_tmnt[numeric_columns]), 
                         columns=numeric_columns, index=crdataCat_tmnt.index)

# Combine scaled and categorical data
scaled_data_combined = pd.concat([scaled_df, person_home_ownership, loan_intent], axis=1)
scaled_data_combined['cb_person_default_on_file'] = crdataCat_tmnt['cb_person_default_on_file_binary']
scaled_data_combined['loan_status'] = crdataCat_tmnt['loan_status']

# Separate features and target
target = scaled_data_combined['loan_status']
features = scaled_data_combined.drop('loan_status', axis=1)

features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file
1,-1.090587,-1.078051,0.054432,-1.367192,0.034115,-0.655113,-0.939656,0,1,0,1,0,0,0,0,0
2,-0.441211,-1.078051,-0.938456,-0.65681,0.597575,3.767461,-0.692664,0,0,0,0,0,1,0,0,0
3,-0.765899,-0.018803,-0.19379,4.000141,1.366226,3.391072,-0.939656,0,0,1,0,0,1,0,0,0
4,-0.603555,-0.229137,0.799097,4.000141,1.053554,3.579267,-0.445671,0,0,1,0,0,1,0,0,1
5,-1.090587,-1.072366,-0.690234,-1.130398,-1.268682,0.756347,-0.939656,0,1,0,0,0,0,0,1,0


## SMOTE - Synthetic Minority Over-Sampling technique

In [4]:
smote= SMOTE()
balanced_features, balanced_target = smote.fit_resample(features, target)
print ("Shape of Balanced target:", balanced_target.shape)
print("Class distribution:")
print(pd.Series(balanced_target).value_counts())

Shape of Balanced target: (49692,)
Class distribution:
loan_status
0    24846
1    24846
Name: count, dtype: int64


## Traditional Models Training

In [17]:
"""LR Model"""

x_train, x_test, y_train, y_test = train_test_split (balanced_features, balanced_target, test_size=0.20, random_state=42)
logit = LogisticRegression()
logit.fit(x_train, y_train)
print(logit.score(x_train,y_train))
logit_prediction = logit.predict(x_test)
print(classification_report(y_test, logit_prediction))

features_imp_logit = pd.DataFrame ({'features' :balanced_features.columns,'logit_imp' : logit.coef_[0]})

0.7817774759137676
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      4995
           1       0.78      0.78      0.78      4944

    accuracy                           0.78      9939
   macro avg       0.78      0.78      0.78      9939
weighted avg       0.78      0.78      0.78      9939



In [22]:
"""RF Model"""
rf= RandomForestClassifier ()
rf.fit (x_train, y_train)
print(rf.score(x_train, y_train))
rf_prediction = rf.predict(x_test)
print(classification_report(y_test,rf_prediction))

features_imp_rf = pd.DataFrame ({'features' :balanced_features.columns,'rf_imp' : rf.feature_importances_})

1.0
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      4995
           1       0.97      0.91      0.93      4944

    accuracy                           0.94      9939
   macro avg       0.94      0.94      0.94      9939
weighted avg       0.94      0.94      0.94      9939



In [25]:
""" XG Boost"""

xgb_model = XGBClassifier(tree_method = 'exact')
#model.fit(x,y.values.ravel())
xgb_model.fit(x_train,y_train.values.ravel())
print (xgb_model.score(x_train,y_train.values.ravel()))
xgb_prediction = xgb_model.predict(x_test)
print (classification_report(y_test,xgb_prediction))

features_imp_xgb = pd.DataFrame ({'features' :balanced_features.columns,'xgb_imp' : xgb_model.feature_importances_})

0.9652604834855231
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      4995
           1       0.98      0.91      0.95      4944

    accuracy                           0.95      9939
   macro avg       0.95      0.95      0.95      9939
weighted avg       0.95      0.95      0.95      9939



In [26]:
features_imp=pd.concat([features_imp_logit,features_imp_rf,features_imp_xgb],axis=1)
features_imp

Unnamed: 0,features,logit_imp,features.1,rf_imp,features.2,xgb_imp
0,person_age,-0.080358,person_age,0.06229,person_age,0.037575
1,person_income,0.057227,person_income,0.146226,person_income,0.038054
2,person_emp_length,-0.047475,person_emp_length,0.075343,person_emp_length,0.053056
3,loan_amnt,-0.674594,loan_amnt,0.078558,loan_amnt,0.008737
4,loan_int_rate,0.986838,loan_int_rate,0.207818,loan_int_rate,0.073644
5,loan_percent_income,1.391351,loan_percent_income,0.207222,loan_percent_income,0.12625
6,cb_person_cred_hist_length,0.028736,cb_person_cred_hist_length,0.063035,cb_person_cred_hist_length,0.068502
7,OTHER,-0.57889,OTHER,0.000492,OTHER,0.00759
8,OWN,-2.110486,OWN,0.020514,OWN,0.180782
9,RENT,0.454747,RENT,0.04728,RENT,0.11117


In [27]:
""" Model Comparison"""

logit_accuracy = accuracy_score(y_test, logit_prediction)
logit_fscore = f1_score(y_test, logit_prediction)
logit_recall = recall_score(y_test, logit_prediction)
logit_precision = precision_score(y_test, logit_prediction)
logit_confusion = confusion_matrix(y_test, logit_prediction)


rf_accuracy = accuracy_score(y_test, rf_prediction)
rf_fscore = f1_score(y_test, rf_prediction)
rf_recall = recall_score(y_test, rf_prediction)
rf_precision = precision_score(y_test, rf_prediction)
rf_confusion = confusion_matrix(y_test, rf_prediction)

xgb_model_accuracy = accuracy_score(y_test, xgb_prediction)
xgb_model_fscore = f1_score(y_test, xgb_prediction)
xgb_model_recall = recall_score(y_test, xgb_prediction)
xgb_model_precision = precision_score(y_test, xgb_prediction)
xgb_model_confusion = confusion_matrix(y_test, xgb_prediction)

data_comparison = [[logit_accuracy,logit_fscore,logit_recall,logit_precision],
                  [rf_accuracy,rf_fscore,rf_recall,rf_precision],
                  [xgb_model_accuracy,xgb_model_fscore,xgb_model_recall,xgb_model_precision]]

comparison = pd.DataFrame(data_comparison,index= ['Logistic regression','Random Forest','XGboost'],
                                 columns=['Accuracy','f1_score','Recall','Precision'])


print ('Model comparison, lr except ')
comparison

Model comparison, lr except 


Unnamed: 0,Accuracy,f1_score,Recall,Precision
Logistic regression,0.779757,0.778553,0.778317,0.77879
Random Forest,0.937116,0.93478,0.905947,0.96551
XGboost,0.947983,0.945733,0.911206,0.982981
