In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
import joblib


In [2]:
""" Standard Preprocessing
"""
CreditRisk =pd.read_csv('credit_risk_dataset.csv')

# Filter age and employment length
crData = CreditRisk[(CreditRisk['person_age'] <= 70) & (CreditRisk['person_emp_length'] < 47)].copy()

# Fill missing values and drop 'loan_grade' column
crData.loc[:, 'loan_int_rate'] = crData['loan_int_rate'].fillna(crData['loan_int_rate'].median())
crDataCopy = crData.drop('loan_grade', axis=1)

display(crDataCopy.shape)
crDataCopy.head()

(31671, 11)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2


## Categorical Features Treament

In [3]:
crdataCat_tmnt = crDataCopy.copy()
person_home_ownership = pd.get_dummies(crdataCat_tmnt['person_home_ownership'], drop_first=True).astype(int)
loan_intent = pd.get_dummies(crdataCat_tmnt['loan_intent'], drop_first=True).astype(int)

# Convert default_on_file to binary
crdataCat_tmnt['cb_person_default_on_file_binary'] = np.where(crdataCat_tmnt['cb_person_default_on_file'] == 'Y', 1, 0)

# Data scaling
numeric_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
                   'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(crdataCat_tmnt[numeric_columns]), 
                         columns=numeric_columns, index=crdataCat_tmnt.index)

# Combine scaled and categorical data
scaled_data_combined = pd.concat([scaled_df, person_home_ownership, loan_intent], axis=1)
scaled_data_combined['cb_person_default_on_file'] = crdataCat_tmnt['cb_person_default_on_file_binary']
scaled_data_combined['loan_status'] = crdataCat_tmnt['loan_status']

# Separate features and target
target = scaled_data_combined['loan_status']
features = scaled_data_combined.drop('loan_status', axis=1)

features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file
1,-1.090587,-1.078051,0.054432,-1.367192,0.034115,-0.655113,-0.939656,0,1,0,1,0,0,0,0,0
2,-0.441211,-1.078051,-0.938456,-0.65681,0.597575,3.767461,-0.692664,0,0,0,0,0,1,0,0,0
3,-0.765899,-0.018803,-0.19379,4.000141,1.366226,3.391072,-0.939656,0,0,1,0,0,1,0,0,0
4,-0.603555,-0.229137,0.799097,4.000141,1.053554,3.579267,-0.445671,0,0,1,0,0,1,0,0,1
5,-1.090587,-1.072366,-0.690234,-1.130398,-1.268682,0.756347,-0.939656,0,1,0,0,0,0,0,1,0


## SMOTE - Synthetic Minority Over-Sampling technique

In [4]:
smote= SMOTE()
balanced_features, balanced_target = smote.fit_resample(features, target)
print ("Shape of Balanced target:", balanced_target.shape)
print("Class distribution:")
print(pd.Series(balanced_target).value_counts())

Shape of Balanced target: (49692,)
Class distribution:
loan_status
0    24846
1    24846
Name: count, dtype: int64


## Traditional Models Training

In [8]:
"""LR Model"""
x_train, x_test, y_train, y_test = train_test_split (balanced_features, balanced_target, test_size=0.20, random_state=42)
logit = LogisticRegression()
logit.fit(x_train, y_train)
print(logit.score(x_train,y_train))
logit_prediction = logit.predict(x_test)
features_imp_logit = pd.DataFrame ({'features' :balanced_features.columns,'logit_imp' : logit.coef_[0]})


"""RF Model"""
rf= RandomForestClassifier ()
rf.fit (x_train, y_train)
print(rf.score(x_train, y_train))
rf_prediction = rf.predict(x_test)
features_imp_rf = pd.DataFrame ({'features' :balanced_features.columns,'rf_imp' : rf.feature_importances_})


""" XG Boost"""
xgb_model = XGBClassifier(tree_method = 'exact')
#model.fit(x,y.values.ravel())
xgb_model.fit(x_train,y_train.values.ravel())
print (xgb_model.score(x_train,y_train.values.ravel()))
xgb_prediction = xgb_model.predict(x_test)
features_imp_xgb = pd.DataFrame ({'features' :balanced_features.columns,'xgb_imp' : xgb_model.feature_importances_})

0.7824315146026715
1.0
0.9659648328428043


In [6]:
features_imp=pd.concat([features_imp_logit,features_imp_rf,features_imp_xgb],axis=1)
features_imp

Unnamed: 0,features,logit_imp,features.1,rf_imp,features.2,xgb_imp
0,person_age,-0.079732,person_age,0.061176,person_age,0.036762
1,person_income,0.039851,person_income,0.142826,person_income,0.038239
2,person_emp_length,-0.042514,person_emp_length,0.075281,person_emp_length,0.056565
3,loan_amnt,-0.659692,loan_amnt,0.077775,loan_amnt,0.01093
4,loan_int_rate,0.975564,loan_int_rate,0.207327,loan_int_rate,0.070343
5,loan_percent_income,1.38907,loan_percent_income,0.213671,loan_percent_income,0.137711
6,cb_person_cred_hist_length,0.037321,cb_person_cred_hist_length,0.06133,cb_person_cred_hist_length,0.067886
7,OTHER,-0.706059,OTHER,0.000452,OTHER,0.008805
8,OWN,-2.056663,OWN,0.020302,OWN,0.184026
9,RENT,0.479685,RENT,0.04959,RENT,0.104933


In [7]:
""" Model Comparison"""
def get_metrics(y_true, y_pred):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred)
    }

models = {
    'Logistic regression': logit_prediction,
    'Random Forest': rf_prediction,
    'XGBoost': xgb_prediction
}

comparison = pd.DataFrame({name: get_metrics(y_test, pred) for name, pred in models.items()}).T

print('Model comparison:')
print(comparison)

# If you need confusion matrices:
confusion_matrices = {name: confusion_matrix(y_test, pred) for name, pred in models.items()}

Model comparison:
                     Accuracy  f1_score    Recall  Precision
Logistic regression  0.782976  0.782363  0.784183   0.780552
Random Forest        0.937720  0.935487  0.907767   0.964954
XGBoost              0.947882  0.945668  0.911812   0.982135
