# Import Libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE


In [9]:
# Load the dataset
df = pd.read_csv("credit_risk_dataset.csv")  # Adjust path as needed
print(df.head())

   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y                           3  


In [10]:
# Fill missing values with median
df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)
df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)


In [11]:
# Encode string categories into integers
label_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
le = LabelEncoder()

for col in label_cols:
    df[col] = le.fit_transform(df[col])


# Define Features (X) and Target (y)

In [12]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']


# Handle Imbalanced Classes (SMOTE)

In [13]:
# Standardize features before applying SMOTE
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Balance the dataset using SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_scaled, y)


# Split into Train/Test Sets

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)


# Train and Evaluate Models 

In [15]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Train each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Evaluation
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")



Training Random Forest...
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      7659
           1       0.98      0.91      0.94      7625

    accuracy                           0.94     15284
   macro avg       0.95      0.94      0.94     15284
weighted avg       0.95      0.94      0.94     15284

ROC-AUC Score: 0.9871

Training Gradient Boosting...
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      7659
           1       0.94      0.86      0.90      7625

    accuracy                           0.90     15284
   macro avg       0.91      0.90      0.90     15284
weighted avg       0.91      0.90      0.90     15284

ROC-AUC Score: 0.9637

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      7659
           1       0.99      0.92      0.95      7625

    accuracy                           0.95     15284
   macro avg       0.96      0.95      0.95     15284
weighted avg       0.96      0.95      0.95     15284

ROC-AUC Score: 0.9857
