In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score,accuracy_score, confusion_matrix, classification_report, confusion_matrix

In [13]:
# Load data
data = pd.read_csv('loan_data.csv')
data.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


Use LogisticRegression model.

In [14]:
# Scaling numerical features
scaler = StandardScaler()
features = ['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding', 'income', 'years_employed', 'fico_score']

scaled_data = data.copy()
scaled_data[features] = scaler.fit_transform(scaled_data[features])

# Split the scaled data
X_scaled = scaled_data[features]
y_scaled = scaled_data['default']
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

In [15]:
scaled_data

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,-0.837960,0.747096,-0.724848,0.398555,0.285425,-0.536770,0
1,7442532,2.029409,-1.548375,-0.073967,-2.161876,-1.629325,-1.080832,1
2,2256073,-0.837960,-0.560509,-1.009696,-0.207919,-0.352825,-0.586230,0
3,4885975,-0.837960,0.427045,-0.938184,0.215083,0.285425,-0.421363,0
4,4700614,-0.264486,-1.979733,-1.048780,-2.321314,0.923675,-0.108115,0
...,...,...,...,...,...,...,...,...
9995,3972488,-0.837960,-0.792238,-0.930336,-1.362562,0.285425,0.980009,0
9996,6184073,-0.264486,-0.009454,-0.492053,0.494719,2.200176,-0.371903,0
9997,6694516,0.308988,-0.753840,-0.589396,-1.586712,0.285425,-0.685150,0
9998,3942961,-0.837960,-0.612649,-1.158293,-0.952136,-1.629325,0.155673,0


In [16]:
# Model training
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train_scaled, y_train_scaled)

# Predict probabilities
probabilities_lr = model1.predict_proba(X_test_scaled)[:, 1] 

# Evaluate model
print("AUC-ROC:", roc_auc_score(y_test_scaled, probabilities_lr))
print("Classification Report:\n", classification_report(y_test_scaled, model1.predict(X_test_scaled)))


AUC-ROC: 0.9999652110990509
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       1.00      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       1.00      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000



In [28]:
# Function to calculate expected loss
def calculate_expected_loss(loan_amount, pd, recovery_rate=0.1):
    return loan_amount * pd * (1 - recovery_rate)

# Example usage
sample_borrower_data = data.iloc[1731:1732][features]
loan_amount = data.iloc[1731]['loan_amt_outstanding']
sample_borrower_data = scaler.transform(sample_borrower_data)  # Scale features
pd_sample = model1.predict_proba(sample_borrower_data)[:, 1][0]  # Probability of default
print("Probability of Default for the sample borrower:", pd_sample)

expected_loss = calculate_expected_loss(loan_amount, pd_sample)
print("Expected Loss for the sample loan:", expected_loss)

Probability of Default for the sample borrower: 0.9999999574861171
Expected Loss for the sample loan: 4808.873689056099




In [27]:
data.iloc[1731]

customer_id                 3.816784e+06
credit_lines_outstanding    5.000000e+00
loan_amt_outstanding        5.343193e+03
total_debt_outstanding      3.061460e+04
income                      8.630395e+04
years_employed              4.000000e+00
fico_score                  6.070000e+02
default                     1.000000e+00
Name: 1731, dtype: float64

2.Use RandomForest model.

In [19]:
# Handling missing values
data.fillna(data.median(), inplace=True)

# Separate features and target
X = data.drop('default', axis=1)
y = data['default']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Initialize and train the model
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train, y_train)

# Predict probabilities
probabilities_rf = model2.predict_proba(X_test)[:, 1]

# Evaluate the model
auc_score = roc_auc_score(y_test, probabilities_rf)
print(f'AUC Score: {auc_score}')

AUC Score: 0.9996929879491248


In [25]:
def calculate_expected_loss(features, model, recovery_rate=0.1):
    probability_of_default = model.predict_proba([features])[0, 1]
    loan_amount = features['loan_amt_outstanding']
    expected_loss = loan_amount * (1 - recovery_rate) * probability_of_default
    return expected_loss

# Example use. Here, the borrower data is randomly picked from data
new_borrower_features = X_test.iloc[2]
expected_loss = calculate_expected_loss(new_borrower_features, model2)
print(f'Expected Loss: {expected_loss}')

Expected Loss: 4808.8738935




In [24]:
print(new_borrower_features)

customer_id                 3.816784e+06
credit_lines_outstanding    5.000000e+00
loan_amt_outstanding        5.343193e+03
total_debt_outstanding      3.061460e+04
income                      8.630395e+04
years_employed              4.000000e+00
fico_score                  6.070000e+02
Name: 1731, dtype: float64


In [29]:
# Logistic Regression Model
predictions_lr = model1.predict(X_test_scaled)

# RandomForest Model
predictions_rf = model2.predict(X_test)

# Evaluate models
print("Logistic Regression Metrics:")
print("Accuracy:", accuracy_score(y_test, predictions_lr))
print("AUC-ROC:", roc_auc_score(y_test, probabilities_lr))
print("Classification Report:\n", classification_report(y_test, predictions_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions_lr))

print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_score(y_test, predictions_rf))
print("AUC-ROC:", roc_auc_score(y_test, probabilities_rf))
print("Classification Report:\n", classification_report(y_test, predictions_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions_rf))


Logistic Regression Metrics:
Accuracy: 0.996
AUC-ROC: 0.9999652110990509
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       1.00      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       1.00      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
 [[1651    1]
 [   7  341]]

Random Forest Metrics:
Accuracy: 0.9955
AUC-ROC: 0.9996929879491248
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       0.99      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
 [[1650    2]
 [   7  341]]


Overall Performance: Both models are performing well. Given the high AUC-ROC scores, both models are excellent at classifying and separating the non-defaults from defaults.

Error Types: Both models have very few errors, but Logistic Regression has one fewer false positive and the same number of false negatives as Random Forest. This might make Logistic Regression slightly more appealing if minimizing false positives is critical.