In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score


In [16]:
loan_data = pd.read_csv('Task 3 and 4_Loan_Data.csv')
loan_data.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


customer_id → Unique ID for each borrower.

credit_lines_outstanding → Number of active credit lines.

loan_amt_outstanding → Total loan amount currently owed.

total_debt_outstanding → Total outstanding debt.

income → Annual income of the borrower.

years_employed → Years of employment.

fico_score → Borrower’s credit score.

default → Target variable (1 = Defaulted, 0 = Not Defaulted).

In [17]:
loan_data = loan_data.drop(columns=['customer_id'])
loan_data = loan_data.dropna()

In [18]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   credit_lines_outstanding  10000 non-null  int64  
 1   loan_amt_outstanding      10000 non-null  float64
 2   total_debt_outstanding    10000 non-null  float64
 3   income                    10000 non-null  float64
 4   years_employed            10000 non-null  int64  
 5   fico_score                10000 non-null  int64  
 6   default                   10000 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 547.0 KB


In [20]:
X = loan_data.drop(columns=['default'])
y = loan_data['default']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [21]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [55]:
intercept = log_reg.intercept_
coefficients = log_reg.coef_
print("Intercept:", intercept)
print("Coefficients:", coefficients)

Intercept: [-13.35054145]
Coefficients: [[ 8.93632029  0.17156274  3.73337695 -2.36836176 -2.89272893 -1.2082978 ]]


In [25]:
y_pred = log_reg.predict(X_test)

# Probability of default
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]


In [33]:
# Model performance metrics
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_prob)


import joblib
model_path = "/Users/minhnguyetnguyen/Documents/project/JP/logistic_regression_model.pkl"
scaler_path = "/Users/minhnguyetnguyen/Documents/project/JP/scaler.pkl"
joblib.dump(log_reg, model_path)
joblib.dump(scaler, scaler_path)

accuracy, auc

(0.996, np.float64(0.9999652110990509))

In [58]:
# Step 2: Additional Model Validation Metrics

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Calculate additional metrics
precision = precision_score(y_test, y_pred)  # Precision = TP / (TP + FP)
recall = recall_score(y_test, y_pred)  # Recall = TP / (TP + FN)
f1 = f1_score(y_test, y_pred)  # F1 Score = 2 * (Precision * Recall) / (Precision + Recall)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
{
    "Accuracy": round(accuracy, 4),
    "AUC": round(auc, 4),
    "Precision": round(precision, 4),
    "Recall": round(recall, 2),
    "F1 Score": round(f1, 2),
    "Confusion Matrix": conf_matrix.tolist() 
}


{'Accuracy': 0.996,
 'AUC': np.float64(1.0),
 'Precision': 0.9971,
 'Recall': 0.98,
 'F1 Score': 0.99,
 'Confusion Matrix': [[1651, 1], [7, 341]]}

In [35]:
# Calculate the Probability of Default
log_reg_model = joblib.load(model_path)
scaler_model = joblib.load(scaler_path)


def predict_probability_of_default(credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, 
                                   income, years_employed, fico_score):
    """
    Predicts the probability of default (PD) for a given borrower's loan details.

    Parameters:
    - credit_lines_outstanding (int): Number of active credit lines
    - loan_amt_outstanding (float): Total loan amount currently owed
    - total_debt_outstanding (float): Total outstanding debt
    - income (float): Annual income of the borrower
    - years_employed (int): Years of employment
    - fico_score (int): Borrower's credit score

    Returns:
    - PD (float): Probability of default (between 0 and 1)
    """
    input_data = pd.DataFrame([[credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, 
                                income, years_employed, fico_score]], 
                              columns=X.columns)
    # Scale
    input_scaled = scaler_model.transform(input_data)

    # Predict PD
    pd_prob = log_reg_model.predict_proba(input_scaled)[:, 1][0]

    return round(pd_prob, 4)

In [44]:
# test
sample_input = {
    "credit_lines_outstanding": 4,
    "loan_amt_outstanding": 4000,
    "total_debt_outstanding": 12000,
    "income": 60000,
    "years_employed": 8,
    "fico_score": 700
}
pd_value = predict_probability_of_default(**sample_input)

pd_value

np.float64(0.0071)

In [45]:
# Expected loss: Expected monetary loss due to default
def calculate_expected_loss(credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, 
                            income, years_employed, fico_score, recovery_rate=0.10):
    pd_prob = predict_probability_of_default(credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, 
                         income, years_employed, fico_score)
    # Calculate Loss Given Default (LGD)
    lgd = 1 - recovery_rate

    # Calculate Expected Loss
    expected_loss = pd_prob * loan_amt_outstanding * lgd
    
    return round(pd_prob, 4), round(expected_loss, 2)

In [47]:
# Borrower profile to test the function
example_borrower = {
    "credit_lines_outstanding": 4,
    "loan_amt_outstanding": 4000,
    "total_debt_outstanding": 120000,
    "income": 60000,
    "years_employed": 8,
    "fico_score": 700
}

# Predict PD and Expected Loss for the example borrower
pd_value, expected_loss_value = calculate_expected_loss(**example_borrower)

# Display result
pd_value, expected_loss_value

(np.float64(1.0), np.float64(3600.0))

In [51]:
# Random Forest


from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_model.predict(X_test)
y_pred_prob_rf = rf_model.predict_proba(X_test)[:, 1]  # Probability of default

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_pred_prob_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

# Save the Random Forest model for later use
rf_model_path = "/Users/minhnguyetnguyen/Documents/project/JP/random_forest_model.pkl"
joblib.dump(rf_model, rf_model_path)

# Display model performance for Random Forest
{
    "Accuracy": round(accuracy_rf, 4),
    "AUC (Area Under Curve)": round(auc_rf, 4),
    "Precision": round(precision_rf, 4),
    "Recall": round(recall_rf, 4),
    "F1 Score": round(f1_rf, 4),
    "Confusion Matrix": conf_matrix_rf.tolist() 
}




{'Accuracy': 0.9945,
 'AUC (Area Under Curve)': np.float64(0.9997),
 'Precision': 0.9913,
 'Recall': 0.977,
 'F1 Score': 0.9841,
 'Confusion Matrix': [[1649, 3], [8, 340]]}

In [None]:
sample_input = {
    "credit_lines_outstanding": 5,
    "loan_amt_outstanding": 50000,
    "total_debt_outstanding": 150000,
    "income": 70000,
    "years_employed": 10,
    "fico_score": 720
}

# Predict the probability of default and expected loss for the test case
pd_value, expected_loss_value = predict_expected_loss(**sample_input)

# Display results
pd_value, expected_loss_value