In [None]:
# PD modeling and Expected Loss estimation pipeline

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss, brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score , average_precision_score , log_loss ,brier_score_loss




In [None]:
# Baseline PD model: logistic regression on the loan dataset.

# 1) Load the data
df = pd.read_csv('Loan_Data.csv')
target = 'default'

# 2) Select features and target.
# The first column is skipped (presumably an identifier -- TODO confirm) and the
# target is excluded by name rather than by position, so it cannot leak into the
# features if the column order of the CSV ever changes.
numeric_columns = df.columns[1:].drop(target).values
x = df[numeric_columns].copy()
y = df[target].astype(int).values

# 3) Stratified 80/20 split so both parts keep the same default rate
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# 4) Preprocessing for numeric features
# SimpleImputer(strategy='median'): fills missing numeric values with the median
# StandardScaler(): zero-mean, unit-variance scaling, helpful for logistic regression
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 5) ColumnTransformer applies numeric_transformer to the selected feature columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns)
    ]
)

# 6) Logistic regression model
# class_weight='balanced' helps when default=1 is rare.
# NOTE(review): balanced weights re-weight the classes during fitting, so the
# predicted probabilities are shifted away from the observed default rate; this
# affects the log loss and Brier score below -- consider calibrating.
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')

# 7) Full pipeline: preprocessing followed by the classifier
log_reg_pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', log_reg)
])

# 8) Train on the training split
log_reg_pipeline.fit(x_train, y_train)

# 9) Predicted probability of the positive class (default=1) on the held-out split
val_proba = log_reg_pipeline.predict_proba(x_test)[:, 1]

# 10) Evaluate ranking quality (ROC/PR AUC) and probability quality (log loss, Brier)
roc = roc_auc_score(y_test, val_proba)
pr = average_precision_score(y_test, val_proba)
ll = log_loss(y_test, val_proba)
br = brier_score_loss(y_test, val_proba)

print(f"Validation ROC AUC: {roc:.3f}")
print(f"Validation PR AUC: {pr:.3f}")
print(f"Validation Log Loss: {ll:.4f}")
print(f"Validation Brier Score: {br:.4f}")

# Last expression of the cell: show the column names of the loaded frame
df.columns

In [None]:
# Decision tree model

# Regularized tree: depth capped at 6 and at least 50 samples per leaf to curb
# overfitting; balanced class weights; fixed seed for reproducibility.
tree_clf = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    min_samples_leaf=50,
    max_depth=6,
)

# Same preprocessor object as the logistic-regression pipeline, followed by the tree
tree_pipeline = Pipeline([('prep', preprocessor), ('model', tree_clf)])

# Fit on the training split, then take the positive-class (default=1) probability
# for every row of the held-out split
val_proba_tree = tree_pipeline.fit(x_train, y_train).predict_proba(x_test)[:, 1]

# Held-out metrics, computed in the same order as for the logistic regression
roc_t, pr_t, ll_t, br_t = (
    roc_auc_score(y_test, val_proba_tree),
    average_precision_score(y_test, val_proba_tree),
    log_loss(y_test, val_proba_tree),
    brier_score_loss(y_test, val_proba_tree),
)

print(
    f"Tree - Validation ROC AUC: {roc_t:.3f}",
    f"Tree - Validation PR AUC: {pr_t:.3f}",
    f"Tree - Validation Log Loss: {ll_t:.4f}",
    f"Tree - Validation Brier Score: {br_t:.4f}",
    sep="\n",
)


In [None]:
# Defining the final expected-loss function


def compute_expected_loss(csv_path='Loan_Data.csv', recovery_rate=0.10):
    """
    Train a baseline PD model and return per-row PD and Expected Loss.

    A logistic regression (median imputation + standard scaling) is fitted on a
    stratified 80% split of the rows and then used to score every row, so the
    PDs of the training rows are in-sample estimates.

        EL = PD * (1 - recovery_rate) * loan_amt_outstanding

    Parameters
    ----------
    csv_path : str
        CSV file to load. Column names are stripped and lower-cased after
        loading. The file must contain the columns `default` (target) and
        `loan_amt_outstanding` (exposure). The first column is not used as a
        feature; every other column except the target is.
    recovery_rate : float
        Fraction of the outstanding amount recovered on default, in [0, 1].

    Returns
    -------
    pandas.DataFrame
        Columns `PD` and `Expected_Loss` followed by the loaded columns
        (minus any named `pd` / `expected_loss`), one row per input row that
        has a non-missing target.

    Raises
    ------
    ValueError
        If `recovery_rate` is not a number within [0, 1].
    KeyError
        If a required column is missing from the file.
    """
    # Validate first so a bad rate fails before any file I/O or model fitting
    # instead of silently producing negative or inflated losses.
    rate = float(recovery_rate)
    if not 0.0 <= rate <= 1.0:
        raise ValueError(f"recovery_rate must be within [0, 1], got {recovery_rate!r}")
    lgd = 1.0 - rate

    # 1) Load and normalize column names
    df = pd.read_csv(csv_path)
    df.columns = [c.strip().lower() for c in df.columns]

    target = 'default'
    exposure_col = 'loan_amt_outstanding'
    missing = [c for c in (target, exposure_col) if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {csv_path!r}: {missing}")

    # 2) Features: skip the first column (presumably an identifier -- TODO
    # confirm) and exclude the target by name, so it cannot leak into the
    # features when it is not the last column of the file.
    feature_cols = [c for c in df.columns[1:] if c != target]

    # Keep only rows with a known target
    df = df.dropna(subset=[target])

    X = df[feature_cols].copy()
    y = df[target].astype(int).to_numpy()

    # 3) Preprocess + baseline logistic regression
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, feature_cols)]
    )

    # NOTE(review): class_weight='balanced' re-weights the classes during
    # fitting, so predicted probabilities are shifted away from the observed
    # default rate. Kept to preserve existing outputs; consider calibrating
    # the model before relying on PD in absolute terms.
    log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')

    pipe = Pipeline(steps=[
        ('prep', preprocessor),
        ('model', log_reg)
    ])

    # 4) Fit on the training part of a stratified 80/20 split; the held-out
    # part is not needed here because every row is scored below.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    pipe.fit(X_train, y_train)

    # 5) Predict PD (probability of class 1) for every row
    pd_est = pipe.predict_proba(X)[:, 1]

    # 6) Expected Loss; a missing exposure is treated as zero exposure.
    # Plain arrays are used so the result is positional and does not depend
    # on index alignment after the dropna above.
    ead = df[exposure_col].astype(float).fillna(0.0).to_numpy()
    el = pd_est * lgd * ead

    # 7) Return results alongside the loaded columns. Columns named
    # 'pd' / 'expected_loss' in the input are left out of the output
    # (presumably stale results from an earlier export -- verify).
    out = df.copy()
    out['PD'] = pd_est
    out['Expected_Loss'] = el
    passthrough = [c for c in df.columns if c not in ('pd', 'expected_loss')]
    return out[['PD', 'Expected_Loss'] + passthrough]

# Example: score the default dataset with a 10% recovery rate, keep the result
# for later cells, and preview the first rows instead of rendering the whole frame.
results = compute_expected_loss('Loan_Data.csv', recovery_rate=0.10)
results.head()