In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from typing import Dict, Any, Union

class LoanDefaultPredictor:
    """
    A machine learning pipeline for predicting loan defaults and calculating expected loss.

    This class handles data loading, preprocessing, model training (with either Logistic Regression
    or Random Forest), prediction, and expected loss calculation.

    Intended call order: construct the object, call preprocess_data(), call train_model(),
    then use predict_default_probability() / calculate_expected_loss().

    Attributes:
        data (pd.DataFrame): The loaded loan data
        model (Pipeline): The trained scikit-learn pipeline (None until train_model() succeeds)
        feature_columns (list): List of feature columns used for modeling
        X_train (pd.DataFrame): Training features (None until preprocess_data() is called)
        X_test (pd.DataFrame): Test features (None until preprocess_data() is called)
        y_train (pd.Series): Training target (None until preprocess_data() is called)
        y_test (pd.Series): Test target (None until preprocess_data() is called)
    """

    def __init__(self, data_path: str) -> None:
        """
        Initialize the LoanDefaultPredictor with data from the given path.

        Args:
            data_path (str): Path to the CSV file containing loan data
        """
        self.data = pd.read_csv(data_path)
        self.model = None
        self.feature_columns = ['credit_lines_outstanding', 'loan_amt_outstanding',
                              'total_debt_outstanding', 'income', 'years_employed', 'fico_score']
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def preprocess_data(self) -> None:
        """
        Preprocess the data by splitting into features and target, then creating train/test splits.

        The features are the columns named in feature_columns and the target is the
        'default' column. The split holds out 20% of the rows for testing, uses a fixed
        random_state, and is stratified on the target variable to maintain class distribution.
        Sets attributes X_train, X_test, y_train, y_test.
        """
        X = self.data[self.feature_columns]
        y = self.data['default']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=123, stratify=y)

    def train_model(self, model_type: str = 'random_forest') -> None:
        """
        Train either a Logistic Regression or Random Forest model.

        The classifier is placed behind a StandardScaler in a scikit-learn Pipeline, fitted on
        the training split, stored in self.model, and then evaluated on the test split
        (classification report and ROC AUC are printed).

        Args:
            model_type (str): Type of model to train ('logistic_regression' or 'random_forest')

        Raises:
            ValueError: If an invalid model_type is provided, or if preprocess_data()
                has not been called yet
        """
        if model_type not in ('logistic_regression', 'random_forest'):
            raise ValueError("model_type must be either 'logistic_regression' or 'random_forest'")

        # Fail early with an actionable message instead of letting scikit-learn
        # choke on None inputs when the train/test split has not been created.
        splits = (self.X_train, self.X_test, self.y_train, self.y_test)
        if any(part is None for part in splits):
            raise ValueError("Data not preprocessed. Call preprocess_data() first.")

        if model_type == 'logistic_regression':
            classifier = LogisticRegression(random_state=123)
        else:  # random forest
            classifier = RandomForestClassifier(random_state=123)

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', classifier)
        ])

        # Only publish the pipeline on self.model once fitting has succeeded.
        pipeline.fit(self.X_train, self.y_train)
        self.model = pipeline

        # Evaluate the model
        y_pred = self.model.predict(self.X_test)
        y_pred_proba = self.model.predict_proba(self.X_test)[:, 1]

        print("Model Evaluation Metrics:")
        print(classification_report(self.y_test, y_pred))
        print(f"ROC AUC Score: {roc_auc_score(self.y_test, y_pred_proba):.4f}")

    def predict_default_probability(self, loan_features: Dict[str, Union[int, float]]) -> float:
        """
        Predict the probability of default for a given loan.

        Args:
            loan_features (dict): Dictionary containing loan features; every name in
                feature_columns must be present as a key (extra keys are ignored)

        Returns:
            float: Probability of default between 0 and 1

        Raises:
            ValueError: If model hasn't been trained or features are missing
        """
        # Explicit None check: truthiness of a fitted model object is not a
        # reliable "has been trained" signal.
        if self.model is None:
            raise ValueError("Model not trained. Call train_model() first.")

        # Validate the input up front so that only genuinely missing features are
        # reported as such, and all of them are listed in a single message.
        missing = [col for col in self.feature_columns if col not in loan_features]
        if missing:
            raise ValueError(f"Missing required feature: {', '.join(missing)}")

        # Single-row frame with columns in the same order used for training.
        features_df = pd.DataFrame([loan_features])[self.feature_columns]

        # Column 1 of predict_proba is read as the probability of default, matching
        # its use in train_model; convert the NumPy scalar to a built-in float to
        # honour the declared return type.
        return float(self.model.predict_proba(features_df)[0, 1])

    def calculate_expected_loss(self, loan_features: Dict[str, Union[int, float]],
                              recovery_rate: float = 0.1) -> Dict[str, float]:
        """
        Calculate expected loss for a loan using Probability of Default (PD),
        Exposure at Default (EAD), and Loss Given Default (LGD).

        Formula: Expected Loss = PD × EAD × LGD

        EAD is taken from the 'loan_amt_outstanding' feature and LGD is 1 - recovery_rate.

        Args:
            loan_features (dict): Dictionary containing loan features
            recovery_rate (float): Fraction of the loan recoverable after default (0 to 1)

        Returns:
            dict: Dictionary containing:
                - Probability of Default (PD)
                - Exposure at Default (EAD)
                - Loss Given Default (LGD)
                - Expected Loss

        Raises:
            ValueError: If recovery_rate is not between 0 and 1, if the model hasn't
                been trained, or if required features are missing
        """
        if not 0 <= recovery_rate <= 1:
            raise ValueError("recovery_rate must be between 0 and 1")

        # The prediction call validates that all features (including the EAD
        # column read below) are present.
        pd_prob = self.predict_default_probability(loan_features)
        ead = loan_features['loan_amt_outstanding']
        lgd = 1 - recovery_rate
        expected_loss = pd_prob * ead * lgd

        return {
            'Probability of Default': pd_prob,
            'Exposure at Default (EAD)': ead,
            'Loss Given Default (LGD)': lgd,
            'Expected Loss': expected_loss
        }


if __name__ == "__main__":
    # Load the data set, create the train/test split, and fit the default
    # model type (random forest); train_model also prints evaluation metrics.
    predictor = LoanDefaultPredictor('loan_data.csv')
    predictor.preprocess_data()
    predictor.train_model(model_type='random_forest')

    # One illustrative loan, keyed by the feature names the predictor expects.
    example_loan = dict(
        credit_lines_outstanding=2,
        loan_amt_outstanding=5000,
        total_debt_outstanding=15000,
        income=60000,
        years_employed=3,
        fico_score=650,
    )

    # Compute the expected loss and print each component to four decimals,
    # one "name: value" pair per line under a header.
    result = predictor.calculate_expected_loss(example_loan)
    print(
        "\nExpected Loss Calculation:",
        *(f"{label}: {amount:.4f}" for label, amount in result.items()),
        sep="\n",
    )

Model Evaluation Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1630
           1       0.99      0.99      0.99       370

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

ROC AUC Score: 0.9998

Expected Loss Calculation:
Probability of Default: 0.2200
Exposure at Default (EAD): 5000.0000
Loss Given Default (LGD): 0.9000
Expected Loss: 990.0000
