In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
data = pd.read_csv('Task 3 and 4_Loan_Data.csv')

# Strip stray whitespace from the header so the column lookups below match
data.columns = data.columns.str.strip()

# Define features and target variable based on the data file
features = ['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding',
            'income', 'years_employed', 'fico_score']
target = 'default'

# Check that every feature AND the target column are present in the DataFrame.
# The target is part of the check so that a missing label column is reported
# here instead of surfacing as a KeyError when data[target] is indexed below.
# (The list keeps its original name; it may therefore also contain the target.)
missing_features = [column for column in features + [target] if column not in data.columns]
if missing_features:
    print(f"Missing features: {missing_features}")
else:
    # Split data into training and testing sets (80/20, fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        data[features], data[target], test_size=0.2, random_state=42)

    # Standardize features: the scaler is fitted on the training split only and
    # then reused on the test split, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train a Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Predict probabilities on the test set. predict_proba returns one column per
    # entry of model.classes_; column 1 is the second class (label 1 when the
    # target is coded 0/1, as the cell output below suggests).
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate expected loss function
    def expected_loss(probability_of_default, loan_amount, recovery_rate=0.1):
        """Return the expected loss on a loan.

        The result is ``loan_amount * (1 - recovery_rate) * probability_of_default``.
        Only arithmetic operators are used, so the arguments may be plain
        numbers or any objects that support ``*`` and ``-`` with numbers.

        Args:
            probability_of_default: Probability that the borrower defaults.
            loan_amount: Outstanding amount exposed to loss.
            recovery_rate: Fraction of the amount recovered after a default;
                defaults to 0.1.

        Returns:
            The expected loss, in the same units as ``loan_amount``.
        """
        # Fraction of the exposure that is lost if the borrower defaults.
        loss_fraction = 1 - recovery_rate
        # Same multiplication order as the one-line formula above, so
        # floating-point results are unchanged.
        loss_if_default = loan_amount * loss_fraction
        return loss_if_default * probability_of_default

    # Example usage: Calculate expected loss for each loan in the test set
    test_data_with_predictions = X_test.copy()
    test_data_with_predictions['Probability_of_Default'] = y_pred_proba
    # expected_loss is plain elementwise arithmetic, so it is applied to whole
    # columns at once rather than row by row through DataFrame.apply; the
    # resulting values are the same.
    test_data_with_predictions['Expected_Loss'] = expected_loss(
        test_data_with_predictions['Probability_of_Default'],
        test_data_with_predictions['loan_amt_outstanding'])

    print(test_data_with_predictions[['Probability_of_Default', 'Expected_Loss']].head())

    # Evaluate model performance. The class predictions are computed once and
    # shared by all three metrics instead of re-running the forest per metric.
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)

      Probability_of_Default  Expected_Loss
6252                    0.00       0.000000
4684                    0.15     785.835878
1731                    1.00    4808.873894
4742                    0.00       0.000000
4521                    0.00       0.000000
Accuracy: 0.9945
Confusion Matrix:
[[1649    3]
 [   8  340]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       0.99      0.98      0.98       348

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

