In [1]:
import os

import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Define number of samples
num_samples = 1000

# Generate features
credit_scores = np.random.randint(300, 850, num_samples)
incomes = np.random.randint(20000, 150000, num_samples)
loan_amounts = np.random.randint(1000, 50000, num_samples)
loan_income_ratios = loan_amounts / incomes

# Generate labels (0 for non-default, 1 for default)
# For simplicity, let's assume a simple rule: if loan_income_ratio > 0.3, default (1), else non-default (0)
labels = (loan_income_ratios > 0.3).astype(int)

# Introduce noise to labels to simulate uncertainty
noise = np.random.binomial(1, 0.1, num_samples)  # 10% noise
noisy_labels = np.abs(labels - noise)  # Flip labels with noise

# Create DataFrame for in-distribution data
data = pd.DataFrame({
    'credit_score': credit_scores,
    'income': incomes,
    'loan_income_ratio': loan_income_ratios,
    'loan_status': noisy_labels
})

# Generate out-of-distribution data
ood_num_samples = 200
ood_credit_scores = np.random.randint(100, 300, ood_num_samples)  # Unusually low credit scores
ood_incomes = np.random.randint(5000, 20000, ood_num_samples)  # Unusually low incomes
ood_loan_amounts = np.random.randint(50000, 100000, ood_num_samples)  # Unusually high loan amounts
ood_loan_income_ratios = ood_loan_amounts / ood_incomes

# Generate labels for OOD data (randomly for testing purposes)
ood_labels = np.random.randint(0, 2, ood_num_samples)

# Create DataFrame for OOD data
ood_data = pd.DataFrame({
    'credit_score': ood_credit_scores,
    'income': ood_incomes,
    'loan_income_ratio': ood_loan_income_ratios,
    'loan_status': ood_labels
})

# Save in-distribution and OOD data to CSV
data.to_csv('data/dummy_p2p_lending_dataset.csv', index=False)
ood_data.to_csv('data/dummy_p2p_lending_ood_dataset.csv', index=False)

print("In-distribution dataset created and saved to 'dummy_p2p_lending_dataset.csv'")
print("Out-of-distribution dataset created and saved to 'dummy_p2p_lending_ood_dataset.csv'")



In-distribution dataset created and saved to 'dummy_p2p_lending_dataset.csv'
Out-of-distribution dataset created and saved to 'dummy_p2p_lending_ood_dataset.csv'


In [2]:
# set cwd to ..
import os
os.chdir("..")
print(os.getcwd())

/Users/sondresorbye/Desktop/Pre-project-transformer


In [7]:


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from p2p_lending.utils.dataset import (
    create_dataset_with_embeddings,
    normalize,
    oversample_minority_class,
    split_data,
)

from p2p_lending.models.deep_feed_forward_model import DeepFeedForwardModel

# Load datasets
data = pd.read_csv('p2p_lending/data/dummy_p2p_lending_dataset.csv')
ood_data = pd.read_csv('p2p_lending/data/dummy_p2p_lending_ood_dataset.csv')

data, ood_data, _ = normalize(data, ood_data, ood_data, ['credit_score', 'income', 'loan_income_ratio'])

# Preprocess data
X = data[['credit_score', 'income', 'loan_income_ratio']].values
y = data['loan_status'].values
X_ood = ood_data[['credit_score', 'income', 'loan_income_ratio']].values
y_ood = ood_data['loan_status'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)
X_ood = torch.tensor(X_ood, dtype=torch.float32)
y_ood = torch.tensor(y_ood, dtype=torch.float32).unsqueeze(1)

# Define neural network model with MC Dropout
class MCDropoutModel(nn.Module):
    def __init__(self):
        super(MCDropoutModel, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(64, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        x = torch.sigmoid(x)
        return x

model = DeepFeedForwardModel(X_train.shape[1], 1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 50
batch_size = 32
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)

model.train()
for epoch in range(num_epochs):
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Function to get MC Dropout predictions
def mc_dropout_predictions(model, X, n_samples=100):
    model.train()  # Ensure dropout is enabled
    predictions = np.array([model(X).detach().numpy() for _ in range(n_samples)])
    return predictions

# Evaluate model on in-distribution data
model.eval()
with torch.no_grad():
    y_pred = model(X_test).numpy().flatten()
auc = roc_auc_score(y_test.numpy(), y_pred)
print(f"In-distribution AUC: {auc:.4f}")

# Evaluate model on OOD data
with torch.no_grad():
    y_pred_ood = model(X_ood).numpy().flatten()
auc_ood = roc_auc_score(y_ood.numpy(), y_pred_ood)
print(f"OOD AUC: {auc_ood:.4f}")

# Calculate uncertainty using MC Dropout
n_samples = 100
mc_preds = mc_dropout_predictions(model, X_test, n_samples)
mc_preds_ood = mc_dropout_predictions(model, X_ood, n_samples)

# Calculate epistemic uncertainty (variance of predictions)
epistemic_uncertainty = np.var(mc_preds, axis=0)
epistemic_uncertainty_ood = np.var(mc_preds_ood, axis=0)

# Convert pred to binary
#y_pred = (y_pred > 0.5).astype(int)
#y_pred_ood = (y_pred_ood > 0.5).astype(int)

# Calculate error
errors = np.abs(y_test.squeeze().numpy() - y_pred)
errors_ood = np.abs(y_ood.squeeze().numpy() - y_pred_ood)

epistemic_uncertainty = epistemic_uncertainty.ravel()
epistemic_uncertainty_ood = epistemic_uncertainty_ood.ravel()

print("Average epistemic uncertainty (in-distribution):", np.mean(epistemic_uncertainty))
print("Average epistemic uncertainty (OOD):", np.mean(epistemic_uncertainty_ood))

# Calculate correlation between uncertainty and error
epistemic_error_correlation = np.corrcoef(epistemic_uncertainty, errors)[0, 1]
epistemic_error_correlation_ood = np.corrcoef(epistemic_uncertainty_ood, errors_ood)[0, 1]

print(f"Epistemic error correlation (in-distribution): {epistemic_error_correlation:.4f}")
print(f"Epistemic error correlation (OOD): {epistemic_error_correlation_ood:.4f}")



Epoch [10/50], Loss: 0.2282
Epoch [20/50], Loss: 0.1438
Epoch [30/50], Loss: 0.1593
Epoch [40/50], Loss: 0.1781
Epoch [50/50], Loss: 0.1432
In-distribution AUC: 0.8608
OOD AUC: 0.4977
Average epistemic uncertainty (in-distribution): 0.016440991
Average epistemic uncertainty (OOD): 1.0582935
Epistemic error correlation (in-distribution): 0.0631
Epistemic error correlation (OOD): 0.7320


# Results

## Without sigmoid

Epoch [10/50], Loss: 0.2237
Epoch [20/50], Loss: 0.1686
Epoch [30/50], Loss: 0.1686
Epoch [40/50], Loss: 0.1707
Epoch [50/50], Loss: 0.0411
In-distribution AUC: 0.8678
OOD AUC: 0.4972
Average epistemic uncertainty (in-distribution): 0.01743747
Average epistemic uncertainty (OOD): 1.2790753
Epistemic error correlation (in-distribution): 0.0616
Epistemic error correlation (OOD): 0.7923

## With sigmoid

In-distribution AUC: 0.8823
OOD AUC: 0.5063
Average epistemic uncertainty (in-distribution): 0.007772154
Average epistemic uncertainty (OOD): 9.5380725e-05
Epistemic error correlation (in-distribution): 0.3135
Epistemic error correlation (OOD): -0.0842