<a href="https://colab.research.google.com/github/Lcocks/DS6050-DeepLearning/blob/main/German_credit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://colab.research.google.com/drive/1Z4h7a98YGhJ65L3_QsfEe0FOaZh8hTHT?usp=sharing

https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Part 1: Vanilla MLP Baseline for German Credit Risk Prediction
Dataset (UCI ML Repository):
- Name: Statlog (German Credit Data)
- Direct data file used here (space-separated):
  https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data
- Approx. size: 1,000 rows × 20 features + 1 label (the 21st column named `class`).
- Label encoding in the raw file: 1 = Good credit, 2 = Bad credit.
  We remap to: 0 = Good, 1 = Bad (binary classification with "Bad" as the positive class).

Caveats:
- This dataset mixes numeric and categorical (string-coded) features such as 'A11'.
  In this baseline, we use LabelEncoder per categorical column + StandardScaler on numerics.
  A stronger approach might use embeddings or target encoding for high-cardinality variables.

What you should expect:
- Because "Bad Credit" is the minority class (~30% in the full dataset), a vanilla model
  with a 0.5 threshold typically has *lower recall* on "Bad" (misses risky cases).
- Still, the AUC can look decent (e.g., ~0.78–0.80) because the model ranks many
  "Bad" cases higher than "Good" even if we choose a suboptimal decision threshold.

Pipeline steps:
  1) Download the dataset if 'german.data' is not present.
  2) Train the MLP for a fixed number of epochs.
  3) Evaluate on a held-out test split and print AUC + classification report.
  4) Print an interpretation of the metrics and suggest next steps.

"""

# -------------------------
# 0. Imports
# -------------------------
import os
import urllib.request
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, classification_report, confusion_matrix, precision_recall_curve
)
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pin the NumPy and PyTorch random seeds so repeated runs start from the
# same random state.
np.random.seed(42)
torch.manual_seed(42)

# -------------------------
# 2. Data Loading & Preprocessing
# -------------------------
# - If 'german.data' is missing, download from UCI.
# - Read as space-separated file with given column names.
# - Map label {1: Good, 2: Bad} -> {0: Good, 1: Bad}.
# - Identify numeric vs categorical columns.
# - Label-encode categoricals (per column) and standardize numerics (fit on train only).
# - Train/test split with stratification to maintain class ratio.
# - Return PyTorch tensors for model consumption.
def load_and_preprocess_data(file_path='german.data', *, url=None,
                             test_size=0.2, random_state=42):
    """Load, preprocess, and split the German Credit dataset.

    Steps performed:
      1. Download the raw file to ``file_path`` if it does not exist yet.
      2. Read it as a space-separated, header-less table with fixed column names.
      3. Remap the label {1, 2} -> {0, 1} so that 1 means "Bad" (positive class).
      4. Label-encode every non-numeric column (one encoder per column).
      5. Make a stratified train/test split.
      6. Standardize the numeric columns with statistics fitted on the
         training split only.

    Parameters
    ----------
    file_path : str or path-like, default 'german.data'
        Location of the cached raw data file.
    url : str, optional
        Download location used when ``file_path`` is missing. Defaults to the
        UCI 'statlog/german/german.data' file.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    X_train_t : torch.FloatTensor, shape (n_train, d)
    X_test_t  : torch.FloatTensor, shape (n_test, d)
    y_train_t : torch.FloatTensor, shape (n_train,)
    y_test_t  : torch.FloatTensor, shape (n_test,)
    feature_names : list[str]
        Column names in the same order as the returned tensors.

    Raises
    ------
    ValueError
        If the 'class' column contains anything other than 1 or 2.
    """
    if url is None:
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'

    if not os.path.exists(file_path):
        print("Downloading dataset from UCI...")
        # Download to a sibling temp file and rename only on success, so an
        # interrupted transfer never leaves a truncated file at `file_path`
        # that later runs would mistake for a valid cached copy.
        partial_path = f"{file_path}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    columns = [
        'checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount',
        'savings_status', 'employment', 'installment_commitment', 'personal_status',
        'other_parties', 'residence_since', 'property_magnitude', 'age',
        'other_payment_plans', 'housing', 'existing_credits', 'job', 'num_dependents',
        'own_telephone', 'foreign_worker', 'class'
    ]
    # The raw file is single-space separated, no header
    df = pd.read_csv(file_path, sep=' ', header=None, names=columns)

    # Map label to {0: Good, 1: Bad} so that 1 means "Bad" (positive class).
    # Validate first: an unmapped label would become NaN in .map() and then
    # fail in .astype(int) with an error that does not mention the label.
    label_map = {1: 0, 2: 1}
    bad_mask = ~df['class'].isin(list(label_map))
    if bad_mask.any():
        bad_values = df.loc[bad_mask, 'class'].unique().tolist()
        raise ValueError(
            f"Unexpected values in 'class' column: {bad_values}; "
            "expected only 1 (Good) or 2 (Bad)."
        )
    df['class'] = df['class'].map(label_map).astype(int)

    # Separate features and target
    X = df.drop(columns=['class']).copy()
    y = df['class'].values

    # Identify numeric vs categorical by pandas dtype
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

    # Label-encode categoricals *per column*
    # NOTE: This is a baseline choice. For production, consider embeddings/target encoding.
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Stratified train-test split to maintain the class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # Work on explicit copies: assigning into the frames returned by the split
    # can trigger SettingWithCopyWarning on some pandas/scikit-learn versions.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Standardize numeric columns (fit on train only, to avoid test leakage)
    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    feature_names = X.columns.tolist()  # preserve original order

    # Convert to float32 tensors
    X_train_t = torch.tensor(X_train[feature_names].to_numpy(), dtype=torch.float32)
    X_test_t = torch.tensor(X_test[feature_names].to_numpy(), dtype=torch.float32)
    y_train_t = torch.tensor(y_train, dtype=torch.float32)
    y_test_t = torch.tensor(y_test, dtype=torch.float32)

    return X_train_t, X_test_t, y_train_t, y_test_t, feature_names
