In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

# 1. Load the Dataset
df = pd.read_csv('GermanCredit.csv')

# 2. Data Preprocessing & Feature Engineering
# Separate the feature matrix from the `credit_risk` target column.
X = df.drop(columns='credit_risk')
y = df['credit_risk']

# Identify numeric and categorical columns.
# 'number' matches every integer and float width; matching only int64 would
# leave float or narrower-int columns out of the later scaling step.
numeric_cols = X.select_dtypes(include='number').columns
# Every non-numeric column is treated as categorical, so `category`, boolean
# and string-typed columns get encoded alongside plain `object` ones instead
# of reaching the estimators as raw labels.
categorical_cols = X.select_dtypes(exclude='number').columns

# Convert categorical variables into dummy/indicator variables (One-Hot
# Encoding). drop_first=True omits one level per column so the indicators of
# a column are not perfectly collinear.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# 3. Hold out 20% of the rows for testing, keeping the class balance of `y`
#    the same in both partitions (stratified split, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 4. Standardise the numeric columns (matters for Logistic Regression).
#    The scaler learns its statistics from the training rows only and is then
#    applied unchanged to the test rows, so no test information leaks in.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

# Work on copies so the unscaled splits stay available.
X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# 5. Candidate classifiers, all seeded for reproducible results
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
}

# 6. Fit each classifier on the scaled training split and report how it does
#    on the held-out test split.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Hard labels feed accuracy and the per-class report; the second
    # probability column feeds ROC-AUC.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Assemble the whole report first, then emit it in a single write.
    report_lines = [
        f"--- {name} ---",
        f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
        f"ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}",
        "Classification Report:",
        classification_report(y_test, y_pred),
        "-" * 30,
    ]
    print("\n".join(report_lines))

--- Logistic Regression ---
Accuracy: 0.7200
ROC-AUC: 0.7567
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.47      0.50        60
           1       0.78      0.83      0.81       140

    accuracy                           0.72       200
   macro avg       0.66      0.65      0.65       200
weighted avg       0.71      0.72      0.71       200

------------------------------
--- Decision Tree ---
Accuracy: 0.6850
ROC-AUC: 0.6369
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.52      0.50        60
           1       0.79      0.76      0.77       140

    accuracy                           0.69       200
   macro avg       0.63      0.64      0.63       200
weighted avg       0.69      0.69      0.69       200

------------------------------
--- Random Forest ---
Accuracy: 0.7300
ROC-AUC: 0.7369
Classification Report:
              precision    recall  f1-score 