In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv("german_credit_data.csv")

# Handle missing values (replace 'NA' with 'unknown')
df.replace("NA", "unknown", inplace=True)

# Encode categorical features
categorical_cols = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Simulated labels: high credit amount AND high duration → bad credit
y_true = (
    (df['Credit amount'] > df['Credit amount'].mean()) &
    (df['Duration'] > df['Duration'].mean())
).astype(int)


credit_duration_ratio = df['Credit amount'] / (df['Duration'] + 1)
X = df.copy()
X['Credit/Dur'] = credit_duration_ratio

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dimensionality reduction using PCA
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Helper to align cluster labels
def align_labels(y_true, y_pred):
    if precision_score(y_true, y_pred) < 0.5:
        return 1 - y_pred
    return y_pred

# Evaluation function
def evaluate_model(name, y_true, y_pred):
    print(f"\n=== {name} Evaluation ===")
    print(f"Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y_true, y_pred):.4f}")

# =============================
# 1. Gaussian Mixture Model
# =============================
gmm = GaussianMixture(n_components=2, random_state=42)
y_gmm = gmm.fit_predict(X_pca)
y_gmm_aligned = align_labels(y_true, y_gmm)
evaluate_model("Gaussian Mixture", y_true, y_gmm_aligned)





=== Gaussian Mixture Evaluation ===
Accuracy:  0.8360
Precision: 0.6872
Recall:    0.6549
F1 Score:  0.6707
