In [3]:
# ---------------------------------------------------------------
# Taiwan Credit‑Card Default Prediction – ML baseline script (v3)
# Otimized for Balanced Accuracy (BalAcc)
# Dataset: "Default of Credit Card Clients" (UCI, 30 000 rows, 23 features)
# ---------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve,
    precision_recall_curve
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module=r"sklearn\..*")

RANDOM_STATE = 42

# helpers

def signed_log1p(x: np.ndarray) -> np.ndarray:
    x = x.astype(float)
    return np.sign(x) * np.log1p(np.abs(x))

log_transformer = FunctionTransformer(signed_log1p, validate=False)

# preprocessing builder

def make_preprocessor(df):
    bills_and_pays = [f"BILL_AMT{i}" for i in range(1, 7)] + [f"PAY_AMT{i}" for i in range(1, 7)]
    num_cols = [
        "LIMIT_BAL", "AGE",
        *[f"PAY_RATIO{i}" for i in range(1, 7)],
        "TOTAL_BILL_6M", "TOTAL_PAY_6M", "LATE_MONTHS_COUNT"
    ]
    cat_cols = ["SEX", "EDUCATION", "MARRIAGE"]
    pay_status_cols = [c for c in df.columns if c.startswith("PAY_") and not c.startswith("PAY_AMT")]

    return ColumnTransformer([
        ("log_bill", Pipeline([("log", log_transformer), ("scaler", StandardScaler())]), bills_and_pays),
        ("scale_num", StandardScaler(), num_cols),
        ("onehot_cat", OneHotEncoder(drop="first", sparse_output=False), cat_cols),
        ("pass_pay", "passthrough", pay_status_cols)
    ], remainder="drop", verbose_feature_names_out=False)

# model‑pipeline builder

def make_pipeline(clf, preprocessor):
    return ImbPipeline([
        ("preproc", preprocessor),
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", clf)
    ])

# main

if __name__ == "__main__":
    # 1. Load
    df = pd.read_csv("default of credit card clients.csv", sep=";", header=1)
    df.drop_duplicates(inplace=True)
    df.fillna(0, inplace=True)

    # 2. Feature engineering
    for i in range(1, 7):
        denom = df[f"BILL_AMT{i}"] + 1
        df[f"PAY_RATIO{i}"] = np.where(denom > 0, df[f"PAY_AMT{i}"] / denom, 0.0)
    df["TOTAL_BILL_6M"] = df[[f"BILL_AMT{i}" for i in range(1, 7)]].sum(axis=1)
    df["TOTAL_PAY_6M"] = df[[f"PAY_AMT{i}" for i in range(1, 7)]].sum(axis=1)
    pay_status_cols = [c for c in df.columns if c.startswith("PAY_") and not c.startswith("PAY_AMT")]
    df["LATE_MONTHS_COUNT"] = (df[pay_status_cols] > 0).sum(axis=1)

    target_col = "default payment next month"
    X, y = df.drop(columns=["ID", target_col]), df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE)

    preprocessor = make_preprocessor(df)

    # 3. models & grids
    models = {
        "KNN": KNeighborsClassifier(),
        "SVM": SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE, class_weight="balanced"),
        "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE)
    }

    param_grids = {
        "KNN": {
            "clf__n_neighbors": [1, 3, 5, 7, 10, 15, 20, 30, 40, 50],
            "clf__weights": ["uniform", "distance"],
            "clf__p": [1, 2]
        },
        "SVM": {
            "clf__C": [0.1, 1, 10],
            "clf__gamma": ["scale", 0.01, 0.1]
        },
        "AdaBoost": {
            "clf__n_estimators": [50, 100, 200],
            "clf__learning_rate": [0.5, 1.0, 1.5]
        }
    }

    scoring_balacc = 'balanced_accuracy'

    best_pipes, results = {}, {}
    plt.figure(figsize=(8, 6))

    for name, model in models.items():
        print(f"\n=== GridSearch ({name}) optimise BalAcc ===")
        grid = GridSearchCV(
            make_pipeline(model, preprocessor),
            param_grids[name],
            cv=StratifiedKFold(3, shuffle=True, random_state=RANDOM_STATE),
            scoring=scoring_balacc,
            n_jobs=-1,
            return_train_score=True
        )
        grid.fit(X_train, y_train)
        print("Best params:", grid.best_params_)
        print("CV BalAcc:", grid.best_score_)

        best = grid.best_estimator_
        best_pipes[name] = best

        # --- metrics ---
        y_pred = best.predict(X_test)
        y_proba = best.predict_proba(X_test)[:, 1] if hasattr(best.named_steps["clf"], "predict_proba") else best.decision_function(X_test)

        res = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "BalAcc": balanced_accuracy_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred),
            "ROC_AUC": roc_auc_score(y_test, y_proba)
        }
        results[name] = res

        fpr, tpr, _ = roc_curve(y_test, y_proba)
        plt.plot(fpr, tpr, label=f"{name} (AUC={res['ROC_AUC']:.3f})")

    # -- ROC plot
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title("ROC curves (BalAcc‑optimised)"); plt.legend(); plt.grid(True); plt.tight_layout(); plt.savefig("roc_curves_v3.png"); plt.close()

    # -- PR curve
    plt.figure(figsize=(8,6))
    for name, best in best_pipes.items():
        y_pred_final = best.predict(X_test)
        print(f"Confusion matrix ({name}):", confusion_matrix(y_test, y_pred_final))
        print(classification_report(y_test, y_pred_final))


=== GridSearch (KNN) optimise BalAcc ===
Best params: {'clf__n_neighbors': 50, 'clf__p': 1, 'clf__weights': 'uniform'}
CV BalAcc: 0.6816412508324116

=== GridSearch (SVM) optimise BalAcc ===
Best params: {'clf__C': 1, 'clf__gamma': 0.01}
CV BalAcc: 0.7024647740159432

=== GridSearch (AdaBoost) optimise BalAcc ===
Best params: {'clf__learning_rate': 0.5, 'clf__n_estimators': 50}
CV BalAcc: 0.6942668684784111
Confusion matrix (KNN): [[4262 1579]
 [ 614 1045]]
              precision    recall  f1-score   support

           0       0.87      0.73      0.80      5841
           1       0.40      0.63      0.49      1659

    accuracy                           0.71      7500
   macro avg       0.64      0.68      0.64      7500
weighted avg       0.77      0.71      0.73      7500

Confusion matrix (SVM): [[4841 1000]
 [ 715  944]]
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      5841
           1       0.49      0.57      0.52      1

<Figure size 800x600 with 0 Axes>