In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install sdmetrics==0.14.1



In [3]:
%cd /content/drive/MyDrive/ORD

/content/drive/MyDrive/ORD


# ORD 성능 측정

In [13]:
import warnings
from sklearn.exceptions import ConvergenceWarning

print("adult"+"="*30)
!python compute_mle.py --dataname adult --target income --method tabsyn
print("cardio"+"="*30)
!python compute_mle.py --dataname cardio --target cardio --method tabsyn
print("fintech"+"="*30)
!python compute_mle.py --dataname fintech --target churn --method tabsyn
print("heloc"+"="*30)
!python compute_mle.py --dataname heloc --target is_at_risk --method tabsyn
print("성능측정 완료")

2038
XG:  80.4
Minority class accuracy:  81.35
Majority class accuracy:  79.45
Fraction of true: 0.804
ECE  [0.04055099]
Figure(640x480)
Figure(640x480)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Scores - 
ada: 0.812960235640648, 
dt: 0.805875152998776, 
lr: 0.7818228763232636, 
mlp: 0.798992443324937
avg: 0.7999126770719062
2055
XG:  81.375
Minority class accuracy:  82.75
Majority class accuracy:  80.0
Fraction of true: 0.81375
ECE  [0.03139633]
Figure(640x480)
Figure(640x480)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


# basic model

In [15]:
import os, glob, warnings
import numpy as np
import pandas as pd

from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)  # LR/MLP 수렴 경고 숨김
warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# XGBoost (있으면 사용)
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False
    print("⚠️ xgboost not available. Skipping XGB baseline.")


# ---- Metrics ----
def expected_calibration_error_binary(p_pos: np.ndarray, y_true: np.ndarray, M: int = 10):
    p_pos = np.asarray(p_pos).reshape(-1)
    y_true = np.asarray(y_true).reshape(-1).astype(int)

    bin_edges = np.linspace(0.0, 1.0, M + 1)
    ece = 0.0
    for i in range(M):
        lo, hi = bin_edges[i], bin_edges[i + 1]
        in_bin = (p_pos > lo) & (p_pos <= hi)
        if not np.any(in_bin):
            continue
        acc = (((p_pos[in_bin] >= 0.5).astype(int) == y_true[in_bin]).mean())
        conf = p_pos[in_bin].mean()
        ece += np.abs(acc - conf) * in_bin.mean()
    return float(ece)

def minority_majority_recall(y_true, y_pred):
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    rec_min = recall_score(y_true, y_pred, pos_label=1)
    rec_maj = recall_score(y_true, y_pred, pos_label=0)
    return float(rec_min), float(rec_maj)


# ---- Target inference ----
COMMON_TARGET_NAMES = ["income","cardio","churn","is_at_risk"]

def infer_target_column(df: pd.DataFrame, user_target=None):
    if user_target and user_target in df.columns:
        return user_target
    for c in COMMON_TARGET_NAMES:
        if c in df.columns:
            return c
    return df.columns[-1]  # fallback


# ---- Preprocessor ----
def build_preprocessor(X: pd.DataFrame):
    cat_cols = [c for c in X.columns if X[c].dtype == "object" or str(X[c].dtype).startswith("category")]
    num_cols = [c for c in X.columns if c not in cat_cols]

    pre = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3
    )
    return pre


# ---- Models ----
def get_models(random_state=42):
    models = {
        "LR": LogisticRegression(max_iter=2000),
        "DT": DecisionTreeClassifier(random_state=random_state),
        "ADA": AdaBoostClassifier(random_state=random_state),
        "RF": RandomForestClassifier(n_estimators=300, random_state=random_state, n_jobs=-1),
        "MLP": MLPClassifier(hidden_layer_sizes=(128,64), max_iter=500, random_state=random_state),
    }
    if HAS_XGB:
        models["XGB"] = XGBClassifier(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=random_state,
            n_jobs=-1,
            eval_metric="logloss",
        )
    return models


def fit_predict_proba(pipe: Pipeline, X_train, y_train, X_test):
    pipe.fit(X_train, y_train)
    if hasattr(pipe, "predict_proba"):
        return pipe.predict_proba(X_test)[:, 1]
    # fallback
    scores = pipe.decision_function(X_test)
    return 1 / (1 + np.exp(-scores))


In [16]:
DATA_ROOT = "/content/drive/MyDrive/ORD/data"
TRAIN_FILE = "imbalanced_noord.csv"
TEST_FILE  = "test.csv"
USER_TARGET = None                 # ex) "income"처럼 고정하고 싶으면 여기 넣어

random_state = 42
models = get_models(random_state=random_state)

dataset_dirs = sorted([p for p in glob.glob(os.path.join(DATA_ROOT, "*")) if os.path.isdir(p)])

all_rows = []
for d in dataset_dirs:
    train_path = os.path.join(d, TRAIN_FILE)
    test_path  = os.path.join(d, TEST_FILE)
    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        continue

    train_df = pd.read_csv(train_path)
    test_df  = pd.read_csv(test_path)

    target = infer_target_column(train_df, USER_TARGET)
    if target not in test_df.columns:
        # 혹시 test쪽 target명이 다르면 test 기준으로 다시 추론
        target2 = infer_target_column(test_df, USER_TARGET)
        if target2 in train_df.columns:
            target = target2
        else:
            print(f"❌ Skip {os.path.basename(d)}: target mismatch")
            continue

    X_train = train_df.drop(columns=[target])
    y_train = train_df[target].astype(int)

    X_test  = test_df.drop(columns=[target])
    y_test  = test_df[target].astype(int)

    pre = build_preprocessor(X_train)

    print(f"\n=== {os.path.basename(d)} | target={target} | n_train={len(train_df)} n_test={len(test_df)} ===")

    for name, clf in models.items():
        pipe = Pipeline([("pre", pre), ("clf", clf)])
        p_pos = fit_predict_proba(pipe, X_train, y_train, X_test)
        y_pred = (p_pos >= 0.5).astype(int)

        acc = accuracy_score(y_test, y_pred)
        rec_min, rec_maj = minority_majority_recall(y_test, y_pred)
        ece = expected_calibration_error_binary(p_pos, y_test, M=10)
        auroc = roc_auc_score(y_test, p_pos) if len(np.unique(y_test)) == 2 else np.nan

        all_rows.append({
            "dataset": os.path.basename(d),
            "train_file": TRAIN_FILE,
            "test_file": TEST_FILE,
            "target": target,
            "model": name,
            "accuracy": acc,
            "minority_recall(1)": rec_min,
            "majority_recall(0)": rec_maj,
            "auroc": auroc,
            "ece": ece,
            "n_train": len(train_df),
            "n_test": len(test_df),
        })

    # 데이터셋별 요약표
    tmp = pd.DataFrame([r for r in all_rows if r["dataset"] == os.path.basename(d)])
    display(tmp.sort_values("accuracy", ascending=False)[["model","accuracy","minority_recall(1)","majority_recall(0)","auroc","ece"]])

results_df = pd.DataFrame(all_rows)
results_df



=== adult | target=income | n_train=32654 n_test=4000 ===


Unnamed: 0,model,accuracy,minority_recall(1),majority_recall(0),auroc,ece
1,DT,0.64075,0.2995,0.982,0.64075,0.009
5,XGB,0.62775,0.256,0.9995,0.905801,0.500387
3,RF,0.6155,0.232,0.999,0.867196,0.202867
2,ADA,0.61025,0.2205,1.0,0.899063,0.13383
4,MLP,0.60375,0.215,0.9925,0.846292,0.4874
0,LR,0.55825,0.117,0.9995,0.889032,0.493379



=== cardio | target=cardio | n_train=33681 n_test=4000 ===


Unnamed: 0,model,accuracy,minority_recall(1),majority_recall(0),auroc,ece
1,DT,0.5205,0.062,0.979,0.5205,0.0105
4,MLP,0.5195,0.046,0.993,0.599378,0.48687
5,XGB,0.50025,0.0005,1.0,0.764662,0.470937
2,ADA,0.50025,0.0005,1.0,0.786756,0.020157
0,LR,0.5,0.0,1.0,0.708857,0.475762
3,RF,0.5,0.0,1.0,0.751074,0.319592



=== fintech | target=churn | n_train=14098 n_test=4000 ===


Unnamed: 0,model,accuracy,minority_recall(1),majority_recall(0),auroc,ece
1,DT,0.524,0.066,0.982,0.524,0.009
4,MLP,0.5135,0.043,0.984,0.625726,0.492227
5,XGB,0.50325,0.0065,1.0,0.722543,0.482708
2,ADA,0.50175,0.0035,1.0,0.708211,0.027416
0,LR,0.5,0.0,1.0,0.695338,0.474595
3,RF,0.5,0.0005,0.9995,0.72234,0.357205



=== heloc | target=is_at_risk | n_train=4080 n_test=2000 ===


Unnamed: 0,model,accuracy,minority_recall(1),majority_recall(0),auroc,ece
1,DT,0.522,0.062,0.982,0.529439,0.031534
4,MLP,0.5115,0.026,0.997,0.618522,0.49766
2,ADA,0.5025,0.012,0.993,0.712457,0.035195
0,LR,0.502,0.004,1.0,0.754549,0.468999
5,XGB,0.501,0.004,0.998,0.719404,0.48819
3,RF,0.5005,0.001,1.0,0.744776,0.312428


Unnamed: 0,dataset,train_file,test_file,target,model,accuracy,minority_recall(1),majority_recall(0),auroc,ece,n_train,n_test
0,adult,imbalanced_noord.csv,test.csv,income,LR,0.55825,0.117,0.9995,0.889032,0.493379,32654,4000
1,adult,imbalanced_noord.csv,test.csv,income,DT,0.64075,0.2995,0.982,0.64075,0.009,32654,4000
2,adult,imbalanced_noord.csv,test.csv,income,ADA,0.61025,0.2205,1.0,0.899063,0.13383,32654,4000
3,adult,imbalanced_noord.csv,test.csv,income,RF,0.6155,0.232,0.999,0.867196,0.202867,32654,4000
4,adult,imbalanced_noord.csv,test.csv,income,MLP,0.60375,0.215,0.9925,0.846292,0.4874,32654,4000
5,adult,imbalanced_noord.csv,test.csv,income,XGB,0.62775,0.256,0.9995,0.905801,0.500387,32654,4000
6,cardio,imbalanced_noord.csv,test.csv,cardio,LR,0.5,0.0,1.0,0.708857,0.475762,33681,4000
7,cardio,imbalanced_noord.csv,test.csv,cardio,DT,0.5205,0.062,0.979,0.5205,0.0105,33681,4000
8,cardio,imbalanced_noord.csv,test.csv,cardio,ADA,0.50025,0.0005,1.0,0.786756,0.020157,33681,4000
9,cardio,imbalanced_noord.csv,test.csv,cardio,RF,0.5,0.0,1.0,0.751074,0.319592,33681,4000
