In [3]:
# -*- coding: utf-8 -*-
"""
Support Needs Prediction (Binary) — No AutoML
================================================
- Input columns (example):
  ID, age, gender, tenure, frequent, payment_interval, subscription_type, contract_length, after_interaction, support_needs
- Goal: Predict `support_needs` (0/1) without AutoML frameworks.
- What this script does:
  1) Load & split data (stratified train/test)
  2) Preprocess (impute, scale numeric; impute + one-hot categorical)
  3) Train several fixed-parameter models (no hyperparameter search)
  4) Cross-validate on train and pick the best model by PR-AUC (average_precision)
  5) Tune decision threshold on out-of-fold predictions to maximize F1
  6) Evaluate on test (ROC-AUC, PR-AUC, F1, precision, recall, confusion matrix)
  7) Compute permutation feature importance and export artifacts
  8) Save the fitted pipeline to disk (joblib)

- Outputs (created under ./ml_outputs/):
  - cv_results.csv, best_model.txt
  - threshold_report.txt
  - test_metrics.txt, confusion_matrix.png
  - permutation_importance.csv, permutation_importance.png
  - predictions_test.csv (ID, y_true, y_prob, y_pred)
  - support_needs_model.pkl (joblib pipeline)

Notes
-----
- No AutoML libraries (e.g., autosklearn/TPOT) used; only fixed configurations and simple model selection.
- If class imbalance is strong, class_weight/scale_pos_weight are set.
- XGBoost/LightGBM are optional (used only if installed). Otherwise skip.
"""

import os
import sys
import json
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt

try:
    import joblib
except Exception:
    joblib = None

# =============================
# Config
# =============================
CSV_PATH = Path("./open/train.csv")
TARGET_COL = "support_needs"
ID_COL = "ID"
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_FOLDS = 5
TOP_K_FRAC = 0.1  # Evaluate recall within top k% highest-probability cases
OUTPUT_DIR = CSV_PATH.parent / "ml_outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Columns expected (numeric/cat). We'll infer but can be overridden here.
NUM_HINT = [
    "age", "tenure", "frequent", "payment_interval",
    "contract_length", "after_interaction"
]
CAT_HINT = ["gender", "subscription_type"]

# =============================
# Utility
# =============================

def log(msg: str):
    print(msg)


def read_data(csv_path: str) -> pd.DataFrame:
    log(f">> Loading: {csv_path}")
    df = pd.read_csv(csv_path)
    return df


def split_xy(df: pd.DataFrame):
    assert TARGET_COL in df.columns, f"Target column '{TARGET_COL}' not found."
    y = pd.to_numeric(df[TARGET_COL], errors="coerce").fillna(0).astype(int)
    X = df.drop(columns=[TARGET_COL])
    return X, y


def infer_column_types(df: pd.DataFrame):
    cols = df.columns.tolist()
    cat_cols = []
    num_cols = []
    for c in cols:
        if c == TARGET_COL:
            continue
        if c == ID_COL:
            continue
        if c in NUM_HINT:
            num_cols.append(c)
        elif c in CAT_HINT:
            cat_cols.append(c)
        else:
            # heuristic: numeric-looking -> numeric else categorical
            if pd.api.types.is_numeric_dtype(df[c]):
                num_cols.append(c)
            else:
                # Try coercion
                sample = pd.to_numeric(df[c], errors="coerce")
                if sample.notna().mean() > 0.9:
                    num_cols.append(c)
                else:
                    cat_cols.append(c)
    return num_cols, cat_cols


def build_preprocessor(num_cols, cat_cols):
    num_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    cat_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    pre = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    return pre


def get_models(y_train: pd.Series):
    """Return dict of fixed-parameter models (no HPO). Uses class weights for imbalance."""
    pos_rate = y_train.mean()
    neg_rate = 1 - pos_rate
    scale_pos_weight = (neg_rate / pos_rate) if pos_rate > 0 else 1.0

    models = {
        "logreg": LogisticRegression(
            max_iter=2000,
            class_weight="balanced",
            n_jobs=None if hasattr(LogisticRegression, 'n_jobs') else None,
        ),
        "rf": RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            class_weight="balanced_subsample",
            random_state=RANDOM_STATE,
            n_jobs=-1,
        ),
        "gbdt": GradientBoostingClassifier(
            random_state=RANDOM_STATE,
        ),
    }

    # Optional: XGBoost
    try:
        import xgboost as xgb
        models["xgb"] = xgb.XGBClassifier(
            n_estimators=500,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=RANDOM_STATE,
            tree_method="hist",
            eval_metric="logloss",
            n_jobs=-1,
            scale_pos_weight=scale_pos_weight,
        )
    except Exception:
        pass

    # Optional: LightGBM
    try:
        import lightgbm as lgb
        models["lgbm"] = lgb.LGBMClassifier(
            n_estimators=800,
            num_leaves=63,
            learning_rate=0.03,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=RANDOM_STATE,
            class_weight="balanced",
            n_jobs=-1,
        )
    except Exception:
        pass

    return models


def cross_validate_models(pre, models: dict, X_train: pd.DataFrame, y_train: pd.Series, n_folds: int = 5) -> pd.DataFrame:
    rows = []
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scoring = {
        "roc_auc": "roc_auc",
        "average_precision": "average_precision",  # PR-AUC
        "f1": "f1",
        "precision": "precision",
        "recall": "recall",
    }
    for name, clf in models.items():
        pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
        scores = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False)
        rows.append({
            "model": name,
            **{f"mean_{k}": np.mean(v) for k, v in scores.items() if k.startswith("test_")},
            **{f"std_{k}": np.std(v) for k, v in scores.items() if k.startswith("test_")},
        })
    df = pd.DataFrame(rows)
    order = df.sort_values("mean_test_average_precision", ascending=False)
    return order


def pick_best_model(cv_df: pd.DataFrame) -> str:
    # Select by highest PR-AUC (average_precision)
    best = cv_df.iloc[0]
    return best["model"]


def oof_threshold_search(pre, clf, X_train, y_train, n_folds: int = 5):
    """Get OOF probabilities and choose threshold that maximizes F1."""
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
    oof_proba = cross_val_predict(pipe, X_train, y_train, cv=cv, method="predict_proba", n_jobs=-1)[:, 1]
    precision, recall, thresh = precision_recall_curve(y_train, oof_proba)
    f1_vals = (2 * precision * recall) / (precision + recall + 1e-12)
    best_idx = int(np.nanargmax(f1_vals))
    best_threshold = 0.5 if best_idx >= len(thresh) else float(thresh[best_idx])
    best_f1 = float(np.nanmax(f1_vals))
    return best_threshold, best_f1, oof_proba


def evaluate_on_test(pre, clf, X_train, y_train, X_test, y_test, threshold: float):
    pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_test)[:, 1]

    # Metrics
    roc = roc_auc_score(y_test, proba)
    ap = average_precision_score(y_test, proba)
    y_pred = (proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred)

    # Top-K recall (capture rate in top-k%)
    k = int(np.ceil(len(proba) * TOP_K_FRAC))
    top_idx = np.argsort(-proba)[:k]
    topk_recall = y_test.iloc[top_idx].sum() / y_test.sum() if y_test.sum() > 0 else 0.0

    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)

    return {
        "pipeline": pipe,
        "proba": proba,
        "y_pred": y_pred,
        "roc_auc": roc,
        "pr_auc": ap,
        "f1": f1,
        "precision": prec,
        "recall": rec,
        "topk_recall": float(topk_recall),
        "confusion_matrix": cm,
        "cls_report": report,
    }


def get_feature_names(pre, num_cols, cat_cols):
    names = []
    if len(num_cols) > 0:
        names.extend(num_cols)
    if len(cat_cols) > 0:
        ohe = pre.named_transformers_["cat"].named_steps["ohe"]
        try:
            ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
        except Exception:
            # Fallback for older scikit-learn
            ohe_names = []
            for i, c in enumerate(cat_cols):
                cats = ohe.categories_[i]
                ohe_names.extend([f"{c}_{val}" for val in cats])
        names.extend(ohe_names)
    return names


def plot_confusion_matrix(cm, outpath: Path):
    fig, ax = plt.subplots(figsize=(4,4))
    im = ax.imshow(cm, cmap="Blues")
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=[0,1], yticks=[0,1], xticklabels=[0,1], yticklabels=[0,1],
           ylabel='True label', xlabel='Predicted label', title='Confusion Matrix')
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'), ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    fig.savefig(outpath, dpi=140)
    plt.close(fig)


def plot_permutation_importance(pipe, X_test, y_test, feature_names, out_csv: Path, out_png: Path):
    r = permutation_importance(pipe, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1, scoring="average_precision")
    imp = pd.DataFrame({"feature": feature_names, "importance_mean": r.importances_mean, "importance_std": r.importances_std})
    imp.sort_values("importance_mean", ascending=False, inplace=True)
    imp.to_csv(out_csv, index=False)

    top = imp.head(20)
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.barh(top["feature"][::-1], top["importance_mean"][::-1])
    ax.set_title("Permutation Importance (PR-AUC)")
    ax.set_xlabel("Mean Importance")
    fig.tight_layout()
    fig.savefig(out_png, dpi=140)
    plt.close(fig)


def main():
    df = read_data(CSV_PATH)

    # Optional: keep ID aside
    id_series = None
    if ID_COL in df.columns:
        id_series = df[ID_COL].astype(str)

    X, y = split_xy(df)

    # Infer column types
    num_cols, cat_cols = infer_column_types(X)
    log(f"Numeric cols ({len(num_cols)}): {num_cols}")
    log(f"Categorical cols ({len(cat_cols)}): {cat_cols}")

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )

    # Preprocessor
    pre = build_preprocessor(num_cols, cat_cols)

    # Models
    models = get_models(y_train)
    log(f"Models considered: {list(models.keys())}")

    # Cross-validate models on train
    cv_df = cross_validate_models(pre, models, X_train, y_train, n_folds=N_FOLDS)
    cv_path = OUTPUT_DIR / "cv_results.csv"
    cv_df.to_csv(cv_path, index=False)
    log(f"Saved CV results -> {cv_path}")

    best_name = pick_best_model(cv_df)
    best_clf = models[best_name]
    with open(OUTPUT_DIR / "best_model.txt", "w", encoding="utf-8") as f:
        f.write(f"Best by PR-AUC: {best_name}\n")
        f.write(cv_df.head(1).to_string(index=False))
    log(f"Best model: {best_name}")

    # Threshold tuning on OOF predictions
    thr, best_f1_oof, oof_proba = oof_threshold_search(pre, best_clf, X_train, y_train, n_folds=N_FOLDS)
    with open(OUTPUT_DIR / "threshold_report.txt", "w", encoding="utf-8") as f:
        f.write(json.dumps({
            "oof_best_threshold": thr,
            "oof_best_f1": best_f1_oof,
        }, indent=2))
    log(f"Chosen threshold (OOF, max F1): {thr:.4f} | OOF F1: {best_f1_oof:.4f}")

    # Final train on full train, evaluate on test
    result = evaluate_on_test(pre, best_clf, X_train, y_train, X_test, y_test, threshold=thr)

    # Save metrics
    metrics_path = OUTPUT_DIR / "test_metrics.txt"
    with open(metrics_path, "w", encoding="utf-8") as f:
        f.write(
            f"ROC-AUC: {result['roc_auc']:.4f}\nPR-AUC: {result['pr_auc']:.4f}\nF1: {result['f1']:.4f}\n"
            f"Precision: {result['precision']:.4f}\nRecall: {result['recall']:.4f}\nTop{int(TOP_K_FRAC*100)}% Recall: {result['topk_recall']:.4f}\n\n"
            f"Classification Report:\n{result['cls_report']}\n"
        )
    log(f"Saved test metrics -> {metrics_path}")

    # Save confusion matrix plot
    cm_path = OUTPUT_DIR / "confusion_matrix.png"
    plot_confusion_matrix(result["confusion_matrix"], cm_path)
    log(f"Saved confusion matrix -> {cm_path}")

    # Save predictions (test)
    pred_path = OUTPUT_DIR / "predictions_test.csv"
    out_df = pd.DataFrame({
        ID_COL: X_test[ID_COL].astype(str) if (ID_COL in X_test.columns) else np.arange(len(X_test)),
        "y_true": y_test.values,
        "y_prob": result["proba"],
        "y_pred": result["y_pred"],
    })
    out_df.to_csv(pred_path, index=False)
    log(f"Saved test predictions -> {pred_path}")

    # Permutation importance
    # Fit final pipeline on TRAIN (already fit inside evaluate_on_test), reuse it
    pipe = result["pipeline"]

    # Build feature name list from fitted preprocessor
    pre_fitted = pipe.named_steps["pre"]
    feature_names = get_feature_names(pre_fitted, num_cols, cat_cols)

    imp_csv = OUTPUT_DIR / "permutation_importance.csv"
    imp_png = OUTPUT_DIR / "permutation_importance.png"
    plot_permutation_importance(pipe, X_test, y_test, feature_names, imp_csv, imp_png)
    log(f"Saved permutation importance -> {imp_csv}, {imp_png}")

    # Save model
    if joblib is not None:
        model_path = OUTPUT_DIR / "support_needs_model.pkl"
        joblib.dump(pipe, model_path)
        log(f"Saved pipeline -> {model_path}")
    else:
        log("joblib not available; skipping model serialization.")

    # Quick imbalance hint
    pos_rate = y.mean()
    if pos_rate < 0.1:
        log(f"NOTE: Positive rate is low ({pos_rate:.3f}). Consider alternative thresholds or cost-sensitive objectives.")


if __name__ == "__main__":
    main()


>> Loading: open/train.csv
Numeric cols (6): ['age', 'tenure', 'frequent', 'payment_interval', 'contract_length', 'after_interaction']
Categorical cols (2): ['gender', 'subscription_type']
Models considered: ['logreg', 'rf', 'gbdt']
Saved CV results -> open/ml_outputs/cv_results.csv
Best model: gbdt


ValueError: multiclass format is not supported

In [6]:
# -*- coding: utf-8 -*-
"""
Support Needs Prediction (Multiclass) — No AutoML
=================================================
- Input columns (example):
  ID, age, gender, tenure, frequent, payment_interval, subscription_type, contract_length, after_interaction, support_needs
- Goal: Predict `support_needs` **(multiclass)** without AutoML frameworks.
- What this script does:
  1) Load & split data (stratified train/test)
  2) Preprocess (impute, scale numeric; impute + one-hot categorical)
  3) Train several fixed-parameter **multiclass** models (no hyperparameter search)
  4) Cross-validate and pick the best model by **macro PR-friendly metrics** (e.g., F1-macro, log loss, ROC-AUC OVR macro)
  5) (No binary threshold tuning) — use argmax of class probabilities
  6) Evaluate on test (Accuracy, Balanced Acc, F1-macro, Precision/Recall-macro, LogLoss, ROC-AUC OVR macro)
  7) Compute permutation feature importance and export artifacts
  8) Save the fitted pipeline to disk (joblib)

- Outputs (created under ./ml_outputs/):
  - cv_results.csv, best_model.txt
  - test_metrics.txt, confusion_matrix.png
  - permutation_importance.csv, permutation_importance.png
  - predictions_test.csv (ID, y_true, y_pred, proba_<class>...)
  - support_needs_model.pkl (joblib pipeline)

Notes
-----
- No AutoML libraries used; only fixed configurations and simple model selection.
- Multiclass metrics emphasize **macro averages** to treat classes uniformly.
"""

import os
import json
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    log_loss,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
    top_k_accuracy_score,
    make_scorer,
)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
import matplotlib.pyplot as plt

try:
    import joblib
except Exception:
    joblib = None

# =============================
# Config
# =============================
CSV_PATH = Path("./open/train.csv")
TARGET_COL = "support_needs"
ID_COL = "ID"
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_FOLDS = 5
OUTPUT_DIR = CSV_PATH.parent / "ml_outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Columns expected (numeric/cat). We'll infer but can be overridden here.
NUM_HINT = [
    "age", "tenure", "frequent", "payment_interval",
    "contract_length", "after_interaction"
]
CAT_HINT = ["gender", "subscription_type"]

# =============================
# Utility
# =============================

def log(msg: str):
    print(msg)


def read_data(csv_path: str) -> pd.DataFrame:
    log(f">> Loading: {csv_path}")
    df = pd.read_csv(csv_path)
    return df


def split_xy(df: pd.DataFrame):
    assert TARGET_COL in df.columns, f"Target column '{TARGET_COL}' not found."
    y = df[TARGET_COL]  # keep labels as-is (can be strings)
    X = df.drop(columns=[TARGET_COL])
    return X, y


def infer_column_types(df: pd.DataFrame):
    cols = df.columns.tolist()
    cat_cols = []
    num_cols = []
    for c in cols:
        if c == TARGET_COL or c == ID_COL:
            continue
        if c in NUM_HINT:
            num_cols.append(c)
        elif c in CAT_HINT:
            cat_cols.append(c)
            
        else:
            if pd.api.types.is_numeric_dtype(df[c]):
                num_cols.append(c)
            else:
                sample = pd.to_numeric(df[c], errors="coerce")
                if sample.notna().mean() > 0.9:
                    num_cols.append(c)
                else:
                    cat_cols.append(c)
    return num_cols, cat_cols


def build_preprocessor(num_cols, cat_cols):
    num_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    cat_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    pre = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    return pre


def get_models():
    """Return dict of fixed-parameter multiclass models (no HPO)."""
    models = {
        "logreg_multinomial": LogisticRegression(
            max_iter=1000,
            multi_class="multinomial",
            class_weight="balanced",
            solver="lbfgs",
            n_jobs=None if hasattr(LogisticRegression, 'n_jobs') else None,
        ),
        "rf": RandomForestClassifier(
            n_estimators=400,
            class_weight="balanced_subsample",
            random_state=RANDOM_STATE,
            n_jobs=-1,
        ),
        "gbdt": GradientBoostingClassifier(random_state=RANDOM_STATE),
        "hist_gbdt": HistGradientBoostingClassifier(
            random_state=RANDOM_STATE,
            max_depth=None,
        ),
    }
    return models


def build_scoring():
    scoring = {
        "accuracy": "accuracy",
        "balanced_accuracy": "balanced_accuracy",
        "f1_macro": "f1_macro",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "neg_log_loss": "neg_log_loss",
        "roc_auc_ovr_macro": make_scorer(roc_auc_score, needs_proba=True, multi_class="ovr", average="macro"),
        "top3_accuracy": make_scorer(top_k_accuracy_score, k=3),
    }
    return scoring


def cross_validate_models(pre, models: dict, X_train: pd.DataFrame, y_train: pd.Series, n_folds: int = 5) -> pd.DataFrame:
    rows = []
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scoring = build_scoring()
    for name, clf in models.items():
        pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
        scores = cross_validate(
            pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False
        )
        row = {"model": name}
        for k, v in scores.items():
            if k.startswith("test_"):
                m = k.replace("test_", "mean_")
                s = k.replace("test_", "std_")
                row[m] = np.mean(v)
                row[s] = np.std(v)
        rows.append(row)
    df = pd.DataFrame(rows)
    # Primary sort key: F1-macro, secondary: neg_log_loss (higher is better as it's neg), tertiary: roc_auc
    df = df.sort_values(["mean_f1_macro", "mean_neg_log_loss", "mean_roc_auc_ovr_macro"], ascending=[False, False, False])
    return df


def pick_best_model(cv_df: pd.DataFrame) -> str:
    best = cv_df.iloc[0]
    return best["model"]


def evaluate_on_test(pre, clf, X_train, y_train, X_test, y_test):
    pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)

    proba = None
    if hasattr(pipe, "predict_proba") or hasattr(pipe.named_steps["clf"], "predict_proba"):
        proba = pipe.predict_proba(X_test)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    bacc = balanced_accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average="macro")
    precm = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recm = recall_score(y_test, y_pred, average="macro")

    ll = None
    roc_macro = None
    if proba is not None:
        ll = log_loss(y_test, proba)
        try:
            roc_macro = roc_auc_score(y_test, proba, multi_class="ovr", average="macro")
        except Exception:
            roc_macro = None

    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)

    return {
        "pipeline": pipe,
        "proba": proba,
        "y_pred": y_pred,
        "accuracy": acc,
        "balanced_accuracy": bacc,
        "f1_macro": f1m,
        "precision_macro": precm,
        "recall_macro": recm,
        "log_loss": ll,
        "roc_auc_ovr_macro": roc_macro,
        "confusion_matrix": cm,
        "cls_report": report,
    }


def get_feature_names(pre, num_cols, cat_cols):
    names = []
    if len(num_cols) > 0:
        names.extend(num_cols)
    if len(cat_cols) > 0:
        ohe = pre.named_transformers_["cat"].named_steps["ohe"]
        try:
            ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
        except Exception:
            ohe_names = []
            for i, c in enumerate(cat_cols):
                cats = ohe.categories_[i]
                ohe_names.extend([f"{c}_{val}" for val in cats])
        names.extend(ohe_names)
    return names


def plot_confusion_matrix(cm, class_labels, outpath: Path):
    fig, ax = plt.subplots(figsize=(6, 5))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot(ax=ax, cmap="Blues", values_format='d', colorbar=True)
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    fig.savefig(outpath, dpi=140)
    plt.close(fig)


def plot_permutation_importance(pipe, X_test, y_test, feature_names, out_csv: Path, out_png: Path):
    # Use neg_log_loss to value probability quality across classes
    r = permutation_importance(
        pipe, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1, scoring="neg_log_loss"
    )
    imp = pd.DataFrame({"feature": feature_names, "importance_mean": r.importances_mean, "importance_std": r.importances_std})
    imp.sort_values("importance_mean", ascending=False, inplace=True)
    imp.to_csv(out_csv, index=False)

    top = imp.head(20)
    fig, ax = plt.subplots(figsize=(8, 7))
    ax.barh(top["feature"][::-1], top["importance_mean"][::-1])
    ax.set_title("Permutation Importance (neg_log_loss)")
    ax.set_xlabel("Mean Importance")
    fig.tight_layout()
    fig.savefig(out_png, dpi=140)
    plt.close(fig)


def main():
    df = read_data(CSV_PATH)

    # Optional: keep ID aside
    id_series = df[ID_COL].astype(str) if ID_COL in df.columns else None

    X, y = split_xy(df)

    # Infer column types
    num_cols, cat_cols = infer_column_types(X)
    log(f"Numeric cols ({len(num_cols)}): {num_cols}")
    log(f"Categorical cols ({len(cat_cols)}): {cat_cols}")

    # Train/test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )

    # Preprocessor
    pre = build_preprocessor(num_cols, cat_cols)

    # Models
    models = get_models()
    log(f"Models considered: {list(models.keys())}")

    # Cross-validate models on train
    cv_df = cross_validate_models(pre, models, X_train, y_train, n_folds=N_FOLDS)
    cv_path = OUTPUT_DIR / "cv_results.csv"
    cv_df.to_csv(cv_path, index=False)
    log(f"Saved CV results -> {cv_path}")

    best_name = pick_best_model(cv_df)
    best_clf = models[best_name]
    with open(OUTPUT_DIR / "best_model.txt", "w", encoding="utf-8") as f:
        f.write(f"Best by F1-macro, neg_log_loss, ROC-AUC OVR macro: {best_name}")
        f.write(cv_df.head(1).to_string(index=False))
    log(f"Best model: {best_name}")

    # Final train and evaluate on test
    result = evaluate_on_test(pre, best_clf, X_train, y_train, X_test, y_test)

    # Save metrics
    metrics_path = OUTPUT_DIR / "test_metrics.txt"
    with open(metrics_path, "w", encoding="utf-8") as f:
        f.write(
            (
                f"Accuracy: {result['accuracy']:.4f}"
                f"Balanced Acc: {result['balanced_accuracy']:.4f}"
                f"F1-macro: {result['f1_macro']:.4f}"
                f"Precision-macro: {result['precision_macro']:.4f}"
                f"Recall-macro: {result['recall_macro']:.4f}"
            )
        )
        if result["log_loss"] is not None:
            f.write(f"LogLoss: {result['log_loss']:.4f}")
        if result["roc_auc_ovr_macro"] is not None:
            f.write(f"ROC-AUC (OVR macro): {result['roc_auc_ovr_macro']:.4f}")
        f.write("Classification Report:")
        f.write(result["cls_report"])    
    log(f"Saved test metrics -> {metrics_path}")

    # Save confusion matrix plot
    classes = None
    # get class labels from fitted estimator
    try:
        classes = result["pipeline"].named_steps["clf"].classes_
    except Exception:
        classes = np.unique(y)

    cm_path = OUTPUT_DIR / "confusion_matrix.png"
    plot_confusion_matrix(result["confusion_matrix"], classes, cm_path)
    log(f"Saved confusion matrix -> {cm_path}")

    # Save predictions (test), including per-class probabilities if available
    pred_path = OUTPUT_DIR / "predictions_test.csv"
    out_df = pd.DataFrame({
        ID_COL: X_test[ID_COL].astype(str) if (ID_COL in X_test.columns) else np.arange(len(X_test)),
        "y_true": y_test.values,
        "y_pred": result["y_pred"],
    })
    if result["proba"] is not None:
        for i, c in enumerate(classes):
            out_df[f"proba_{c}"] = result["proba"][:, i]
    out_df.to_csv(pred_path, index=False)
    log(f"Saved test predictions -> {pred_path}")

    # Permutation importance (uses neg_log_loss scorer)
    pipe = result["pipeline"]
    pre_fitted = pipe.named_steps["pre"]
    feature_names = get_feature_names(pre_fitted, num_cols, cat_cols)
    imp_csv = OUTPUT_DIR / "permutation_importance.csv"
    imp_png = OUTPUT_DIR / "permutation_importance.png"
    plot_permutation_importance(pipe, X_test, y_test, feature_names, imp_csv, imp_png)
    log(f"Saved permutation importance -> {imp_csv}, {imp_png}")

    # Save model
    if joblib is not None:
        model_path = OUTPUT_DIR / "support_needs_model.pkl"
        from joblib import dump
        dump(pipe, model_path)
        log(f"Saved pipeline -> {model_path}")
    else:
        log("joblib not available; skipping model serialization.")


if __name__ == "__main__":
    main()


>> Loading: open/train.csv
Numeric cols (6): ['age', 'tenure', 'frequent', 'payment_interval', 'contract_length', 'after_interaction']
Categorical cols (2): ['gender', 'subscription_type']
Models considered: ['logreg_multinomial', 'rf', 'gbdt', 'hist_gbdt']
Saved CV results -> open/ml_outputs/cv_results.csv
Best model: rf
Saved test metrics -> open/ml_outputs/test_metrics.txt
Saved confusion matrix -> open/ml_outputs/confusion_matrix.png
Saved test predictions -> open/ml_outputs/predictions_test.csv


ValueError: All arrays must be of the same length

In [15]:
# -*- coding: utf-8 -*-
"""
Support Needs Prediction (Multiclass) — No AutoML
=================================================
- Input columns (example):
  ID, age, gender, tenure, frequent, payment_interval, subscription_type, contract_length, after_interaction, support_needs
- Goal: Predict `support_needs` **(multiclass)** without AutoML frameworks.
- What this script does:
  1) Load & split data (stratified train/test)
  2) Preprocess (impute, scale numeric; impute + one-hot categorical)
  3) Train several fixed-parameter **multiclass** models (no hyperparameter search)
  4) Cross-validate and pick the best model by **macro PR-friendly metrics** (e.g., F1-macro, log loss, ROC-AUC OVR macro)
  5) (No binary threshold tuning) — use argmax of class probabilities
  6) Evaluate on test (Accuracy, Balanced Acc, F1-macro, Precision/Recall-macro, LogLoss, ROC-AUC OVR macro)
  7) Compute permutation feature importance and export artifacts
  8) Save the fitted pipeline to disk (joblib)

- Outputs (created under ./ml_outputs/):
  - cv_results.csv, best_model.txt
  - test_metrics.txt, confusion_matrix.png
  - permutation_importance.csv, permutation_importance.png
  - predictions_test.csv (ID, y_true, y_pred, proba_<class>...)
  - support_needs_model.pkl (joblib pipeline)

Notes
-----
- No AutoML libraries used; only fixed configurations and simple model selection.
- Multiclass metrics emphasize **macro averages** to treat classes uniformly.
"""

import os
import json
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    log_loss,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
    top_k_accuracy_score,
    make_scorer,
)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
import matplotlib.pyplot as plt

try:
    import joblib
except Exception:
    joblib = None

# =============================
# Config
# =============================
CSV_PATH = Path("./open/train.csv")
TARGET_COL = "support_needs"
ID_COL = "ID"
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_FOLDS = 5
OUTPUT_DIR = CSV_PATH.parent / "ml_outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)




# Columns expected (numeric/cat). We'll infer but can be overridden here.
NUM_HINT = [
    "age", "tenure", "frequent", "payment_interval",
    "contract_length", "after_interaction"
]
CAT_HINT = ["gender", "subscription_type"]

# =============================
# Utility
# =============================

def log(msg: str):
    print(msg)


def read_data(csv_path: str) -> pd.DataFrame:
    log(f">> Loading: {csv_path}")
    df = pd.read_csv(csv_path)
    return df


def split_xy(df: pd.DataFrame):
    assert TARGET_COL in df.columns, f"Target column '{TARGET_COL}' not found."
    y = df[TARGET_COL]  # keep labels as-is (can be strings)
    X = df.drop(columns=[TARGET_COL])
    return X, y


def infer_column_types(df: pd.DataFrame):
    cols = df.columns.tolist()
    cat_cols = []
    num_cols = []
    for c in cols:
        if c == TARGET_COL or c == ID_COL:
            continue
        if c in NUM_HINT:
            num_cols.append(c)
        elif c in CAT_HINT:
            cat_cols.append(c)
            
        else:
            if pd.api.types.is_numeric_dtype(df[c]):
                num_cols.append(c)
            else:
                sample = pd.to_numeric(df[c], errors="coerce")
                if sample.notna().mean() > 0.9:
                    num_cols.append(c)
                else:
                    cat_cols.append(c)
    return num_cols, cat_cols


def build_preprocessor(num_cols, cat_cols):
    num_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    cat_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    pre = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    return pre


def get_models(y_train: pd.Series):
    """Return dict of fixed-parameter multiclass models (no HPO)."""

    pos_rate = y_train.mean()
    neg_rate = 1 - pos_rate
    scale_pos_weight = (neg_rate / pos_rate) if pos_rate > 0 else 1.0
    models = {
        "logreg_multinomial": LogisticRegression(
            max_iter=1000,
            multi_class="multinomial",
            class_weight="balanced",
            solver="lbfgs",
            n_jobs=None if hasattr(LogisticRegression, 'n_jobs') else None,
        ),
        "rf": RandomForestClassifier(
            n_estimators=400,
            class_weight="balanced_subsample",
            random_state=RANDOM_STATE,
            n_jobs=-1,
        ),
        "gbdt": GradientBoostingClassifier(random_state=RANDOM_STATE),
        "hist_gbdt": HistGradientBoostingClassifier(
            random_state=RANDOM_STATE,
            max_depth=None,
        ),
    }

        # Optional: XGBoost
    try:
        import xgboost as xgb
        models["xgb"] = xgb.XGBClassifier(
            n_estimators=500,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=RANDOM_STATE,
            tree_method="hist",
            eval_metric="logloss",
            n_jobs=-1,
            scale_pos_weight=scale_pos_weight,
        )
    except Exception as e:
        log(e)
        pass

    # Optional: LightGBM
    try:
        import lightgbm as lgb
        models["lgbm"] = lgb.LGBMClassifier(
            n_estimators=800,
            num_leaves=63,
            learning_rate=0.03,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=RANDOM_STATE,
            class_weight="balanced",
            n_jobs=-1,
        )
    except Exception as e:
        log(e)
        pass
    
    return models


def build_scoring():
    scoring = {
        "accuracy": "accuracy",
        "balanced_accuracy": "balanced_accuracy",
        "f1_macro": "f1_macro",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "neg_log_loss": "neg_log_loss",
        "roc_auc_ovr_macro": make_scorer(roc_auc_score, needs_proba=True, multi_class="ovr", average="macro"),
        "top3_accuracy": make_scorer(top_k_accuracy_score, k=3),
    }
    return scoring


def cross_validate_models(pre, models: dict, X_train: pd.DataFrame, y_train: pd.Series, n_folds: int = 5) -> pd.DataFrame:
    rows = []
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scoring = build_scoring()
    for name, clf in models.items():
        pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
        scores = cross_validate(
            pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False
        )
        row = {"model": name}
        for k, v in scores.items():
            if k.startswith("test_"):
                m = k.replace("test_", "mean_")
                s = k.replace("test_", "std_")
                row[m] = np.mean(v)
                row[s] = np.std(v)
        rows.append(row)
    df = pd.DataFrame(rows)
    # Primary sort key: F1-macro, secondary: neg_log_loss (higher is better as it's neg), tertiary: roc_auc
    df = df.sort_values(["mean_f1_macro", "mean_neg_log_loss", "mean_roc_auc_ovr_macro"], ascending=[False, False, False])
    return df


def pick_best_model(cv_df: pd.DataFrame) -> str:
    best = cv_df.iloc[0]
    return best["model"]


def evaluate_on_test(pre, clf, X_train, y_train, X_test, y_test):
    pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)

    proba = None
    if hasattr(pipe, "predict_proba") or hasattr(pipe.named_steps["clf"], "predict_proba"):
        proba = pipe.predict_proba(X_test)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    bacc = balanced_accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average="macro")
    precm = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recm = recall_score(y_test, y_pred, average="macro")

    ll = None
    roc_macro = None
    if proba is not None:
        ll = log_loss(y_test, proba)
        try:
            roc_macro = roc_auc_score(y_test, proba, multi_class="ovr", average="macro")
        except Exception:
            roc_macro = None

    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)

    return {
        "pipeline": pipe,
        "proba": proba,
        "y_pred": y_pred,
        "accuracy": acc,
        "balanced_accuracy": bacc,
        "f1_macro": f1m,
        "precision_macro": precm,
        "recall_macro": recm,
        "log_loss": ll,
        "roc_auc_ovr_macro": roc_macro,
        "confusion_matrix": cm,
        "cls_report": report,
    }


def get_feature_names(pre, num_cols, cat_cols):
    names = []
    if len(num_cols) > 0:
        names.extend(num_cols)
    if len(cat_cols) > 0:
        ohe = pre.named_transformers_["cat"].named_steps["ohe"]
        try:
            ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
        except Exception:
            ohe_names = []
            for i, c in enumerate(cat_cols):
                cats = ohe.categories_[i]
                ohe_names.extend([f"{c}_{val}" for val in cats])
        names.extend(ohe_names)
    return names


def plot_confusion_matrix(cm, class_labels, outpath: Path):
    fig, ax = plt.subplots(figsize=(6, 5))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot(ax=ax, cmap="Blues", values_format='d', colorbar=True)
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    fig.savefig(outpath, dpi=140)
    plt.close(fig)


def plot_permutation_importance(pipe, X_test, y_test, feature_names, out_csv: Path, out_png: Path):
    """
    Compute permutation importance **after preprocessing** so the number of
    features matches the expanded (OHE) feature space. This avoids length
    mismatches between `feature_names` (from the fitted preprocessor) and the
    importances returned by sklearn.
    """
    pre = pipe.named_steps["pre"]
    clf = pipe.named_steps["clf"]

    # Transform input into model space (after ColumnTransformer + OHE + scaling)
    Xt = pre.transform(X_test)

    r = permutation_importance(
        clf, Xt, y_test,
        n_repeats=10,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        scoring="neg_log_loss",
    )

    # Safety: if lengths still mismatch, fall back to index names
    if len(feature_names) != r.importances_mean.shape[0]:
        feature_names = [f"f{i}" for i in range(r.importances_mean.shape[0])]

    imp = pd.DataFrame({
        "feature": feature_names,
        "importance_mean": r.importances_mean,
        "importance_std": r.importances_std,
    })
    imp.sort_values("importance_mean", ascending=False, inplace=True)
    imp.to_csv(out_csv, index=False)

    top = imp.head(20)
    fig, ax = plt.subplots(figsize=(8, 7))
    ax.barh(top["feature"][::-1], top["importance_mean"][::-1])
    ax.set_title("Permutation Importance (neg_log_loss)")
    ax.set_xlabel("Mean Importance")
    fig.tight_layout()
    fig.savefig(out_png, dpi=140)
    plt.close(fig)


def main():
    df = read_data(CSV_PATH)

    # Optional: keep ID aside
    id_series = df[ID_COL].astype(str) if ID_COL in df.columns else None

    X, y = split_xy(df)

    # Infer column types
    num_cols, cat_cols = infer_column_types(X)
    log(f"Numeric cols ({len(num_cols)}): {num_cols}")
    log(f"Categorical cols ({len(cat_cols)}): {cat_cols}")

    # Train/test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )

    # Preprocessor
    pre = build_preprocessor(num_cols, cat_cols)

    # Models
    models = get_models(y_train)
    log(f"Models considered: {list(models.keys())}")

    # Cross-validate models on train
    cv_df = cross_validate_models(pre, models, X_train, y_train, n_folds=N_FOLDS)
    cv_path = OUTPUT_DIR / "cv_results.csv"
    cv_df.to_csv(cv_path, index=False)
    log(f"Saved CV results -> {cv_path}")

    best_name = pick_best_model(cv_df)
    best_clf = models[best_name]
    with open(OUTPUT_DIR / "best_model.txt", "w", encoding="utf-8") as f:
        f.write(f"Best by F1-macro, neg_log_loss, ROC-AUC OVR macro: {best_name}")
        f.write(cv_df.head(1).to_string(index=False))
    log(f"Best model: {best_name}")

    # Final train and evaluate on test
    result = evaluate_on_test(pre, best_clf, X_train, y_train, X_test, y_test)

    # Save metrics
    metrics_path = OUTPUT_DIR / "test_metrics.txt"
    with open(metrics_path, "w", encoding="utf-8") as f:
        f.write(
            (
                f"Accuracy: {result['accuracy']:.4f}"
                f"Balanced Acc: {result['balanced_accuracy']:.4f}"
                f"F1-macro: {result['f1_macro']:.4f}"
                f"Precision-macro: {result['precision_macro']:.4f}"
                f"Recall-macro: {result['recall_macro']:.4f}"
            )
        )
        if result["log_loss"] is not None:
            f.write(f"LogLoss: {result['log_loss']:.4f}")
        if result["roc_auc_ovr_macro"] is not None:
            f.write(f"ROC-AUC (OVR macro): {result['roc_auc_ovr_macro']:.4f}")
        f.write("Classification Report:")
        f.write(result["cls_report"])    
    log(f"Saved test metrics -> {metrics_path}")

    # Save confusion matrix plot
    classes = None
    # get class labels from fitted estimator
    try:
        classes = result["pipeline"].named_steps["clf"].classes_
    except Exception:
        classes = np.unique(y)

    cm_path = OUTPUT_DIR / "confusion_matrix.png"
    plot_confusion_matrix(result["confusion_matrix"], classes, cm_path)
    log(f"Saved confusion matrix -> {cm_path}")

    # Save predictions (test), including per-class probabilities if available
    pred_path = OUTPUT_DIR / "predictions_test.csv"
    out_df = pd.DataFrame({
        ID_COL: X_test[ID_COL].astype(str) if (ID_COL in X_test.columns) else np.arange(len(X_test)),
        "y_true": y_test.values,
        "y_pred": result["y_pred"],
    })
    if result["proba"] is not None:
        for i, c in enumerate(classes):
            out_df[f"proba_{c}"] = result["proba"][:, i]
    out_df.to_csv(pred_path, index=False)
    log(f"Saved test predictions -> {pred_path}")

    # Permutation importance (uses neg_log_loss scorer)
    pipe = result["pipeline"]
    pre_fitted = pipe.named_steps["pre"]
    feature_names = get_feature_names(pre_fitted, num_cols, cat_cols)
    imp_csv = OUTPUT_DIR / "permutation_importance.csv"
    imp_png = OUTPUT_DIR / "permutation_importance.png"
    plot_permutation_importance(pipe, X_test, y_test, feature_names, imp_csv, imp_png)
    log(f"Saved permutation importance -> {imp_csv}, {imp_png}")

    # Save model
    if joblib is not None:
        model_path = OUTPUT_DIR / "support_needs_model.pkl"
        from joblib import dump
        dump(pipe, model_path)
        log(f"Saved pipeline -> {model_path}")
    else:
        log("joblib not available; skipping model serialization.")


if __name__ == "__main__":
    main()


>> Loading: open/train.csv
Numeric cols (6): ['age', 'tenure', 'frequent', 'payment_interval', 'contract_length', 'after_interaction']
Categorical cols (2): ['gender', 'subscription_type']
Models considered: ['logreg_multinomial', 'rf', 'gbdt', 'hist_gbdt', 'xgb', 'lgbm']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 218
[LightGBM] [Info] Total Bins 218
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory

In [None]:
import pandas as pd, json, joblib


file_path = OUTPUT_DIR / "support_needs_model.pkl"
pipe = joblib.load("ml_outputs/support_needs_model.pkl")

In [21]:
CSV_PATH.parent / "test.csv"


PosixPath('open/test.csv')

In [24]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

MODEL_PATH = OUTPUT_DIR / "support_needs_model.pkl"
TEST_CSV  = CSV_PATH.parent / "test.csv"
OUT_PATH  = OUTPUT_DIR / "predictions_real_test.csv"
ID_COL    = "ID"
TARGET_COL = "support_needs"  # 실제 테스트에 라벨이 없다면 자동으로 무시됨

# 1) 모델 로드
pipe = joblib.load(MODEL_PATH)

# 2) 테스트 로드
df_test = pd.read_csv(TEST_CSV)

# (옵션) 필요한 입력 컬럼 확인 & 누락 시 NaN으로 채워 전처리기에 맡기기
pre = pipe.named_steps["pre"]
required_cols = []
for name, trans, cols in pre.transformers:
    if name in ("num", "cat"):
        required_cols += list(cols)

missing = [c for c in required_cols if c not in df_test.columns]
for c in missing:
    df_test[c] = np.nan  # SimpleImputer가 처리

# 3) 예측
y_pred = pipe.predict(df_test)
proba = None
if hasattr(pipe.named_steps["clf"], "predict_proba"):
    proba = pipe.predict_proba(df_test)
classes = pipe.named_steps["clf"].classes_

# 4) 저장 (ID 포함)
out = pd.DataFrame({
    ID_COL: df_test[ID_COL].astype(str) if ID_COL in df_test.columns else np.arange(len(df_test)),
    "support_needs": y_pred
})
# if proba is not None:
#     for i, c in enumerate(classes):
#         out[f"proba_{c}"] = proba[:, i]

# (옵션) 테스트에 정답 라벨이 있으면 간단 평가도 출력
if TARGET_COL in df_test.columns:
    from sklearn.metrics import classification_report
    print(classification_report(df_test[TARGET_COL], y_pred, digits=4))

OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False)
print(f"Saved -> {OUT_PATH.resolve()}")


Saved -> /Users/jeongho/git/python-playground/customer_tier_classification/open/ml_outputs/predictions_real_test.csv
