# 5-Fold CV Baseline (Unplanned Hospitalization 6mo)

- Dataset: `anhui_ltcf_dataset.xlsx` (variable types are read from the codebook `anhui_ltcf_routine_codebook.xlsx`)
- Target: `unplanned_hospitalization_6mo`
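- Features: **all columns except target**, after dropping identifier columns flagged in the codebook (e.g. `resident_id`) and `facility_id` if present; only columns listed in the codebook are passed to the models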
- Models (5): Logistic Regression, SVM (RBF), Decision Tree, Random Forest, Gradient Boosting
- Metrics: Accuracy, Precision, Recall, F1, ROC-AUC (mean ± std across 5 folds)
- Threshold for class prediction: **0.5** (no DCA / no threshold tuning)

> If your Excel files (dataset and codebook) are not in the same folder as this notebook, change `DATA_PATH` and `CODEBOOK_PATH` below.


**Note:** This notebook is an illustrative baseline workflow. It avoids displaying row-level sample records. Please consult the codebook for authoritative variable definitions.

In [None]:
# --- Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve
)

# Single seed passed to the CV splitter and to every model below so that
# repeated runs of the notebook produce the same folds and the same fits.
RANDOM_STATE = 42
# Also seed NumPy's global RNG for any code that draws from `np.random`
# without being given an explicit `random_state`.
np.random.seed(RANDOM_STATE)


In [None]:
# --- Load data (dataset + codebook)
# Reads the dataset and its codebook, separates the outcome (y) from the
# candidate features (X), and removes identifier columns from X so that they
# cannot be used as predictors.
DATA_PATH = "anhui_ltcf_dataset.xlsx"               # dataset file in the repository package
CODEBOOK_PATH = "anhui_ltcf_routine_codebook.xlsx"  # codebook file in the repository package

df = pd.read_excel(DATA_PATH)
codebook = pd.read_excel(CODEBOOK_PATH)

print("Dataset shape (rows, cols):", df.shape)
print("Columns:", df.columns.tolist())

TARGET_COL = "unplanned_hospitalization_6mo"

# Validate inputs with explicit exceptions: `assert` statements are removed
# when Python runs with -O, which would let a bad file through silently.
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column not found: {TARGET_COL}")

missing_codebook_cols = [c for c in ("Dataset column", "Type") if c not in codebook.columns]
if missing_codebook_cols:
    raise ValueError(f"Codebook is missing required columns: {missing_codebook_cols}")

# A missing outcome cannot be cast to int; fail with a message that names the
# problem instead of surfacing pandas' generic casting error.
n_missing_target = int(df[TARGET_COL].isna().sum())
if n_missing_target:
    raise ValueError(
        f"{n_missing_target} row(s) have a missing value in {TARGET_COL}; "
        "remove or resolve them before running the baseline."
    )

# Separate y and X
y = df[TARGET_COL].astype(int).copy()
X = df.drop(columns=[TARGET_COL]).copy()

# Drop identifier columns based on the codebook (prevents leakage).
# The type label is stripped as well as lower-cased so that an entry such as
# "Identifier " is still recognised; the preprocessing cell normalises the
# same codebook field the same way.
is_identifier = codebook["Type"].astype(str).str.strip().str.lower().eq("identifier")
id_cols = [c for c in codebook.loc[is_identifier, "Dataset column"].tolist() if c in X.columns]
if id_cols:
    X = X.drop(columns=id_cols)
    print("Dropped identifier columns:", id_cols)

# (Recommended) Also drop facility_id if present, even if not flagged (safety check)
if "facility_id" in X.columns:
    X = X.drop(columns=["facility_id"])
    print("Dropped facility_id (identifier)")

print("X shape (after drops):", X.shape)
print("Outcome balance (0/1):")
print(y.value_counts().sort_index())


In [None]:
# --- Preprocessing: codebook-driven numeric/categorical pipelines
# Use the codebook to determine which features are continuous vs categorical/ordinal.
# In this dataset, most variables are integer-coded categorical/ordinal; only a small subset are continuous.

# One type label per dataset column. Duplicate codebook rows are collapsed to
# the first occurrence so that `codebook_types[col]` is always a single string
# (with duplicates it would be a Series and the string handling below would fail).
codebook_types = (
    codebook.drop_duplicates(subset="Dataset column", keep="first")
    .set_index("Dataset column")["Type"]
    .astype(str)
)

# Columns present in X (after dropping identifiers) that the codebook describes
cols_in_X = [c for c in X.columns if c in codebook_types.index]

# Columns the codebook does not describe are not routed to any transformer and
# are therefore removed by `remainder="drop"` below. Report them so that the
# exclusion is visible instead of silent.
unlisted_cols = [c for c in X.columns if c not in codebook_types.index]
if unlisted_cols:
    print(f"Columns not listed in the codebook (excluded from the models): {unlisted_cols}")

numeric_features = []
categorical_features = []
other_features = []
for col in cols_in_X:
    type_label = codebook_types[col].strip().lower()
    if type_label == "numeric (continuous)":
        numeric_features.append(col)
    elif type_label.startswith("categorical"):
        categorical_features.append(col)
    else:
        # Unexpected type label: handled as categorical to avoid scaling codes
        other_features.append(col)

categorical_features = categorical_features + other_features

print(f"Numeric continuous features: {len(numeric_features)} -> {numeric_features}")
print(f"Categorical/ordinal features: {len(categorical_features)}")

# Continuous variables: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Coded variables: fill gaps with the most frequent code, then one-hot encode.
# handle_unknown="ignore" keeps prediction from failing on a code that was
# absent from the training fold.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop"
)


In [None]:
# --- Define 5 models (baseline hyperparams; no grid search)
# Every estimator receives RANDOM_STATE so that repeated runs give the same fits.
models = {
    "LogReg": LogisticRegression(max_iter=2000, solver="lbfgs", random_state=RANDOM_STATE),
    # probability=True is required because the CV loop calls predict_proba
    "SVM(RBF)": SVC(probability=True, kernel="rbf", C=1.0, gamma="scale", random_state=RANDOM_STATE),
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(
        n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1
    ),
    "GBDT": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Each pipeline gets its own unfitted copy of the preprocessor. Passing the
# same `preprocess` object to all five would make them share one
# ColumnTransformer instance, so fitting any pipeline would overwrite the
# fitted preprocessing state seen by the others.
pipelines = {
    name: Pipeline(steps=[("preprocess", clone(preprocess)), ("model", mdl)])
    for name, mdl in models.items()
}


In [None]:
# --- 5-fold Stratified CV evaluation
# For every pipeline: fit on four folds, score the held-out fold, and keep the
# held-out probabilities so that each row ends up with one out-of-fold (OOF)
# prediction per model. Produces `summary` (mean and sample std of each metric
# across folds) and `oof_pred_proba` (model name -> OOF probability array).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# The split depends only on (X, y, cv), not on the model, so compute it once
# and reuse it. Every model is then evaluated on exactly the same folds.
folds = list(cv.split(X, y))

# Per-fold metric name -> column prefix used in the summary table.
metric_prefixes = {
    "accuracy": "acc",
    "precision": "prec",
    "recall": "rec",
    "f1": "f1",
    "roc_auc": "auc",
}

summary_rows = []
# NaN-initialised so that any row never assigned a prediction stays detectable.
oof_pred_proba = {name: np.full(len(y), np.nan, dtype=float) for name in pipelines}

for name, pipe in pipelines.items():
    fold_metrics = []
    for fold, (tr_idx, te_idx) in enumerate(folds, start=1):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]

        # Preprocessing is part of the pipeline, so it is re-fitted on the
        # training portion of each fold only.
        pipe.fit(X_tr, y_tr)

        # Probability of the second class (column 1), used for ROC-AUC
        proba = pipe.predict_proba(X_te)[:, 1]
        oof_pred_proba[name][te_idx] = proba

        # Class prediction with fixed 0.5 threshold
        y_pred = (proba >= 0.5).astype(int)

        fold_metrics.append({
            "fold": fold,
            "accuracy": accuracy_score(y_te, y_pred),
            # zero_division=0: score 0 instead of warning when a fold has no
            # positive predictions (or no positive labels).
            "precision": precision_score(y_te, y_pred, zero_division=0),
            "recall": recall_score(y_te, y_pred, zero_division=0),
            "f1": f1_score(y_te, y_pred, zero_division=0),
            "roc_auc": roc_auc_score(y_te, proba),
        })

    fold_df = pd.DataFrame(fold_metrics)
    row = {"model": name}
    for metric, prefix in metric_prefixes.items():
        row[f"{prefix}_mean"] = fold_df[metric].mean()
        row[f"{prefix}_std"] = fold_df[metric].std(ddof=1)  # sample std across folds
    summary_rows.append(row)

# Best mean ROC-AUC first
summary = pd.DataFrame(summary_rows).sort_values("auc_mean", ascending=False)
summary


In [None]:
# --- Pretty print mean ± std
def fmt(m, s):
    """Return ``m`` and ``s`` as one "mean ± std" string with three decimals each."""
    mean_text = f"{m:.3f}"
    std_text = f"{s:.3f}"
    return f"{mean_text} ± {std_text}"

# Build the display table: one "mean ± std" string per metric, one row per
# model, in the same row order as `summary`.
pretty = summary.copy()

# Display column label -> prefix of the matching *_mean / *_std columns
display_columns = {
    "Accuracy": "acc",
    "Precision": "prec",
    "Recall": "rec",
    "F1": "f1",
    "ROC-AUC": "auc",
}
for label, prefix in display_columns.items():
    means = pretty[f"{prefix}_mean"]
    stds = pretty[f"{prefix}_std"]
    pretty[label] = [fmt(mean, std) for mean, std in zip(means, stds)]

# Keep only the model name and the formatted columns
pretty = pretty[["model", *display_columns]]
pretty


In [None]:
# --- ROC curves (OOF probabilities)
# One curve per model, drawn from the out-of-fold probabilities of all folds
# taken together; the AUC in each legend entry is computed on that same set.
fig, ax = plt.subplots(figsize=(7, 6))

for model_name, scores in oof_pred_proba.items():
    # Ignore any row that has no out-of-fold prediction (still NaN)
    has_pred = ~np.isnan(scores)
    y_obs = y[has_pred]
    y_score = scores[has_pred]

    fpr, tpr, _ = roc_curve(y_obs, y_score)
    oof_auc = roc_auc_score(y_obs, y_score)
    ax.plot(fpr, tpr, label=f"{model_name} (AUC={oof_auc:.3f})")

# Diagonal reference line
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("5-Fold OOF ROC Curves (All data mixed)")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# --- Save results to CSV (optional)
# Writes the formatted "mean ± std" table (`pretty`) without the row index.
OUT_CSV = "cv5_summary_unplanned_hospitalization.csv"
# "utf-8-sig" prepends a byte-order mark to the file; presumably chosen so that
# spreadsheet tools detect UTF-8 and show the "±" character correctly.
pretty.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("Saved:", OUT_CSV)
