In [1]:
import os
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold

# =========================
# Set ONCE
# =========================
# Root data directory. The default is the original studio location; set the
# SEPSIS_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell (an empty value falls back to the default).
DATA_DIR = Path(
    os.environ.get("SEPSIS_DATA_DIR")
    or "/teamspace/studios/this_studio/detecting_Sepsis/data"
)

# Point directly to the dataset variant you want to use:
MODE_DIR = DATA_DIR / "SignatureFeatureSets" / "rich_FE__orig1_hc1_sig1_csig1_lb7_so3_cso3_ll1"
#MODE_DIR = DATA_DIR / "Low_Preproc_NoFe_CSV"
#MODE_DIR = DATA_DIR / "raw_CSV"

# One run folder name (used under 4_Evaluation/Predictions/)
RUN_NAME = "LIGHTGBM_Signatures"   # change once per experiment

# =========================
# Conventions
# =========================
# Column names shared by every dataset variant; downstream cells should refer
# to these constants rather than repeating the literals.
PATIENT_COL = "Patient_ID"
TIME_COL    = "ICULOS"
LABEL_COL   = "SepsisLabel"

# =========================
# Resolve dataset files from MODE_DIR (AUTO)
# =========================
def resolve_dataset_paths(mode_dir: Path):
    """Find the (train_fit, train_thresh, test) CSV triple inside ``mode_dir``.

    Several file-naming schemes are tried in priority order; the first scheme
    for which all three files exist is returned.

    Parameters
    ----------
    mode_dir : Path
        Directory holding one dataset variant.

    Returns
    -------
    tuple of Path
        ``(train_fit_path, train_thresh_path, test_path)``.

    Raises
    ------
    FileNotFoundError
        If ``mode_dir`` does not exist, or if no naming scheme has all three
        files present (the message lists the CSV files that were found).
    """
    if not mode_dir.exists():
        raise FileNotFoundError(f"MODE_DIR does not exist: {mode_dir}")

    # Each suffix expands to train_fit<suffix>.csv, train_thresh<suffix>.csv
    # and test<suffix>.csv. Order matters: earlier entries win.
    scheme_suffixes = (
        "_HIGH_PREPROC_NO_FE_with_signatures",  # signature-augmented, preferred
        "_with_signatures",                     # signature fallback
        "_HIGH_PREPROC_NO_FE",                  # HIGH
        "_LOW_PREPROC_NO_FE",                   # LOW
        "",                                     # RAW
    )
    splits = ("train_fit", "train_thresh", "test")

    for suffix in scheme_suffixes:
        triple = tuple(mode_dir / f"{split}{suffix}.csv" for split in splits)
        if all(path.exists() for path in triple):
            return triple

    # Nothing matched: report the accepted schemes and what is actually there.
    csv_names = sorted(entry.name for entry in mode_dir.glob("*.csv"))
    message_lines = [
        f"Could not resolve dataset files in MODE_DIR={mode_dir}",
        "Expected one of these naming schemes:",
        "  SIG : train_fit_HIGH_PREPROC_NO_FE_with_signatures.csv + train_thresh_HIGH_PREPROC_NO_FE_with_signatures.csv + test_HIGH_PREPROC_NO_FE_with_signatures.csv",
        "  SIG2: train_fit_with_signatures.csv + train_thresh_with_signatures.csv + test_with_signatures.csv",
        "  HIGH: train_fit_HIGH_PREPROC_NO_FE.csv + train_thresh_HIGH_PREPROC_NO_FE.csv + test_HIGH_PREPROC_NO_FE.csv",
        "  LOW : train_fit_LOW_PREPROC_NO_FE.csv  + train_thresh_LOW_PREPROC_NO_FE.csv  + test_LOW_PREPROC_NO_FE.csv",
        "  RAW : train_fit.csv + train_thresh.csv + test.csv",
        "Existing CSV files in MODE_DIR:",
        f"  {csv_names}",
    ]
    raise FileNotFoundError("\n".join(message_lines))

# Resolve the three dataset CSVs for the dataset variant selected in MODE_DIR.
TRAIN_FIT_PATH, TRAIN_THRESH_PATH, TEST_PATH = resolve_dataset_paths(MODE_DIR)

# =========================
# Prediction / model output dirs
# =========================
# Everything produced by this run lives under <DATA_DIR parent>/4_Evaluation/Predictions/<RUN_NAME>.
PRED_ROOT = DATA_DIR.parent.joinpath("4_Evaluation", "Predictions", RUN_NAME)
MODEL_DIR = PRED_ROOT.joinpath("models")
PRED_DIR_THRESH_WORK = PRED_ROOT.joinpath("pred_psv_TRAIN_THRESH_WORK")
PRED_DIR_TEST_WORK = PRED_ROOT.joinpath("pred_psv_TEST_WORK")

# Create the output folders up front so later cells can write into them.
for d in (MODEL_DIR, PRED_DIR_THRESH_WORK, PRED_DIR_TEST_WORK):
    d.mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration for the run log.
for label, value in (
    ("MODE_DIR:", MODE_DIR),
    ("TRAIN_FIT:", TRAIN_FIT_PATH),
    ("TRAIN_THRESH:", TRAIN_THRESH_PATH),
    ("TEST:", TEST_PATH),
    ("PRED_ROOT:", PRED_ROOT),
):
    print(label, value)



MODE_DIR: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV
TRAIN_FIT: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV/train_fit_HIGH_PREPROC_NO_FE.csv
TRAIN_THRESH: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV/train_thresh_HIGH_PREPROC_NO_FE.csv
TEST: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV/test_HIGH_PREPROC_NO_FE.csv
PRED_ROOT: /teamspace/studios/this_studio/detecting_Sepsis/4_Evaluation/Predictions/LIGHTGBM_HighPreproc_NoFe


In [2]:
def load_preproc_csv(path: Path) -> pd.DataFrame:
    """Read a preprocessed CSV and strip accidental index columns.

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The file contents without any column whose name starts with
        ``Unnamed`` (the name pandas gives to a header-less index column).

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist; the path itself is the exception argument.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    raw = pd.read_csv(path)
    # str.match anchors at the start of the name, i.e. "starts with Unnamed".
    is_index_artifact = raw.columns.str.match(r"Unnamed")
    return raw.loc[:, ~is_index_artifact]

train_fit = load_preproc_csv(TRAIN_FIT_PATH)
train_thresh = load_preproc_csv(TRAIN_THRESH_PATH)   # keep for threshold evaluation later
test_df = load_preproc_csv(TEST_PATH)

# basic checks: every split needs the patient id and time columns
for df_name, df in [("train_fit", train_fit), ("train_thresh", train_thresh), ("test", test_df)]:
    for c in [PATIENT_COL, TIME_COL]:
        if c not in df.columns:
            raise ValueError(f"{df_name} missing required column: {c}")

# Only the training split must carry the label; the writer cell drops it from
# the other splits if present.
if LABEL_COL not in train_fit.columns:
    raise ValueError(f"train_fit must contain {LABEL_COL}")

# v1-style sort (only use columns that exist). Uses the shared column
# constants so a change to TIME_COL is honoured here too; dict.fromkeys drops
# a duplicate while keeping order in case TIME_COL is ever set to "Hour".
sort_cols = [
    c for c in dict.fromkeys([PATIENT_COL, TIME_COL, "Hour"])
    if c in train_fit.columns
]
train_fit = train_fit.sort_values(sort_cols).reset_index(drop=True)

# -----------------------------
# v1 feature definition:
#   drop ONLY Patient_ID and SepsisLabel
#   KEEP ICULOS / Hour if they exist
# -----------------------------
drop_cols = {PATIENT_COL, LABEL_COL}
feature_cols = [c for c in train_fit.columns if c not in drop_cols]

# Fail fast: the inference splits must provide every training feature.
# Without this the mismatch only surfaces after the (long) training cells,
# when predictions are written.
for df_name, df in [("train_thresh", train_thresh), ("test", test_df)]:
    absent = [c for c in feature_cols if c not in df.columns]
    if absent:
        raise ValueError(
            f"{df_name} is missing {len(absent)} feature column(s) present in train_fit. "
            f"Example: {absent[:10]}"
        )

print("train_fit shape:", train_fit.shape)
print("train_thresh shape:", train_thresh.shape)
print("test shape:", test_df.shape)
print("n_features:", len(feature_cols))
print("First 15 features:", feature_cols[:15])


train_fit shape: (1180166, 78)
train_thresh shape: (61120, 78)
test shape: (310924, 78)
n_features: 76
First 15 features: ['Hour', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2']


In [3]:
# Patient-grouped, stratified 5-fold cross-validation on train_fit.
# Purpose: estimate row-level AUROC/AUPRC and collect the early-stopped tree
# count of each fold (best_iters), which the next cell uses to size the final model.

# patient-level stratification label (v1): patient ever septic
patient_y = train_fit.groupby(PATIENT_COL)[LABEL_COL].max().astype(int)
patient_ids = patient_y.index.to_numpy()
patient_labels = patient_y.values

# Folds are drawn over patients, not rows, so all rows of one patient land in
# the same fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_iters = []
fold_metrics = []  # (auroc, auprc)

for fold, (tr_idx, va_idx) in enumerate(skf.split(patient_ids, patient_labels), 1):
    tr_pids = set(patient_ids[tr_idx])
    va_pids = set(patient_ids[va_idx])

    # Map the patient split back to row-level frames.
    tr = train_fit[train_fit[PATIENT_COL].isin(tr_pids)]
    va = train_fit[train_fit[PATIENT_COL].isin(va_pids)]

    X_tr = tr[feature_cols]
    y_tr = tr[LABEL_COL].astype(int).to_numpy()

    X_va = va[feature_cols]
    y_va = va[LABEL_COL].astype(int).to_numpy()

    # v1-style fold scale_pos_weight (row-level)
    # neg/pos ratio of this fold's training rows; max(pos, 1) avoids division by zero.
    pos = int((y_tr == 1).sum())
    neg = int((y_tr == 0).sum())
    spw = neg / max(pos, 1)

    # Hyperparameters are repeated in the final-model cell; keep the two in sync.
    # NOTE(review): subsample_freq is not set. Per the LightGBM docs, row
    # subsampling only takes effect when the bagging frequency is non-zero, so
    # subsample=0.8 is presumably inert here — confirm, and change it in the
    # final-model cell at the same time if so.
    model = lgb.LGBMClassifier(
        objective="binary",
        metric="auc",
        n_estimators=4000,
        learning_rate=0.03,
        num_leaves=64,
        min_child_samples=50,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        n_jobs=-1,
        random_state=42,
        scale_pos_weight=spw,
    )

    # n_estimators=4000 is only an upper bound: training stops once validation
    # AUC has not improved for 200 rounds; progress is logged every 100 rounds.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)]
    )

    # Fall back to the fitted tree count when best_iteration_ is unset/zero.
    bi = int(model.best_iteration_ or model.n_estimators_)
    best_iters.append(bi)

    # Row-level validation metrics for this fold.
    p_va = model.predict_proba(X_va)[:, 1]
    auroc = roc_auc_score(y_va, p_va)
    auprc = average_precision_score(y_va, p_va)
    fold_metrics.append((auroc, auprc))

    print(f"Fold {fold}: best_iter={bi}  pos={pos} neg={neg} spw={spw:.2f}  AUROC={auroc:.4f}  AUPRC={auprc:.4f}")

best_iters = np.array(best_iters, dtype=int)
m = np.array(fold_metrics)  # one row per fold; column 0 = AUROC, column 1 = AUPRC

# The ± values use numpy's default std (ddof=0) across the folds.
print("\nFold best_iterations:", best_iters.tolist())
print(f"Mean AUROC={m[:,0].mean():.4f} ± {m[:,0].std():.4f}")
print(f"Mean AUPRC={m[:,1].mean():.4f} ± {m[:,1].std():.4f}")


[LightGBM] [Info] Number of positive: 17004, number of negative: 927794
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11530
[LightGBM] [Info] Number of data points in the train set: 944798, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017997 -> initscore=-3.999361
[LightGBM] [Info] Start training from score -3.999361
Training until validation scores don't improve for 200 rounds
[100]	valid_0's auc: 0.825569
[200]	valid_0's auc: 0.827437
[300]	valid_0's auc: 0.826072
Early stopping, best iteration is:
[177]	valid_0's auc: 0.827729
Fold 1: best_iter=177  pos=17004 neg=927794 spw=54.56  AUROC=0.8277  AUPRC=0.0944
[LightGBM] [Info] Number of positive: 16985, number of negative: 927378
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhe

In [4]:
# Train the final model on all of train_fit with a fixed number of trees.
# There is no validation set here, so the tree count comes from the CV cell.

# Robust automatic final #trees: median of fold best_iterations
BEST_N = int(np.median(best_iters))
print("Auto BEST_N (median of folds):", BEST_N)

X_full = train_fit[feature_cols]
y_full = train_fit[LABEL_COL].astype(int).to_numpy()

# v1-style full-data scale_pos_weight
# Same neg/pos row ratio as in the CV folds, computed over the full training split.
pos = int((y_full == 1).sum())
neg = int((y_full == 0).sum())
scale_pos_weight = neg / max(pos, 1)

# Hyperparameters mirror the CV cell except n_estimators (fixed to BEST_N)
# and force_row_wise, which is only set here.
# NOTE(review): fit() below gets no eval_set, so metric="auc" is presumably unused — confirm.
# NOTE(review): subsample_freq is not set; per the LightGBM docs subsample=0.8
# is presumably inert — confirm, and keep in sync with the CV cell.
final_model = lgb.LGBMClassifier(
    objective="binary",
    metric="auc",
    n_estimators=BEST_N,
    learning_rate=0.03,
    num_leaves=64,
    min_child_samples=50,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    force_row_wise=True,
)

final_model.fit(X_full, y_full)

# IMPORTANT: keep name `booster` for your existing v2 prediction-writing cell
booster = final_model.booster_

print("Trained final model.")
print("Full train pos:", pos, "neg:", neg, "scale_pos_weight:", scale_pos_weight)
# Prints the requested tree count (BEST_N), not a count read back from the booster.
print("Final model trees:", BEST_N)


Auto BEST_N (median of folds): 177
[LightGBM] [Info] Number of positive: 21247, number of negative: 1158919
[LightGBM] [Info] Total Bins 11541
[LightGBM] [Info] Number of data points in the train set: 1180166, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018003 -> initscore=-3.999027
[LightGBM] [Info] Start training from score -3.999027
Trained final model.
Full train pos: 21247 neg: 1158919 scale_pos_weight: 54.54506518567327
Final model trees: 177


In [5]:
# Persist the final booster and the exact feature order it was trained with.
# File names carry the dataset-variant folder name and the tree count so that
# artefacts from different runs do not overwrite each other.
MODE_TAG = MODE_DIR.name

MODEL_PATH = MODEL_DIR / f"lgbm_model_{MODE_TAG}_BESTN_{BEST_N}.txt"
FEATURES_PATH = MODEL_DIR / f"lgbm_feature_names_{MODE_TAG}_BESTN_{BEST_N}.txt"

booster.save_model(str(MODEL_PATH), num_iteration=BEST_N)

# One feature name per line. The encoding is pinned to UTF-8 so the file is
# identical regardless of the platform's default locale encoding.
with open(FEATURES_PATH, "w", encoding="utf-8") as f:
    for c in feature_cols:
        f.write(c + "\n")

print("Saved model:", MODEL_PATH)
print("Saved features:", FEATURES_PATH)


Saved model: /teamspace/studios/this_studio/detecting_Sepsis/4_Evaluation/Predictions/LIGHTGBM_HighPreproc_NoFe/models/lgbm_model_High_Preproc_NoFe_CSV_BESTN_177.txt
Saved features: /teamspace/studios/this_studio/detecting_Sepsis/4_Evaluation/Predictions/LIGHTGBM_HighPreproc_NoFe/models/lgbm_feature_names_High_Preproc_NoFe_CSV_BESTN_177.txt


In [6]:
def ensure_empty_dir(dir_path: Path):
    """Create ``dir_path`` if needed and delete the ``*.psv`` files directly inside it.

    Only top-level files matching ``*.psv`` are removed; other files and
    sub-directories are left alone.
    """
    dir_path.mkdir(parents=True, exist_ok=True)
    stale_predictions = list(dir_path.glob("*.psv"))
    for psv_file in stale_predictions:
        psv_file.unlink()

def write_prob_file(out_path: Path, prob: np.ndarray):
    """Write ``prob`` as a one-column, pipe-separated file.

    The file has a ``PredictedProbability`` header followed by one float per
    line; no index column is written.
    """
    as_float = prob.astype(float)
    table = pd.DataFrame({"PredictedProbability": as_float})
    table.to_csv(out_path, sep="|", index=False)

def write_psv_predictions_from_preproc_csv(df: pd.DataFrame, out_dir: Path, booster: lgb.Booster, num_iteration=None):
    """Score ``df`` with ``booster`` and write one probability file per patient.

    ``out_dir`` is created if needed and cleared of existing ``*.psv`` files.
    Rows are ordered by patient and time, scored in a single ``predict`` call,
    and split into ``p<patient id, zero-padded to 6 digits>.psv`` files.

    Parameters
    ----------
    df : pd.DataFrame
        Preprocessed rows. Must contain the patient column, the time column
        and every feature the booster was trained on. A label column, if
        present, is ignored.
    out_dir : Path
        Destination directory for the per-patient files.
    booster : lgb.Booster
        Trained model used for scoring.
    num_iteration : int or None, optional
        Number of trees to use. ``None`` (default) lets LightGBM decide: the
        booster's best iteration if it has one, otherwise all of its trees.
        This replaces the previous implicit read of the notebook global
        ``BEST_N``; for a booster trained with exactly ``BEST_N`` trees the
        result is the same.

    Raises
    ------
    ValueError
        If ``df`` lacks any of the booster's features.
    """
    ensure_empty_dir(out_dir)

    df = df.drop(columns=[LABEL_COL], errors="ignore")
    # Sorting once here fixes the within-patient row order for the whole
    # function; groupby below preserves it.
    df = df.sort_values([PATIENT_COL, TIME_COL]).reset_index(drop=True)

    model_features = booster.feature_name()
    missing = [c for c in model_features if c not in df.columns]
    if missing:
        raise ValueError(f"Missing {len(missing)} model features in inference df. Example: {missing[:10]}")

    n_files = 0
    if not df.empty:  # nothing to score for an empty split
        # One batched predict call instead of one call per patient; rows are
        # scored independently, so the per-patient values are unchanged.
        scores = booster.predict(df[model_features], num_iteration=num_iteration)
        prob_by_row = pd.Series(np.asarray(scores, dtype=float), index=df.index)

        for pid, patient_prob in prob_by_row.groupby(df[PATIENT_COL], sort=False):
            out_path = out_dir / f"p{int(pid):06d}.psv"
            write_prob_file(out_path, patient_prob.to_numpy())
            n_files += 1

    print(f"Wrote {n_files} PSV files -> {out_dir}")

# Write per-patient probability files for both inference splits.
for split_df, split_out_dir in (
    (train_thresh, PRED_DIR_THRESH_WORK),  # train_thresh predictions (for threshold sweep)
    (test_df, PRED_DIR_TEST_WORK),         # test predictions
):
    write_psv_predictions_from_preproc_csv(split_df, split_out_dir, booster)


Wrote 1614 PSV files -> /teamspace/studios/this_studio/detecting_Sepsis/4_Evaluation/Predictions/LIGHTGBM_HighPreproc_NoFe/pred_psv_TRAIN_THRESH_WORK
Wrote 8068 PSV files -> /teamspace/studios/this_studio/detecting_Sepsis/4_Evaluation/Predictions/LIGHTGBM_HighPreproc_NoFe/pred_psv_TEST_WORK


In [7]:
# Recap of what this run produced and where it was written.
run_summary = (
    ("RUN_NAME:", RUN_NAME),
    ("Model:", MODEL_PATH.name),
    ("Features:", FEATURES_PATH.name),
    ("Thresh work:", PRED_DIR_THRESH_WORK),
    ("Test work:", PRED_DIR_TEST_WORK),
)
for label, value in run_summary:
    print(label, value)

# Pointer to the follow-up notebook.
print(
    "\nNext: run Threshold_Sweep_And_Official_Eval.ipynb",
    "Point it to these WORK folders to compute official utility + final thresholded folders.",
    sep="\n",
)


RUN_NAME: LIGHTGBM_HighPreproc_NoFe
Model: lgbm_model_High_Preproc_NoFe_CSV_BESTN_177.txt
Features: lgbm_feature_names_High_Preproc_NoFe_CSV_BESTN_177.txt
Thresh work: /teamspace/studios/this_studio/detecting_Sepsis/4_Evaluation/Predictions/LIGHTGBM_HighPreproc_NoFe/pred_psv_TRAIN_THRESH_WORK
Test work: /teamspace/studios/this_studio/detecting_Sepsis/4_Evaluation/Predictions/LIGHTGBM_HighPreproc_NoFe/pred_psv_TEST_WORK

Next: run Threshold_Sweep_And_Official_Eval.ipynb
Point it to these WORK folders to compute official utility + final thresholded folders.
