In [4]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [5]:
# Project root. Defaults to the studio path this notebook was developed on;
# set the SEPSIS_BASE_DIR environment variable to run it from another location
# without editing the notebook.
BASE_DIR = Path(
    os.environ.get("SEPSIS_BASE_DIR", "/teamspace/studios/this_studio/detecting_Sepsis")
)

# Input and output folders, all under BASE_DIR / "data"
DATA_DIR = BASE_DIR / "data"
RAW_CSV_DIR  = DATA_DIR / "raw_CSV"
LOW_DIR      = DATA_DIR / "Low_Preproc_NoFe_CSV"
HIGH_DIR     = DATA_DIR / "High_Preproc_NoFe_CSV"

# Create the folders up front so later cells can write into them.
for d in [RAW_CSV_DIR, LOW_DIR, HIGH_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Raw inputs (one CSV per split)
TRAIN_FIT_CSV    = RAW_CSV_DIR / "train_fit.csv"
TRAIN_THRESH_CSV = RAW_CSV_DIR / "train_thresh.csv"
TEST_CSV         = RAW_CSV_DIR / "test.csv"

# Column conventions: these three columns are never treated as features
PATIENT_COL = "Patient_ID"
TIME_COL    = "ICULOS"
LABEL_COL   = "SepsisLabel"

# HIGH recency settings (match your loader).
# RECENCY_DECAY is the per-row decay factor handed to make_high_preproc;
# columns listed in NO_RECENCY_COLS get no recency_* column.
RECENCY_DECAY = 0.9
NO_RECENCY_COLS = {"Age","Gender","Unit1","Unit2","HospAdmTime","ICULOS"}

print("TRAIN_FIT:", TRAIN_FIT_CSV)
print("TRAIN_THRESH:", TRAIN_THRESH_CSV)
print("TEST:", TEST_CSV)
print("LOW_OUT_DIR:", LOW_DIR)
print("HIGH_OUT_DIR:", HIGH_DIR)

TRAIN_FIT: /teamspace/studios/this_studio/detecting_Sepsis/data/raw_CSV/train_fit.csv
TRAIN_THRESH: /teamspace/studios/this_studio/detecting_Sepsis/data/raw_CSV/train_thresh.csv
TEST: /teamspace/studios/this_studio/detecting_Sepsis/data/raw_CSV/test.csv
LOW_OUT_DIR: /teamspace/studios/this_studio/detecting_Sepsis/data/Low_Preproc_NoFe_CSV
HIGH_OUT_DIR: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV


In [5]:
def load_raw_csv(path: Path) -> pd.DataFrame:
    """Read a CSV and discard every column whose name begins with "Unnamed".

    These are accidental index columns such as ``Unnamed: 0``.
    """
    frame = pd.read_csv(path)
    is_index_artifact = frame.columns.str.startswith("Unnamed")
    return frame.loc[:, ~is_index_artifact]

def get_feature_cols(df: pd.DataFrame) -> list[str]:
    """List the feature columns of ``df`` in their original order.

    A column counts as a feature unless it is the patient, time or label column.
    """
    reserved = {PATIENT_COL, TIME_COL, LABEL_COL}
    features = []
    for col in df.columns:
        if col in reserved:
            continue
        features.append(col)
    return features

def recency_from_missing(missing_sub: np.ndarray, decay: float = 0.9) -> np.ndarray:
    """Compute a per-feature recency score for one time-ordered sequence.

    Args:
        missing_sub: bool array of shape [T, F]; True means the feature is
            missing at that time step.
        decay: factor applied to the previous step's recency when the feature
            is not observed.

    Returns:
        float32 array of shape [T, F]. A feature scores 1.0 on every step where
        it is observed, otherwise ``decay`` times its previous score; it stays
        at 0.0 until its first observation.
    """
    n_steps, n_feats = missing_sub.shape
    recency = np.zeros((n_steps, n_feats), dtype=np.float32)
    for step, missing_row in enumerate(missing_sub):
        if step > 0:
            # carry the previous scores forward, attenuated
            recency[step] = recency[step - 1] * decay
        # observed features are reset to full recency
        recency[step, ~missing_row] = 1.0
    return recency

def make_low_preproc(df: pd.DataFrame, feature_cols: list[str]) -> pd.DataFrame:
    """Build the LOW-preprocessed frame; the input frame is left untouched.

    Rows are ordered by (patient, time) and re-indexed from 0. Feature columns
    are coerced to numeric (unparseable values become NaN), forward-filled
    within each patient, and any value still missing is set to 0.0.
    """
    ordered = df.copy().sort_values([PATIENT_COL, TIME_COL]).reset_index(drop=True)

    numeric_features = ordered[feature_cols].apply(pd.to_numeric, errors="coerce")
    ordered[feature_cols] = numeric_features

    # forward-fill never crosses a patient boundary
    carried = ordered.groupby(PATIENT_COL, sort=False)[feature_cols].ffill()
    ordered[feature_cols] = carried.fillna(0.0)
    return ordered

def make_high_preproc(df: pd.DataFrame, feature_cols: list[str], decay: float = 0.9) -> pd.DataFrame:
    """Build the HIGH-preprocessed frame: LOW preprocessing plus recency columns.

    Args:
        df: raw frame containing PATIENT_COL, TIME_COL and ``feature_cols``.
        feature_cols: columns to coerce to numeric and impute.
        decay: per-row decay factor passed to ``recency_from_missing``.

    Returns:
        A new frame sorted by (patient, time) with a fresh 0..n-1 index. Feature
        columns are forward-filled within each patient and then filled with 0.0.
        For every feature not listed in NO_RECENCY_COLS a float32 column
        ``recency_<feature>`` is appended, computed from the missingness pattern
        as it was BEFORE imputation.

    Notes:
        An empty input now yields an empty frame that still carries the
        recency columns (previously ``pd.concat`` raised on the empty list).
        Rows that fall into no patient group (presumably only rows with a NaN
        patient id, given groupby's default handling) keep recency 0.0.
    """
    out = df.copy()
    out = out.sort_values([PATIENT_COL, TIME_COL]).reset_index(drop=True)

    # numeric coercion (non-numeric -> NaN)
    out[feature_cols] = out[feature_cols].apply(pd.to_numeric, errors="coerce")

    # dynamic cols that get recency
    dyn_cols = [c for c in feature_cols if c not in NO_RECENCY_COLS]

    # One preallocated block for the whole frame instead of one DataFrame per
    # patient followed by concat + sort_index.
    rec = np.zeros((len(out), len(dyn_cols)), dtype=np.float32)
    if dyn_cols and len(out) > 0:
        # Missingness must be captured before the forward-fill below.
        missing = out[dyn_cols].isna().to_numpy()
        # Group a single column only: just the row positions of each patient
        # are needed. After reset_index the index labels ARE the positions.
        for _, times in out.groupby(PATIENT_COL, sort=False)[TIME_COL]:
            rows = times.index.to_numpy()
            rec[rows] = recency_from_missing(missing[rows], decay=decay)

    rec_all = pd.DataFrame(
        rec,
        index=out.index,
        columns=[f"recency_{c}" for c in dyn_cols],
    )

    # Apply LOW value preprocessing (ffill then 0)
    out[feature_cols] = (
        out.groupby(PATIENT_COL, sort=False)[feature_cols]
           .ffill()
           .fillna(0.0)
    )

    # Append recency columns
    out = pd.concat([out, rec_all], axis=1)
    return out


In [6]:
def preprocess_and_write(in_path: Path, low_dir: Path, high_dir: Path) -> tuple[Path, Path]:
    """Preprocess one raw split CSV and write its LOW and HIGH variants.

    Args:
        in_path: raw CSV to read.
        low_dir: folder receiving ``<stem>_LOW_PREPROC_NO_FE.csv``.
        high_dir: folder receiving ``<stem>_HIGH_PREPROC_NO_FE.csv``.

    Returns:
        ``(low_out, high_out)``: the paths of the two files written.

    Raises:
        ValueError: if the patient, time or label column is absent from the CSV.
    """
    print(f"\n=== Processing: {in_path.name} ===")
    df = load_raw_csv(in_path)

    # basic checks
    for c in [PATIENT_COL, TIME_COL, LABEL_COL]:
        if c not in df.columns:
            raise ValueError(f"{in_path.name}: missing required column '{c}'")

    feature_cols = get_feature_cols(df)
    print("Rows:", len(df), "| Feature cols:", len(feature_cols))

    df_low = make_low_preproc(df, feature_cols)
    df_high = make_high_preproc(df, feature_cols, decay=RECENCY_DECAY)

    # Build names from the stem: str.replace(".csv", ...) would rewrite every
    # occurrence of ".csv" in the name and add no suffix at all when the file
    # does not end in a lowercase ".csv".
    low_out  = low_dir  / f"{in_path.stem}_LOW_PREPROC_NO_FE.csv"
    high_out = high_dir / f"{in_path.stem}_HIGH_PREPROC_NO_FE.csv"

    # Do not rely on another cell having created the output folders.
    low_dir.mkdir(parents=True, exist_ok=True)
    high_dir.mkdir(parents=True, exist_ok=True)

    df_low.to_csv(low_out, index=False)
    df_high.to_csv(high_out, index=False)

    print("Wrote LOW :", low_out.name,  "shape:", df_low.shape)
    print("Wrote HIGH:", high_out.name, "shape:", df_high.shape)
    return low_out, high_out

# Run each split through the pipeline and keep the paths of the written files.
train_fit_low, train_fit_high = preprocess_and_write(
    in_path=TRAIN_FIT_CSV, low_dir=LOW_DIR, high_dir=HIGH_DIR
)
train_thresh_low, train_thresh_high = preprocess_and_write(
    in_path=TRAIN_THRESH_CSV, low_dir=LOW_DIR, high_dir=HIGH_DIR
)
test_low, test_high = preprocess_and_write(
    in_path=TEST_CSV, low_dir=LOW_DIR, high_dir=HIGH_DIR
)

# Summary of the HIGH outputs
print("\nDONE preprocessing.")
print("train_fit_high:", train_fit_high)
print("train_thresh_high:", train_thresh_high)
print("test_high:", test_high)



=== Processing: train_fit.csv ===
Rows: 1180166 | Feature cols: 40
Wrote LOW : train_fit_LOW_PREPROC_NO_FE.csv shape: (1180166, 43)
Wrote HIGH: train_fit_HIGH_PREPROC_NO_FE.csv shape: (1180166, 78)

=== Processing: train_thresh.csv ===
Rows: 61120 | Feature cols: 40
Wrote LOW : train_thresh_LOW_PREPROC_NO_FE.csv shape: (61120, 43)
Wrote HIGH: train_thresh_HIGH_PREPROC_NO_FE.csv shape: (61120, 78)

=== Processing: test.csv ===
Rows: 310924 | Feature cols: 40
Wrote LOW : test_LOW_PREPROC_NO_FE.csv shape: (310924, 43)
Wrote HIGH: test_HIGH_PREPROC_NO_FE.csv shape: (310924, 78)

DONE preprocessing.
train_fit_high: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV/train_fit_HIGH_PREPROC_NO_FE.csv
train_thresh_high: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV/train_thresh_HIGH_PREPROC_NO_FE.csv
test_high: /teamspace/studios/this_studio/detecting_Sepsis/data/High_Preproc_NoFe_CSV/test_HIGH_PREPROC_NO_FE.csv


In [6]:
# ------------------------------------------------------------
# LOW + missingness CSVs, built from the RAW and LOW files
# ------------------------------------------------------------

# Output folder for the LOW files extended with miss_* columns
OUT_DIR = DATA_DIR.joinpath("Low_Preproc_WithMissing_CSV")
OUT_DIR.mkdir(exist_ok=True, parents=True)

def drop_unnamed(df):
    """Return ``df`` without the columns whose name begins with "Unnamed"."""
    unnamed_mask = df.columns.str.startswith("Unnamed")
    return df.loc[:, ~unnamed_mask]

def add_missingness(raw_path: Path, low_path: Path, out_path: Path) -> None:
    """Write the LOW frame extended with one ``miss_<feature>`` flag per feature.

    A flag is 1 where the RAW value is missing (or not numeric) and 0 otherwise,
    so it records missingness as it was before the forward-fill in LOW.

    Args:
        raw_path: raw split CSV.
        low_path: LOW-preprocessed CSV of the same split.
        out_path: destination CSV.

    Raises:
        RuntimeError: if some LOW row has no RAW row with the same
            (patient, time) key.
        pandas.errors.MergeError: if the (patient, time) key is duplicated on
            either side (raised by ``validate="one_to_one"``).
    """
    raw = drop_unnamed(pd.read_csv(raw_path))
    low = drop_unnamed(pd.read_csv(low_path))

    # feature columns = LOW features (excluding id/time/label)
    drop_cols = {PATIENT_COL, TIME_COL, LABEL_COL}
    feat_cols = [c for c in low.columns if c not in drop_cols]

    # ensure RAW has same feature columns (absent ones count as fully missing)
    for c in feat_cols:
        if c not in raw.columns:
            raw[c] = np.nan

    raw = raw.sort_values([PATIENT_COL, TIME_COL]).reset_index(drop=True)
    low = low.sort_values([PATIENT_COL, TIME_COL]).reset_index(drop=True)

    # missingness from RAW (before ffill!)
    miss = raw[feat_cols].apply(pd.to_numeric, errors="coerce").isna().astype(np.int8)
    miss.columns = [f"miss_{c}" for c in miss.columns]

    miss_df = pd.concat([raw[[PATIENT_COL, TIME_COL]], miss], axis=1)

    merged = low.merge(
        miss_df,
        on=[PATIENT_COL, TIME_COL],
        how="left",
        validate="one_to_one",
        indicator=True,
    )

    # A validated one-to-one left merge always keeps len(low) rows, so a row
    # count comparison can never fail. What can go wrong is a LOW row without
    # a RAW partner, which would silently receive NaN miss_* values.
    unmatched = int((merged["_merge"] != "both").sum())
    if unmatched:
        raise RuntimeError(
            f"Row mismatch after merge: {unmatched} LOW rows have no RAW row "
            f"with the same ({PATIENT_COL}, {TIME_COL})"
        )
    merged = merged.drop(columns="_merge")

    merged.to_csv(out_path, index=False)
    print(f"Wrote {out_path.name} | added {miss.shape[1]} miss_* columns")

# --- build the LOW + missingness file for every split ---
add_missingness(
    raw_path=RAW_CSV_DIR / "train_fit.csv",
    low_path=LOW_DIR / "train_fit_LOW_PREPROC_NO_FE.csv",
    out_path=OUT_DIR / "train_fit_LOW_PREPROC_WITH_MISSING.csv",
)

add_missingness(
    raw_path=RAW_CSV_DIR / "train_thresh.csv",
    low_path=LOW_DIR / "train_thresh_LOW_PREPROC_NO_FE.csv",
    out_path=OUT_DIR / "train_thresh_LOW_PREPROC_WITH_MISSING.csv",
)

add_missingness(
    raw_path=RAW_CSV_DIR / "test.csv",
    low_path=LOW_DIR / "test_LOW_PREPROC_NO_FE.csv",
    out_path=OUT_DIR / "test_LOW_PREPROC_WITH_MISSING.csv",
)

print("\nDONE → use CSVs from:", OUT_DIR)


Wrote train_fit_LOW_PREPROC_WITH_MISSING.csv | added 40 miss_* columns
Wrote train_thresh_LOW_PREPROC_WITH_MISSING.csv | added 40 miss_* columns
Wrote test_LOW_PREPROC_WITH_MISSING.csv | added 40 miss_* columns

DONE → use CSVs from: /teamspace/studios/this_studio/detecting_Sepsis/data/Low_Preproc_WithMissing_CSV


In [7]:
# Sanity check on the written train_thresh HIGH file: within every patient the
# rows must already be in non-decreasing time order.
df_check = pd.read_csv(train_thresh_high)

# Inspect rows in file order. Sorting by time first would hide exactly the
# problem this check is meant to catch.
time_ordered = df_check.groupby(PATIENT_COL, sort=False)[TIME_COL].apply(
    lambda s: s.is_monotonic_increasing
)
assert time_ordered.all(), (
    f"{TIME_COL} is not non-decreasing for patients: "
    f"{time_ordered[~time_ordered].index.tolist()[:10]}"
)

# Show one patient as a concrete example (rows kept in file order).
pid = int(df_check[PATIENT_COL].iloc[0])
g = df_check[df_check[PATIENT_COL] == pid]

print("Sample pid:", pid)
print("Rows:", len(g))
print("ICULOS head:", g[TIME_COL].head(10).tolist())
print("ICULOS tail:", g[TIME_COL].tail(10).tolist())


Sample pid: 11
Rows: 34
ICULOS head: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
ICULOS tail: [26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
