In [5]:
# Make `import app.*` work by putting the repository root on sys.path.
# The root is taken to be the parent of the current working directory.
import sys
from pathlib import Path

repo_root = Path.cwd().joinpath('..').resolve()
_repo_root_entry = str(repo_root)
if _repo_root_entry not in sys.path:
    # Prepend so this directory is searched before the existing entries.
    sys.path.insert(0, _repo_root_entry)
print('Repo root:', repo_root)


Repo root: C:\Users\Golds\Downloads\survival-readmission


# 03 - Survival Modeling

This notebook trains and evaluates survival models:
- Cox Proportional Hazards
- XGBoost Survival
- Other survival models


In [6]:
# Import libraries
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from app.evaluation import compute_concordance_index, compute_td_auc


ModuleNotFoundError: No module named 'sksurv'

## Quick Feature Importance (Tree-based proxy)

As a fast heuristic, train a tree-based classifier on the 30-day event label to inspect feature importances. This is a proxy (not a survival objective) to surface candidate predictors; we will follow up with survival-specific modeling (Cox, XGBoost Survival).


In [None]:
# Read the processed 30-day cohort; the three timestamp columns are parsed as datetimes.
cohort = pd.read_csv(
    '../data/processed/cohort_30d.csv',
    parse_dates=['admittime', 'dischtime', 'next_admittime'],
)

# Feature construction is delegated to app.feature_engineering
# (per the original note, lab features are included when available).
from app.feature_engineering import engineer_features
import os

# Point the data loader at the MIMIC demo directory via MIMIC_DEMO_DIR.
# The path is relative, so it depends on the notebook's working directory.
os.environ['MIMIC_DEMO_DIR'] = '../data/raw/mimic-iv-demo'

X, y_df = engineer_features(cohort)
y = y_df['event'].astype(int)

# Collect the lab-prefixed columns once and reuse them for both summaries.
lab_columns = [col for col in X.columns if str(col).startswith('lab_')]

print('Feature matrix shape:', X.shape)
print('Num lab-derived columns:', len(lab_columns))
print('Sample lab columns:', lab_columns[:5])


In [None]:
# Hold-out split and a RandomForest baseline on the binary event label.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Stratified random 70/30 split (demo only; prefer a temporal split in practice).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print(f"Train set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Event rate in train: {y_train.mean():.3f}")
print(f"Event rate in test: {y_test.mean():.3f}")

# Shallow forest used only as a feature-importance proxy; it is not a survival model.
rf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Column 1 of predict_proba is the positive-class probability used for the AUC.
y_pred_proba = rf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Quick AUC (30-day classification proxy): {auc:.3f}")

# Pair each feature with its importance, then rank from most to least important.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nTop 15 features:")
print(feature_importance.head(15))


In [7]:
# Re-read the processed cohort so `cohort` exists before feature engineering runs.
# The admission, discharge and next-admission timestamps are parsed as datetimes.
import pandas as pd

cohort = pd.read_csv(
    '../data/processed/cohort_30d.csv',
    parse_dates=['admittime', 'dischtime', 'next_admittime'],
)


In [8]:
# Derive the feature matrix and targets with the project's feature pipeline
# (per the original note, lab features are included when available).
from app.feature_engineering import engineer_features

X, y_df = engineer_features(cohort)
y = y_df['event'].astype(int)

# Count the columns whose label starts with the `lab_` prefix.
lab_column_count = sum(str(col).startswith('lab_') for col in X.columns)

print('Feature matrix shape:', X.shape)
print('Num lab-derived columns:', lab_column_count)


Feature matrix shape: (260, 20)
Num lab-derived columns: 0


In [None]:
# Setup: expose the repository root (parent of the working directory) on
# sys.path so the `app.*` packages can be imported.
import sys
from pathlib import Path

repo_root = Path.cwd().joinpath('..').resolve()
if all(entry != str(repo_root) for entry in sys.path):
    # Not registered yet: prepend so it is searched first.
    sys.path.insert(0, str(repo_root))
print('Repo root:', repo_root)


Repo root: C:\Users\Golds\Downloads\survival-readmission


## Cox Proportional Hazards (baseline)
Fit an interpretable Cox PH model and report C-index on a holdout split.


In [None]:
# Cox PH baseline: build the modelling frame, fit on the training rows and
# score the held-out rows with the concordance index.
from lifelines import CoxPHFitter
from app.evaluation import compute_concordance_index

# Features plus the duration and event columns taken positionally from y_df.
cox_df = X.assign(
    time_to_event=y_df["time_to_event"].values,
    event=y_df["event"].values,
)

# Reuse the row partition from the existing X_train / X_test split
# (demo only; prefer a temporal split in practice).
train_idx, test_idx = X_train.index, X_test.index
cox_train, cox_test = cox_df.loc[train_idx], cox_df.loc[test_idx]

# The event column is cast to bool only for the frame handed to fit();
# cox_train itself is left unchanged.
cph = CoxPHFitter(penalizer=0.1, l1_ratio=0.1)
cph.fit(
    cox_train.assign(event=cox_train["event"].astype(bool)),
    duration_col="time_to_event",
    event_col="event",
    show_progress=False,
)

# Partial hazards on the hold-out rows serve as risk scores for the C-index.
risk_scores = cph.predict_partial_hazard(cox_test)
c_index = compute_concordance_index(
    cox_test["event"],
    cox_test["time_to_event"],
    risk_scores,
)
print(f"Cox PH C-index (holdout): {c_index:.3f}")


## XGBoost Survival (Cox objective)
Train an XGBoost model with the `survival:cox` objective to produce risk scores, then compute the C-index on the holdout split.


In [None]:
import numpy as np
import xgboost as xgb

# Prepare DMatrix objects for XGBoost's Cox objective.
#
# `survival:cox` reads right-censoring from the SIGN of the label: a positive
# label is an observed event time, a negative label is a right-censored time.
# Passing the raw (all-positive) durations would make XGBoost treat every row
# as an observed event, so censored rows are sign-flipped here.
# NOTE(review): a duration of exactly 0 cannot carry a sign; confirm that
# time_to_event is strictly positive in this cohort.
train_time = y_df.loc[X_train.index, 'time_to_event'].to_numpy(dtype=float)
train_event = y_df.loc[X_train.index, 'event'].to_numpy().astype(bool)
test_time = y_df.loc[X_test.index, 'time_to_event'].to_numpy(dtype=float)
test_event = y_df.loc[X_test.index, 'event'].to_numpy().astype(bool)

dtrain = xgb.DMatrix(X_train, label=np.where(train_event, train_time, -train_time))
dtest = xgb.DMatrix(X_test, label=np.where(test_event, test_time, -test_time))

params = {
    'objective': 'survival:cox',
    'eval_metric': 'cox-nloglik',
    'eta': 0.05,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
}

bst = xgb.train(params, dtrain, num_boost_round=300)

# Per the XGBoost docs, survival:cox predictions are on the hazard-ratio scale,
# so they are used directly as risk scores.
xgb_risk = bst.predict(dtest)

# Evaluation uses the true (unsigned) durations and the event indicator,
# exactly as in the Cox PH cell; only the training labels are sign-encoded.
cox_cindex = compute_concordance_index(
    y_df.loc[X_test.index, 'event'].values,
    y_df.loc[X_test.index, 'time_to_event'].values,
    xgb_risk,
)
print(f"XGBoost Cox C-index (holdout): {cox_cindex:.3f}")


In [None]:
# Rebuild X_train / X_test from complete rows only and fit a RandomForest proxy model.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Keep only rows with no missing feature values.
mask = X.notna().all(axis=1)
Xc, yc = X[mask], y[mask]

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    Xc,
    yc,
    test_size=0.3,
    stratify=yc,
    random_state=42,
)

clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

# Positive-class probabilities feed the AUC.
proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print(f'Quick AUC (30-day classification proxy): {auc:.3f}')

# Importances labelled by training column, ranked from highest to lowest.
imp = pd.Series(clf.feature_importances_, index=X_train.columns)
imp = imp.sort_values(ascending=False)
print('Top 15 features:\n', imp.head(15))


In [None]:
# Build features via app.feature_engineering (per the original note, labs are
# included when available).
from app.feature_engineering import engineer_features

X, y_df = engineer_features(cohort)
y = y_df['event'].astype(int)

print('Feature matrix shape:', X.shape)
# Count columns carrying the `lab_` prefix. The prefix must not contain quote
# characters (the previous '"lab_"' literal could never match), and str()
# keeps the check working for non-string column labels.
print('Num lab-derived columns:', sum(str(col).startswith('lab_') for col in X.columns))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Read the processed 30-day cohort with its timestamp columns parsed as datetimes.
cohort = pd.read_csv(
    '../data/processed/cohort_30d.csv',
    parse_dates=['admittime', 'dischtime', 'next_admittime'],
)

# Quick-pass feature set: use whichever candidate columns the cohort actually has.
X_cols = [c for c in ['age_at_discharge','los_days'] if c in cohort.columns]
cat_cols = [
    c
    for c in ['gender','admission_type','discharge_location','insurance']
    if c in cohort.columns
]

# One-hot encode the selected frame, dropping the first level of each categorical.
X = pd.get_dummies(cohort[X_cols + cat_cols], drop_first=True)
y = cohort['event'].astype(int)

# Restrict to rows with no missing feature values.
mask = X.notna().all(axis=1)
X, y = X[mask], y[mask]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train, y_train)

# Hold-out AUC on the positive-class probability, as a quick heuristic.
proba = clf.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, proba)
print(f"Quick AUC (30-day classification proxy): {auc:.3f}")

# Rank features by importance, highest first.
imp = pd.Series(clf.feature_importances_, index=X.columns)
imp = imp.sort_values(ascending=False)
print("Top 15 features:\n", imp.head(15))


Quick AUC (30-day classification proxy): 0.576
Top 15 features:
 age_at_discharge                                   0.401644
gender_M                                           0.081105
discharge_location_SKILLED NURSING FACILITY        0.060272
insurance_Medicare                                 0.050945
insurance_Other                                    0.050063
admission_type_EW EMER.                            0.049381
discharge_location_HOME                            0.048827
admission_type_DIRECT EMER.                        0.038194
discharge_location_HOME HEALTH CARE                0.034777
admission_type_OBSERVATION ADMIT                   0.031774
admission_type_URGENT                              0.028284
discharge_location_REHAB                           0.024051
discharge_location_CHRONIC/LONG TERM ACUTE CARE    0.022864
admission_type_SURGICAL SAME DAY ADMISSION         0.020727
admission_type_EU OBSERVATION                      0.013956
dtype: float64
