In [1]:
# ------------------------------------------------------------
!pip install lifelines

# Try to install scikit-survival (for time-dependent AUC)
# This may fail on some environments depending on available wheels / system libs.
!pip -q install scikit-survival || echo "scikit-survival install failed (will skip time-dependent AUC)."


# ------------------------------------------------------------
import os
import random
import copy

import numpy as np
import pandas as pd

from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

from sklearn.model_selection import train_test_split


# ------------------------------------------------------------
RNG_SEED = 42


def seed_all(seed: int = RNG_SEED) -> None:
    """Seed the random sources used in this notebook.

    Args:
        seed: Value handed to Python's ``random`` module and NumPy's global
            generator, and stored (as text) in ``PYTHONHASHSEED``.

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting
        it here influences processes started afterwards rather than string
        hashing in the interpreter that is already running.
    """
    # Both seeders take the same integer, so apply them in one pass.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))


seed_all(RNG_SEED)


# ------------------------------------------------------------
# Load the Cox-ready dataset from the mounted Drive folder.
Data001 = pd.read_csv(
    "/content/drive/MyDrive/Paper(2025Dec)_SimulatedCVD/Data001_ReadyForCoxPH.csv"
)

# Strip index-like columns left over from an earlier export; names that are
# not present are simply skipped.
Data001 = Data001.drop(columns=["Unnamed: 0", "index"], errors="ignore")

# Work on a separate object so the loaded frame stays available as read.
df_raw = copy.copy(Data001)


# ------------------------------------------------------------
# 80/20 hold-out split, stratified on the event indicator so both parts keep
# the same event rate; the fixed seed makes the partition repeatable.
train_df, test_df = train_test_split(
    df_raw,
    stratify=df_raw["cvd_event"],
    test_size=0.2,
    random_state=RNG_SEED,
)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")


# ------------------------------------------------------------
def _lacks_variation(values: pd.Series, events: pd.Series, tol: float = 1e-6) -> bool:
    """Return True if *values* is (near-)constant overall or within either outcome group."""
    for group in (values, values[events], values[~events]):
        # ``var()`` is NaN when a group has fewer than two non-missing values
        # (empty group, single row, or a covariate that is entirely missing
        # there). ``not (x >= tol)`` treats that NaN like zero variance, whereas
        # ``x < tol`` would silently let the column through.
        if not group.var() >= tol:
            return True
    return False


def build_design_matrix(df: pd.DataFrame, drop_cols=None):
    """Build the Cox covariate matrix from a raw cohort frame.

    The input is copied, so the caller's frame is not modified. The matrix
    contains ``Age_c`` (``Age`` minus 30), the integer-cast flags ``AF``,
    ``CKD`` and ``diabetes``, the columns ``HbA1c``, ``eGFR`` and ``SBP`` as
    given, and indicator columns for ``smoking_status`` (prefix ``smoke``,
    level ``smoke_non`` omitted) and ``IRSD_quintile`` (prefix ``irsd``,
    level ``irsd_5`` omitted). The omitted levels act as reference categories.

    Args:
        df: Frame holding ``Age``, ``IRSD_quintile``, ``smoking_status``,
            ``AF``, ``CKD``, ``diabetes``, ``HbA1c``, ``eGFR``, ``SBP`` and
            ``cvd_event``.
        drop_cols: Columns to remove from the matrix. When ``None`` the list
            is derived from ``df``: a column is dropped if its variance is
            below 1e-6 (or undefined) over all rows, over the event rows, or
            over the non-event rows. Pass the list returned for the training
            data when building the matrix for another split.

    Returns:
        ``(X_reduced, cols_to_drop)``: the covariate matrix after removal and
        the list of column names that was applied (names missing from the
        matrix are ignored).
    """
    df = df.copy()

    df["Age_c"] = df["Age"] - 30.0

    # ``errors="ignore"`` keeps the reference-level removal a no-op when that
    # level does not occur in this frame.
    irsd_dummies = pd.get_dummies(df["IRSD_quintile"], prefix="irsd").drop(
        columns=["irsd_5"], errors="ignore"
    )
    smoke_dummies = pd.get_dummies(df["smoking_status"], prefix="smoke").drop(
        columns=["smoke_non"], errors="ignore"
    )

    for col in ["AF", "CKD", "diabetes", "cvd_event"]:
        df[col] = df[col].astype(int)

    X = pd.concat(
        [
            df[["Age_c", "AF", "CKD", "diabetes", "HbA1c", "eGFR", "SBP"]],
            smoke_dummies,
            irsd_dummies,
        ],
        axis=1,
    )

    if drop_cols is None:
        events = df["cvd_event"].astype(bool)
        cols_to_drop = [col for col in X.columns if _lacks_variation(X[col], events)]
    else:
        cols_to_drop = drop_cols

    X_reduced = X.drop(columns=cols_to_drop, errors="ignore")
    return X_reduced, cols_to_drop


# ------------------------------------------------------------
# Training design matrix; the derived drop list is kept for reuse on other splits.
X_train, cols_to_drop = build_design_matrix(train_df, drop_cols=None)

# Outcome columns and covariates side by side on a fresh positional index,
# keeping only rows without missing values.
cox_train_df = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (train_df[["cvd_time", "cvd_event"]], X_train)
    ],
    axis=1,
).dropna()

# Split the complete-case frame back into covariates and outcome arrays.
X_train_final = cox_train_df.drop(columns=["cvd_time", "cvd_event"])
y_event_train = cox_train_df["cvd_event"].values.astype(int)
y_time_train = cox_train_df["cvd_time"].values

print(f"Final training matrix shape: {X_train_final.shape}")

# Penalised Cox proportional hazards model fitted on the complete-case training frame.
cph = CoxPHFitter(penalizer=0.05)
cph.fit(
    cox_train_df,
    event_col="cvd_event",
    duration_col="cvd_time",
    show_progress=True,
)

print("\n=== CoxPH summary (training) ===")
cph.print_summary()


# ------------------------------------------------------------
# Build the test design matrix with the column drops decided on the training split.
X_test, _ = build_design_matrix(test_df, drop_cols=cols_to_drop)

# Indicator columns come from the levels present in each split, so a level that
# never occurs in the test rows yields no column here. Align to the training
# layout (a missing indicator is 0 for every row) so prediction always receives
# the covariates the model was fitted on, in the same order.
missing_in_test = [c for c in X_train_final.columns if c not in X_test.columns]
if missing_in_test:
    print(f"Test split lacks training columns (filled with 0): {missing_in_test}")
X_test = X_test.reindex(columns=X_train_final.columns, fill_value=0)

# Outcome columns and covariates side by side, complete rows only.
cox_test_df = pd.concat(
    [
        test_df[["cvd_time", "cvd_event"]].reset_index(drop=True),
        X_test.reset_index(drop=True),
    ],
    axis=1,
).dropna()

y_time_test  = cox_test_df["cvd_time"].values
y_event_test = cox_test_df["cvd_event"].values.astype(int)
X_test_final = cox_test_df.drop(columns=["cvd_time", "cvd_event"])

print(f"Final test matrix shape: {X_test_final.shape}")

# Partial hazards from the fitted model: larger value = higher predicted risk.
train_risk = cph.predict_partial_hazard(X_train_final).values.ravel()
test_risk  = cph.predict_partial_hazard(X_test_final).values.ravel()


# ------------------------------------------------------------
# Time-to-event discrimination (Blog 04 core): Harrell's C-index on both splits.
# NOTE: a larger Cox partial hazard means a worse prognosis, so the score handed
# to concordance_index is the negated risk.
cindex_train, cindex_test = (
    concordance_index(durations, -hazard, observed)
    for durations, hazard, observed in (
        (y_time_train, train_risk, y_event_train),
        (y_time_test, test_risk, y_event_test),
    )
)

print("\n=== Time-to-event discrimination (survival-native) ===")
print(f"Harrell C-index (train): {cindex_train:.3f}")
print(f"Harrell C-index (test) : {cindex_test:.3f}")


# ------------------------------------------------------------
# Optional: time-dependent AUC (cumulative/dynamic AUC) via scikit-survival.
# The import and the computation are guarded separately so that a failure is
# reported with its actual cause instead of always blaming the installation.

try:
    from sksurv.util import Surv
    from sksurv.metrics import cumulative_dynamic_auc
except Exception as e:
    # Deliberately broad: this step is best-effort and an unusable install may
    # surface as something other than ImportError.
    print("\n[Optional] Time-dependent AUC skipped (scikit-survival not available or install failed).")
    print(f"Reason: {type(e).__name__}: {e}")
else:
    try:
        y_train_sksurv = Surv.from_arrays(event=y_event_train.astype(bool), time=y_time_train)
        y_test_sksurv  = Surv.from_arrays(event=y_event_test.astype(bool),  time=y_time_test)

        # Choose time grid (example: quantiles of TEST follow-up).
        # Replace with clinically meaningful horizons if you prefer (e.g., [365, 730, 1825]).
        times = np.quantile(y_time_test, [0.2, 0.4, 0.6, 0.8]).astype(float)
        times = np.unique(np.clip(times, 1e-6, None))
        # cumulative_dynamic_auc requires evaluation times inside the test
        # follow-up range (upper bound exclusive); drop points that fall outside,
        # e.g. when a high quantile coincides with the maximum follow-up time.
        times = times[(times >= y_time_test.min()) & (times < y_time_test.max())]

        aucs, mean_auc = cumulative_dynamic_auc(
            y_train_sksurv,
            y_test_sksurv,
            test_risk,
            times
        )

        print("\n=== Time-dependent discrimination (optional) ===")
        for t, a in zip(times, aucs):
            print(f"AUC(t={t:.1f}) : {a:.3f}")
        print(f"Mean AUC over selected times: {mean_auc:.3f}")

    except Exception as e:
        # Still best-effort, but clearly labelled as a computation problem.
        print("\n[Optional] Time-dependent AUC skipped (scikit-survival is installed, but the computation failed).")
        print(f"Reason: {type(e).__name__}: {e}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.1/222.1 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hTrain size: 40000, Test size: 10000
Final training matrix shape: (40000, 13)
Iteration 1: norm_delta = 1.07e+00, step_size = 0.9500, log_lik = -16867.06666, newton_decrement = 2.64e+03, seconds_since_start = 0.6
Iteration 2: norm_delta = 6.65e-01, step_size = 0.9500, log_lik = -16167.55102, newton_decrement = 1.43e+03, seconds_since_start = 1.1
Iteration 3: norm_delta = 2.34e-01, step_size = 0.9500, log_lik = -15314.75821, newton_decrement = 2.25e+02, seconds_since_start = 1.7
Iteration 4: norm_delta = 2.46e-02, step_size = 1.0000, log_lik = -15111.86949, newton_decrement = 3.60e+00, seconds_since_start = 2.2
Iteration 5: norm_delt

0,1
model,lifelines.CoxPHFitter
duration col,'cvd_time'
event col,'cvd_event'
penalizer,0.05
l1 ratio,0.0
baseline estimation,breslow
number of observations,40000
number of events observed,1607
partial log-likelihood,-15108.26
time fit was run,2026-02-16 00:47:47 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
Age_c,0.03,1.03,0.0,0.03,0.03,1.03,1.03,0.0,20.9,<0.005,319.95
AF,1.12,3.07,0.15,0.84,1.41,2.31,4.09,0.0,7.73,<0.005,46.37
CKD,-0.04,0.96,0.15,-0.33,0.25,0.72,1.28,0.0,-0.29,0.77,0.37
diabetes,1.42,4.14,0.05,1.31,1.53,3.72,4.61,0.0,25.88,<0.005,488.26
HbA1c,0.32,1.38,0.02,0.29,0.36,1.34,1.43,0.0,20.24,<0.005,300.29
eGFR,-0.02,0.98,0.0,-0.02,-0.01,0.98,0.99,0.0,-6.65,<0.005,34.97
SBP,0.01,1.01,0.0,0.01,0.01,1.01,1.01,0.0,10.15,<0.005,77.97
smoke_current,0.16,1.18,0.05,0.06,0.27,1.06,1.31,0.0,3.01,<0.005,8.56
smoke_ex,0.14,1.15,0.04,0.06,0.23,1.06,1.26,0.0,3.25,<0.005,9.75
irsd_1,0.01,1.01,0.04,-0.07,0.1,0.94,1.1,0.0,0.35,0.73,0.46

0,1
Concordance,0.87
Partial AIC,30242.52
log-likelihood ratio test,3517.62 on 13 df
-log2(p) of ll-ratio test,inf


Final test matrix shape: (10000, 13)

=== Time-to-event discrimination (survival-native) ===
Harrell C-index (train): 0.871
Harrell C-index (test) : 0.873

=== Time-dependent discrimination (optional) ===
AUC(t=4.4) : 0.881
AUC(t=4.7) : 0.875
AUC(t=5.0) : 0.876
AUC(t=5.3) : 0.886
Mean AUC over selected times: 0.881
