In [1]:
# Ensure repository root is on sys.path for `import app.*`
import sys
from pathlib import Path
repo_root = (Path.cwd() / '..').resolve()
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
print('Added to sys.path:', repo_root)


Added to sys.path: C:\Users\Golds\Downloads\survival-readmission


# 02 - Cohort Definition

This notebook defines the cohort for survival analysis, including:
- Inclusion/exclusion criteria
- Index event definition
- Censoring rules
- Event definitions (readmission)


In [2]:
# Import libraries
import pandas as pd
import numpy as np
from app.data_loader import load_admissions, load_patients
from app.feature_engineering import define_cohort


## Objective

Build a 30-day readmission survival frame from MIMIC-IV Demo:
- Index = each hospital discharge
- event = 1 if 0 < days_to_next ≤ 30, else 0 (right-censored at 30 days)
- time_to_event = min(days_to_next, 30)
- Exclusions documented (death before readmission window, optional newborns)

Assumes files from `app.download_demo` under `../data/raw/mimic-iv-demo/hosp/`.


In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import warnings
warnings.filterwarnings('ignore')

# Prefer env var (no hard-coded absolute paths); fallback to repo-relative path
DATA_DIR = Path(os.getenv('MIMIC_DEMO_DIR', '../data/raw/mimic-iv-demo')) / 'hosp'
print('Using DATA_DIR =', DATA_DIR)

# Load required tables
admissions = pd.read_csv(DATA_DIR / 'admissions.csv.gz', compression='gzip')
patients = pd.read_csv(DATA_DIR / 'patients.csv.gz', compression='gzip')

# Parse timestamps
for col in ['admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime']:
    if col in admissions.columns:
        admissions[col] = pd.to_datetime(admissions[col])

# Keep only rows with valid discharge
admissions = admissions[admissions['dischtime'].notna()].copy()

print(f"Admissions: {len(admissions):,}")
print(f"Patients: {len(patients):,}")


Using DATA_DIR = ..\data\raw\mimic-iv-demo\hosp
Admissions: 275
Patients: 100


In [4]:
# Merge demographics (anchor_age/year to compute age at discharge)
cohort = admissions.merge(patients, on='subject_id', how='left')
cohort['age_at_discharge'] = cohort['anchor_age'] + (cohort['dischtime'].dt.year - cohort['anchor_year'])

# Sort per patient by admission time
cohort = cohort.sort_values(['subject_id', 'admittime']).reset_index(drop=True)

# Identify next admission per patient
cohort['next_admittime'] = cohort.groupby('subject_id')['admittime'].shift(-1)

# Compute elapsed days from current discharge to next admission
cohort['days_to_next'] = (cohort['next_admittime'] - cohort['dischtime']).dt.total_seconds() / (24*3600)

# Filter out likely same-hospitalization transfers (next admit very close to discharge)
cohort.loc[cohort['days_to_next'] <= (6/24), 'days_to_next'] = np.nan  # <= 6 hours -> treat as no new admission

# Define survival outcome
cohort['event'] = ((cohort['days_to_next'] > 0) & (cohort['days_to_next'] <= 30)).astype(int)
cohort['time_to_event'] = np.where(cohort['days_to_next'].notna(), np.minimum(cohort['days_to_next'], 30), 30)

# Exclusion: In-hospital deaths -> no candidacy for readmission
if 'hospital_expire_flag' in cohort.columns:
    cohort = cohort[cohort['hospital_expire_flag'] == 0].copy()

# Exclusion: death before day 30 (treat as censored at death time)
if 'deathtime' in cohort.columns:
    time_to_death = (cohort['deathtime'] - cohort['dischtime']).dt.total_seconds() / (24*3600)
    # If death occurs before next admission and within 30 days, censor at death
    mask_death_before30 = (time_to_death.notna()) & (time_to_death >= 0) & (time_to_death < cohort['time_to_event'])
    cohort.loc[mask_death_before30, 'event'] = 0
    cohort.loc[mask_death_before30, 'time_to_event'] = time_to_death[mask_death_before30]

# Clip time_to_event to [0, 30]
cohort['time_to_event'] = cohort['time_to_event'].clip(lower=0, upper=30)

print(cohort[['subject_id','hadm_id','dischtime','next_admittime','days_to_next','event','time_to_event']].head())
print('\nEvent rate (30d):', cohort['event'].mean().round(3))
print('Median time_to_event:', np.median(cohort['time_to_event']).round(2))


   subject_id   hadm_id           dischtime      next_admittime  days_to_next  \
0    10000032  22595853 2180-05-07 17:15:00 2180-06-26 18:27:00     50.050000   
1    10000032  22841357 2180-06-27 18:49:00 2180-07-23 12:35:00     25.740278   
2    10000032  29079034 2180-07-25 17:55:00 2180-08-05 23:44:00     11.242361   
3    10000032  25742920 2180-08-07 17:50:00                 NaT           NaN   
4    10001217  24597018 2157-11-25 18:00:00 2157-12-18 16:58:00     22.956944   

   event  time_to_event  
0      0      30.000000  
1      1      25.740278  
2      1      11.242361  
3      0      30.000000  
4      1      22.956944  

Event rate (30d): 0.192
Median time_to_event: 30.0


In [5]:
# Select minimum output columns
cols = [
    'subject_id','hadm_id','admittime','dischtime','age_at_discharge',
    'next_admittime','days_to_next','event','time_to_event',
    'admission_type','discharge_location','insurance','ethnicity','gender'
]
existing_cols = [c for c in cols if c in cohort.columns]
cohort_out = cohort[existing_cols].copy()

# Save to processed
out_dir = Path('../data/processed')
out_dir.mkdir(parents=True, exist_ok=True)
cohort_path = out_dir / 'cohort_30d.csv'
cohort_out.to_csv(cohort_path, index=False)
print(f"Saved cohort: {cohort_path}")
print(cohort_out.head())


Saved cohort: ..\data\processed\cohort_30d.csv
   subject_id   hadm_id           admittime           dischtime  \
0    10000032  22595853 2180-05-06 22:23:00 2180-05-07 17:15:00   
1    10000032  22841357 2180-06-26 18:27:00 2180-06-27 18:49:00   
2    10000032  29079034 2180-07-23 12:35:00 2180-07-25 17:55:00   
3    10000032  25742920 2180-08-05 23:44:00 2180-08-07 17:50:00   
4    10001217  24597018 2157-11-18 22:56:00 2157-11-25 18:00:00   

   age_at_discharge      next_admittime  days_to_next  event  time_to_event  \
0                52 2180-06-26 18:27:00     50.050000      0      30.000000   
1                52 2180-07-23 12:35:00     25.740278      1      25.740278   
2                52 2180-08-05 23:44:00     11.242361      1      11.242361   
3                52                 NaT           NaN      0      30.000000   
4                55 2157-12-18 16:58:00     22.956944      1      22.956944   

  admission_type discharge_location insurance gender  
0         URGENT    

## Notes, Pitfalls, and Future Enhancements

- Transfers vs. true readmissions: we exclude next admissions within 6 hours of discharge to reduce false positives due to intra-encounter transitions.
- Right-censoring: `event=0` when no readmission within 30 days; `time_to_event` is truncated at 30.
- Death before readmission: currently treated as censored at death time; consider competing risks in later iterations.
- Planned readmissions: not implemented here; consider excluding if a heuristic is developed.
- Newborns: exclude if scope is adult readmissions only.

Output: `../data/processed/cohort_30d.csv` for modeling and EDA cross-checks.
