# MIMIC-IV Cohort Exploration
## Initial Analysis of the Master Cohort

This notebook provides an initial exploration of the generated cohort from MIMIC-IV data.

In [5]:
import pandas as pd
import numpy as np

# Load the cohort
cohort = pd.read_csv('../data/processed/cohort_labeled.csv')

print(f"Cohort shape: {cohort.shape}")
print(f"Number of ICU stays: {len(cohort):,}")
print(f"Number of unique patients: {cohort['subject_id'].nunique():,}")
print(f"\nColumns: {list(cohort.columns)}")

Cohort shape: (73181, 19)
Number of ICU stays: 73,181
Number of unique patients: 50,920

Columns: ['subject_id', 'hadm_id', 'stay_id', 'gender', 'age', 'marital_status', 'insurance', 'admission_type', 'admission_location', 'discharge_location', 'first_careunit', 'last_careunit', 'admittime', 'dischtime', 'intime', 'outtime', 'mortality', 'los_days', 'los_category']


In [6]:
# Display first few rows
cohort.head(10)

Unnamed: 0,subject_id,hadm_id,stay_id,gender,age,marital_status,insurance,admission_type,admission_location,discharge_location,first_careunit,last_careunit,admittime,dischtime,intime,outtime,mortality,los_days,los_category
0,10000032,29079034,39553978,F,52,WIDOWED,Medicaid,EW EMER.,EMERGENCY ROOM,HOME,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 12:35:00,2180-07-25 17:55:00,2180-07-23 14:00:00,2180-07-23 23:50:47,0,0.410266,0
1,10000980,26913865,39765666,F,76,MARRIED,Medicare,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 07:38:00,2189-07-03 03:00:00,2189-06-27 08:42:00,2189-06-27 20:38:27,0,0.497535,0
2,10001217,24597018,37067082,F,55,MARRIED,Other,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-18 22:56:00,2157-11-25 18:00:00,2157-11-20 19:18:02,2157-11-21 22:08:00,0,1.118032,0
3,10001217,27703517,34592300,F,55,MARRIED,Other,DIRECT EMER.,PHYSICIAN REFERRAL,HOME HEALTH CARE,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-18 16:58:00,2157-12-24 14:55:00,2157-12-19 15:42:24,2157-12-20 14:27:41,0,0.948113,0
4,10001725,25563031,31205490,F,46,MARRIED,Other,EW EMER.,PACU,HOME,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:08:00,2110-04-14 15:00:00,2110-04-11 15:52:22,2110-04-12 23:59:56,0,1.338588,0
5,10001884,26184834,37510196,F,77,MARRIED,Medicare,OBSERVATION ADMIT,EMERGENCY ROOM,DIED,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-01-07 20:39:00,2131-01-20 05:15:00,2131-01-11 04:20:05,2131-01-20 08:27:30,1,9.171817,2
6,10002013,23581541,39060235,F,57,SINGLE,Medicare,SURGICAL SAME DAY ADMISSION,PHYSICIAN REFERRAL,HOME HEALTH CARE,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2160-05-18 07:45:00,2160-05-23 13:30:00,2160-05-18 10:00:53,2160-05-19 17:33:33,0,1.314352,0
7,10002155,20345487,32358465,F,83,MARRIED,Other,EW EMER.,EMERGENCY ROOM,DIED,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-03-09 20:33:00,2131-03-10 01:55:00,2131-03-09 21:33:00,2131-03-10 18:09:21,1,0.858576,0
8,10002155,23822395,33685454,F,81,MARRIED,Other,EW EMER.,PROCEDURE SITE,CHRONIC/LONG TERM ACUTE CARE,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2129-08-04 12:44:00,2129-08-18 16:53:00,2129-08-04 12:45:00,2129-08-10 17:02:38,0,6.178912,1
9,10002155,28994087,31090461,F,82,MARRIED,Other,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2130-09-23 21:59:00,2130-09-29 18:55:00,2130-09-24 00:50:00,2130-09-27 22:13:41,0,3.891447,1


In [7]:
# Summary statistics for numerical columns
cohort.describe()

Unnamed: 0,subject_id,hadm_id,stay_id,age,mortality,los_days,los_category
count,73181.0,73181.0,73181.0,73181.0,73181.0,73181.0,73181.0
mean,14998190.0,24981330.0,34992740.0,64.676091,0.113814,3.451931,0.427925
std,2886067.0,2884400.0,2889261.0,16.879965,0.317587,4.92284,0.682053
min,10000030.0,20000090.0,30000150.0,18.0,0.0,0.00125,0.0
25%,12491260.0,22483350.0,32489350.0,54.0,0.0,1.084225,0.0
50%,14998940.0,24969640.0,34993890.0,66.0,0.0,1.926782,0.0
75%,17513270.0,27471800.0,37488400.0,77.0,0.0,3.713322,1.0
max,19999990.0,29999830.0,39999810.0,102.0,1.0,110.23228,2.0


## Target Variable Analysis

In [8]:
# Mortality distribution
print("=" * 60)
print("MORTALITY DISTRIBUTION")
print("=" * 60)
print(cohort['mortality'].value_counts())
print(f"\nMortality rate: {cohort['mortality'].mean() * 100:.2f}%")

print("\n" + "=" * 60)
print("LENGTH OF STAY DISTRIBUTION")
print("=" * 60)
print(f"Mean LOS: {cohort['los_days'].mean():.2f} days")
print(f"Median LOS: {cohort['los_days'].median():.2f} days")
print(f"Min LOS: {cohort['los_days'].min():.2f} days")
print(f"Max LOS: {cohort['los_days'].max():.2f} days")

print("\n" + "=" * 60)
print("LOS CATEGORY DISTRIBUTION")
print("=" * 60)
print("0: Short (<3 days)")
print("1: Medium (3-7 days)")
print("2: Long (>7 days)")
print("\n", cohort['los_category'].value_counts().sort_index())

MORTALITY DISTRIBUTION
mortality
0    64852
1     8329
Name: count, dtype: int64

Mortality rate: 11.38%

LENGTH OF STAY DISTRIBUTION
Mean LOS: 3.45 days
Median LOS: 1.93 days
Min LOS: 0.00 days
Max LOS: 110.23 days

LOS CATEGORY DISTRIBUTION
0: Short (<3 days)
1: Medium (3-7 days)
2: Long (>7 days)

 los_category
0    49929
1    15188
2     8064
Name: count, dtype: int64
