In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
df = pd.read_csv('icu_merged.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61302 entries, 0 to 61301
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   icustay_id            61302 non-null  int64  
 1   subject_id            61302 non-null  int64  
 2   hadm_id               61302 non-null  int64  
 3   intime                61302 non-null  object 
 4   outtime               61302 non-null  object 
 5   dbsource              61302 non-null  object 
 6   icu_los               61302 non-null  float64
 7   first_careunit        61302 non-null  object 
 8   admittime             61302 non-null  object 
 9   dischtime             61302 non-null  object 
 10  hosp_deathtime        6504 non-null   object 
 11  admission_type        61302 non-null  object 
 12  insurance             61302 non-null  object 
 13  marital_status        61302 non-null  object 
 14  diagnosis             61301 non-null  object 
 15  ethnic_group       

In [4]:
# CALCULATE AGE AT ADMISSION

df['admittime'] = pd.to_datetime(df['admittime'])
df['dob'] = pd.to_datetime(df['dob'])

# Age at admission
df['age'] = (df['admittime'].dt.year - df['dob'].dt.year)

In [5]:
(df['age'] >= 120).sum()

np.int64(2713)

In [6]:
# In MIMIC DOB for patients aged over 89 were shifted to obscure their true age and comply with HIPAA regulations: these patients appear in the database with ages of over 300 years.
df.loc[df['age'] > 89, 'age'] = 90
(df['age'] >= 120).sum()

np.int64(0)

In [7]:
# Compute Time to Death (Post-Discharge Survival Time)
df['dod'] = pd.to_datetime(df['dod'])
df['dischtime'] = pd.to_datetime(df['dischtime'])

df['days_to_death'] = (df['dod'].dt.floor('D') - df['dischtime'].dt.floor('D')).dt.days

In [8]:
(df['days_to_death']<0).sum()

np.int64(12)

In [9]:
# remove entries with negative days_to_death 
df = df[(df['days_to_death'] >= 0) | (df['days_to_death'].isna())].copy()
(df['days_to_death']<0).sum()

np.int64(0)

In [10]:
# Define event flag for survival analysis.
df['event'] = (df['expire_flag'] == 1).astype(int)

## Preprocess - Diagnosis

In [11]:
df['diagnosis'] = df['diagnosis'].str.upper().str.replace(r'[^A-Z\s]', '', regex=True).str.strip()

In [12]:
df['diagnosis'].value_counts().nlargest(20)

diagnosis
NEWBORN                                                    7824
PNEUMONIA                                                  1655
SEPSIS                                                     1284
CONGESTIVE HEART FAILURE                                   1025
CORONARY ARTERY DISEASE                                     904
CHEST PAIN                                                  816
ALTERED MENTAL STATUS                                       753
INTRACRANIAL HEMORRHAGE                                     749
GASTROINTESTINAL BLEED                                      731
UPPER GI BLEED                                              612
ABDOMINAL PAIN                                              612
CORONARY ARTERY DISEASECORONARY ARTERY BYPASS GRAFT SDA     590
FEVER                                                       554
SP FALL                                                     477
DIABETIC KETOACIDOSIS                                       470
CORONARY ARTERY DISEASECORONAR

In [13]:
df['diagnosis'].nunique()

14622

In [14]:
df['diagnosis'].str.contains('TUMOR', na= False).sum()

np.int64(307)

In [15]:
import re

def normalize_diagnosis(x):
    x = str(x)
    
    # --- 1. Sepsis / Infection ---
    if re.search(r'SEPSIS|CELLULITIS|URINARY|PYELO|CHOLANGITIS|INFECTION|SEPTIC|FEVER', x):
        return 'SEPSIS / INFECTION'
    
    # --- 2. Respiratory / Pulmonary ---
    elif re.search(r'RESPIRATORY|PNEUMONIA|HYPOXIA|HEMOPTYSIS|PULMONARY|PLEURAL|ASTHMA|COPD|DYSPNEA|BREATH|TRACHEAL|CHRONIC OBST', x):
        return 'RESPIRATORY FAILURE / PULMONARY'
    
    # --- 3. Cardiac / Vascular ---
    elif re.search(r'HEART|CHF|CARDIAC|CORONARY|CHEST|HYPOTENSION|MYOCARDIAL|INFARCTION|ANGINA|AORTIC|BRADY|PERICARDIAL|STEMI|FIBRILLATION|HYPERTEN|MI|ARREST', x):
        return 'CARDIAC / VASCULAR'
    
    # --- 4. Neurological ---
    elif re.search(r'STROKE|HEMORRHAGE|INTRACRANIAL|SUBDURAL|BRAIN|TUMOR|MENTAL|SEIZURE|SYNCOP|WEAKNESS|ENCEPHALOPATHY|CEREBROVASCULAR|SAH|CVA|ICH|HEAD|CEREBRAL', x):
        return 'NEUROLOGICAL'
    
    # --- 5. Gastrointestinal / Hepatic / Pancreatic ---
    elif re.search(r'GI|GASTRO|BLEED|ABDOM|PAIN|PANCREA|LIVER|HEPATIC|BOWEL|OBSTRUCTION|CHOLECYST|ESOPHAG|DEHYDRATION|CIRRHOSIS|ASCITES|COLITIS|NAUSEA', x):
        return 'GASTROINTESTINAL / HEPATIC / PANCREATIC'
    
    # --- 6. Renal / Metabolic ---
    elif re.search(r'RENAL|KIDNEY|HYPERKAL|HYPONAT|DIABETIC|KETOACIDOSIS|HYPOGLYCEMIA|HYPERGLYCEMIA', x):
        return 'RENAL / METABOLIC'
    
    # --- 7. Hematologic / Anemia ---
    elif re.search(r'ANEMIA', x):
        return 'HEMATOLOGIC / ANEMIA'
    
    # --- 8. Trauma / Surgical ---
    elif re.search(r'FALL|TRAUMA|VEHICLE|FRACTURE|STRUCK', x):
        return 'TRAUMA / SURGICAL'
    
    # --- 9. Oncology ---
    elif re.search(r'CANCER|CA|TUMOR|MASS|LYMPHOMA|LEUKEMIA', x):
        return 'ONCOLOGY'
    
    # ---10. Newborn / Pediatric ---
    elif re.search(r'NEWBORN|NEONATE|PREMATUR', x):
        return 'NEWBORN / PEDIATRIC'
    
    # ---11. Failure to Thrive ---
    elif re.search(r'FAILURE TO THRIVE|DEBILITY', x):
        return 'FAILURE TO THRIVE / MISC'
    
    elif re.search(r'(OVERDOSE|WITHDRAWAL)', x, re.I):
        return 'TOXIC / SUBSTANCE RELATED'
        
    # ---12. Other ---
    else:
        return 'OTHER'

df['diag_category'] = df['diagnosis'].apply(normalize_diagnosis)


In [16]:
df['diag_category'].value_counts(dropna = False)

diag_category
CARDIAC / VASCULAR                         17355
NEWBORN / PEDIATRIC                         7937
GASTROINTESTINAL / HEPATIC / PANCREATIC     6659
NEUROLOGICAL                                6593
RESPIRATORY FAILURE / PULMONARY             6052
OTHER                                       5850
SEPSIS / INFECTION                          4271
TRAUMA / SURGICAL                           2420
ONCOLOGY                                    2128
RENAL / METABOLIC                           1382
TOXIC / SUBSTANCE RELATED                    558
FAILURE TO THRIVE / MISC                      85
Name: count, dtype: int64

In [17]:
df[df['diag_category'] == 'OTHER']['diagnosis'].value_counts().head(30)


diagnosis
UNRESPONSIVE                   77
MORBID OBESITYSDA              57
ANEURYSMSDA                    49
STATUS EPILEPTICUS             42
SPLENIC LACERATION             37
SCOLIOSISSDA                   37
GUN SHOT WOUND                 36
LUMBAR STENOSISSDA             31
SUBARACHNOID HEMATOMA          30
HEMATURIA                      29
ANEURYSM                       28
PNEUMOTHORAX                   28
DIVERTICULITIS                 28
VENTRAL HERNIASDA              27
SP ASSAULT                     26
BRIGHT RED BLOOD PER RECTUM    26
EPIDURAL ABSCESS               26
HEMOTHORAX                     25
EPIDURAL HEMATOMA              24
STAB WOUND                     23
HYDROCEPHALUS                  23
JAUNDICE                       22
TRACHEOBRONCHOMALACIASDA       22
SP MVA                         22
DIARRHEA                       21
DEEP VEIN THROMBOSIS           21
RAPID AFIB                     21
ABSCESS                        20
HEPATITIS                      19
DIVE

In [18]:
# Final Sanity Check
print(df.shape)
print(df.isnull().sum())
print(df['expire_flag'].value_counts())

(61290, 26)
icustay_id                  0
subject_id                  0
hadm_id                     0
intime                      0
outtime                     0
dbsource                    0
icu_los                     0
first_careunit              0
admittime                   0
dischtime                   0
hosp_deathtime          54796
admission_type              0
insurance                   0
marital_status              0
diagnosis                   1
ethnic_group                0
time_spent                  0
hospital_expire_flag        0
gender                      0
dob                         0
dod                     37256
expire_flag                 0
age                         0
days_to_death           37256
event                       0
diag_category               0
dtype: int64
expire_flag
0    37256
1    24034
Name: count, dtype: int64


In [19]:
df.to_csv('icu_final_cleaned.csv', index=False)