# Compute each of the 10 predictors for 30-Day Readmission (Y1)
This notebook calculates the 10 predictors required for the analysis of 30-day readmissions, combining them into a single dataset for modeling.


In [1]:
import pandas as pd
import numpy as np

In [3]:
# Paths to cleaned data
data_dir = "processed_data"
admissions_cleaned = pd.read_csv(f"{data_dir}/admissions_cleaned.csv")
patients_cleaned = pd.read_csv(f"{data_dir}/patients_cleaned.csv")
diagnoses_cleaned = pd.read_csv(f"{data_dir}/diagnoses_cleaned.csv")
labevents_cleaned = pd.read_csv(f"{data_dir}/labevents_cleaned.csv")
procedures_cleaned = pd.read_csv(f"{data_dir}/procedures_cleaned.csv")
prescriptions_cleaned = pd.read_csv(f"{data_dir}/prescriptions_cleaned.csv")

  labevents_cleaned = pd.read_csv(f"{data_dir}/labevents_cleaned.csv")
  prescriptions_cleaned = pd.read_csv(f"{data_dir}/prescriptions_cleaned.csv")


Set Datatypes

In [4]:
# Convert date fields to datetime64
admissions_cleaned['admittime'] = pd.to_datetime(admissions_cleaned['admittime'])
admissions_cleaned['dischtime'] = pd.to_datetime(admissions_cleaned['dischtime'])
admissions_cleaned['deathtime'] = pd.to_datetime(admissions_cleaned['deathtime'])
admissions_cleaned['edregtime'] = pd.to_datetime(admissions_cleaned['edregtime'])
admissions_cleaned['edouttime'] = pd.to_datetime(admissions_cleaned['edouttime'])

# Convert categorical fields to category
admissions_cleaned['admission_type'] = admissions_cleaned['admission_type'].astype('category')
admissions_cleaned['admit_provider_id'] = admissions_cleaned['admit_provider_id'].astype('category')
admissions_cleaned['admission_location'] = admissions_cleaned['admission_location'].astype('category')
admissions_cleaned['discharge_location'] = admissions_cleaned['discharge_location'].astype('category')
admissions_cleaned['insurance'] = admissions_cleaned['insurance'].astype('category')
admissions_cleaned['language'] = admissions_cleaned['language'].astype('category')
admissions_cleaned['marital_status'] = admissions_cleaned['marital_status'].astype('category')
admissions_cleaned['race'] = admissions_cleaned['race'].astype('category')

# Convert numerical fields to appropriate types
admissions_cleaned['subject_id'] = admissions_cleaned['subject_id'].astype(int)
admissions_cleaned['hadm_id'] = admissions_cleaned['hadm_id'].astype(int)
admissions_cleaned['hospital_expire_flag'] = admissions_cleaned['hospital_expire_flag'].astype(int)

# Convert date fields to datetime64 for patients
patients_cleaned['dod'] = pd.to_datetime(patients_cleaned['dod'])

# Convert categorical fields to category for patients
patients_cleaned['gender'] = patients_cleaned['gender'].astype('category')
patients_cleaned['anchor_year_group'] = patients_cleaned['anchor_year_group'].astype('category')

# Convert numerical fields to appropriate types for patients
patients_cleaned['subject_id'] = patients_cleaned['subject_id'].astype(int)
patients_cleaned['anchor_age'] = patients_cleaned['anchor_age'].astype(int)
patients_cleaned['anchor_year'] = patients_cleaned['anchor_year'].astype(int)

# Convert date fields to datetime64 for diagnoses
# No date fields in diagnoses

# Convert categorical fields to category for diagnoses
diagnoses_cleaned['icd_code'] = diagnoses_cleaned['icd_code'].astype('category')

# Convert numerical fields to appropriate types for diagnoses
diagnoses_cleaned['subject_id'] = diagnoses_cleaned['subject_id'].astype(int)
diagnoses_cleaned['hadm_id'] = diagnoses_cleaned['hadm_id'].astype(int)
diagnoses_cleaned['seq_num'] = diagnoses_cleaned['seq_num'].astype(int)
diagnoses_cleaned['icd_version'] = diagnoses_cleaned['icd_version'].astype(int)

# Convert date fields to datetime64 for labevents
labevents_cleaned['charttime'] = pd.to_datetime(labevents_cleaned['charttime'])
labevents_cleaned['storetime'] = pd.to_datetime(labevents_cleaned['storetime'])

# Convert categorical fields to category for labevents
labevents_cleaned['order_provider_id'] = labevents_cleaned['order_provider_id'].astype('category')
labevents_cleaned['value'] = labevents_cleaned['value'].astype('category')
labevents_cleaned['valueuom'] = labevents_cleaned['valueuom'].astype('category')
labevents_cleaned['flag'] = labevents_cleaned['flag'].astype('category')
labevents_cleaned['priority'] = labevents_cleaned['priority'].astype('category')
labevents_cleaned['comments'] = labevents_cleaned['comments'].astype('category')

# Convert numerical fields to appropriate types for labevents
labevents_cleaned['labevent_id'] = labevents_cleaned['labevent_id'].astype(int)
labevents_cleaned['subject_id'] = labevents_cleaned['subject_id'].astype(int)
labevents_cleaned['hadm_id'] = labevents_cleaned['hadm_id'].astype(float)
labevents_cleaned['specimen_id'] = labevents_cleaned['specimen_id'].astype(int)
labevents_cleaned['itemid'] = labevents_cleaned['itemid'].astype(int)
labevents_cleaned['valuenum'] = labevents_cleaned['valuenum'].astype(float)
labevents_cleaned['ref_range_lower'] = labevents_cleaned['ref_range_lower'].astype(float)
labevents_cleaned['ref_range_upper'] = labevents_cleaned['ref_range_upper'].astype(float)

# Convert date fields to datetime64 for procedures
procedures_cleaned['chartdate'] = pd.to_datetime(procedures_cleaned['chartdate'])

# Convert categorical fields to category for procedures
procedures_cleaned['icd_code'] = procedures_cleaned['icd_code'].astype('category')

# Convert numerical fields to appropriate types for procedures
procedures_cleaned['subject_id'] = procedures_cleaned['subject_id'].astype(int)
procedures_cleaned['hadm_id'] = procedures_cleaned['hadm_id'].astype(int)
procedures_cleaned['seq_num'] = procedures_cleaned['seq_num'].astype(int)
procedures_cleaned['icd_version'] = procedures_cleaned['icd_version'].astype(int)

# Convert columns to appropriate data types for prescriptions
prescriptions_cleaned["hadm_id"] = prescriptions_cleaned["hadm_id"].astype(int)
prescriptions_cleaned["subject_id"] = prescriptions_cleaned["subject_id"].astype(int)
prescriptions_cleaned["drug"] = prescriptions_cleaned["drug"].astype(str)
prescriptions_cleaned["drug_type"] = prescriptions_cleaned["drug_type"].astype(str)
prescriptions_cleaned["formulary_drug_cd"] = prescriptions_cleaned["formulary_drug_cd"].astype(str)
prescriptions_cleaned["gsn"] = prescriptions_cleaned["gsn"].astype(str)
prescriptions_cleaned["ndc"] = prescriptions_cleaned["ndc"].astype(str)

# Convert datetime columns for prescriptions
prescriptions_cleaned["starttime"] = pd.to_datetime(prescriptions_cleaned["starttime"], errors="coerce")
prescriptions_cleaned["stoptime"] = pd.to_datetime(prescriptions_cleaned["stoptime"], errors="coerce")

### X1​: Charlson Comorbidity Index (CCI) (Categorical) — A measure of overall patient comorbidity; higher scores suggest a greater risk of complications and readmission.

In [5]:
# Complete ICD-9 and ICD-10 to Comorbidity Mappings
icd_to_comorbidity = {
    # Myocardial Infarction
    "410": "myocardial_infarction",  # ICD-9
    "412": "myocardial_infarction",  # ICD-9
    "I21": "myocardial_infarction",  # ICD-10
    "I22": "myocardial_infarction",  # ICD-10

    # Congestive Heart Failure
    "428": "congestive_heart_failure",  # ICD-9
    "I50": "congestive_heart_failure",  # ICD-10

    # Peripheral Vascular Disease
    "443.9": "peripheral_vascular_disease",  # ICD-9
    "441": "peripheral_vascular_disease",  # ICD-9
    "I73.9": "peripheral_vascular_disease",  # ICD-10
    "I70": "peripheral_vascular_disease",  # ICD-10

    # Cerebrovascular Disease
    "430": "cerebrovascular_disease",  # ICD-9
    "431": "cerebrovascular_disease",  # ICD-9
    "432": "cerebrovascular_disease",  # ICD-9
    "433": "cerebrovascular_disease",  # ICD-9
    "434": "cerebrovascular_disease",  # ICD-9
    "435": "cerebrovascular_disease",  # ICD-9
    "I60": "cerebrovascular_disease",  # ICD-10
    "I61": "cerebrovascular_disease",  # ICD-10
    "I62": "cerebrovascular_disease",  # ICD-10
    "I63": "cerebrovascular_disease",  # ICD-10
    "I64": "cerebrovascular_disease",  # ICD-10

    # Dementia
    "290": "dementia",  # ICD-9
    "F00": "dementia",  # ICD-10
    "F01": "dementia",  # ICD-10
    "F03": "dementia",  # ICD-10
    "G30": "dementia",  # ICD-10

    # Chronic Pulmonary Disease
    "490": "chronic_pulmonary_disease",  # ICD-9
    "491": "chronic_pulmonary_disease",  # ICD-9
    "492": "chronic_pulmonary_disease",  # ICD-9
    "493": "chronic_pulmonary_disease",  # ICD-9
    "494": "chronic_pulmonary_disease",  # ICD-9
    "495": "chronic_pulmonary_disease",  # ICD-9
    "496": "chronic_pulmonary_disease",  # ICD-9
    "J40": "chronic_pulmonary_disease",  # ICD-10
    "J41": "chronic_pulmonary_disease",  # ICD-10
    "J42": "chronic_pulmonary_disease",  # ICD-10
    "J43": "chronic_pulmonary_disease",  # ICD-10
    "J44": "chronic_pulmonary_disease",  # ICD-10

    # Rheumatic Disease
    "710": "rheumatic_disease",  # ICD-9
    "714": "rheumatic_disease",  # ICD-9
    "725": "rheumatic_disease",  # ICD-9
    "M05": "rheumatic_disease",  # ICD-10
    "M06": "rheumatic_disease",  # ICD-10
    "M32": "rheumatic_disease",  # ICD-10

    # Peptic Ulcer Disease
    "531": "peptic_ulcer_disease",  # ICD-9
    "532": "peptic_ulcer_disease",  # ICD-9
    "533": "peptic_ulcer_disease",  # ICD-9
    "K25": "peptic_ulcer_disease",  # ICD-10
    "K26": "peptic_ulcer_disease",  # ICD-10
    "K27": "peptic_ulcer_disease",  # ICD-10

    # Mild Liver Disease
    "571.2": "mild_liver_disease",  # ICD-9
    "571.5": "mild_liver_disease",  # ICD-9
    "K73": "mild_liver_disease",  # ICD-10
    "K74.0": "mild_liver_disease",  # ICD-10
    "K74.3": "mild_liver_disease",  # ICD-10

    # Diabetes without Complications
    "250": "diabetes_without_complications",  # ICD-9
    "E10": "diabetes_without_complications",  # ICD-10
    "E11": "diabetes_without_complications",  # ICD-10

    # Diabetes with Complications
    "250.4": "diabetes_with_complications",  # ICD-9
    "E10.5": "diabetes_with_complications",  # ICD-10
    "E11.5": "diabetes_with_complications",  # ICD-10

    # Hemiplegia or Paraplegia
    "344": "hemiplegia_or_paraplegia",  # ICD-9
    "G82": "hemiplegia_or_paraplegia",  # ICD-10

    # Renal Disease
    "582": "renal_disease",  # ICD-9
    "583": "renal_disease",  # ICD-9
    "584": "renal_disease",  # ICD-9
    "N18": "renal_disease",  # ICD-10
    "N19": "renal_disease",  # ICD-10

    # Cancer (Non-Metastatic)
    "140": "cancer",  # ICD-9
    "199": "cancer",  # ICD-9
    "C00": "cancer",  # ICD-10
    "C76": "cancer",  # ICD-10

    # Moderate/Severe Liver Disease
    "572": "moderate_or_severe_liver_disease",  # ICD-9
    "K72": "moderate_or_severe_liver_disease",  # ICD-10

    # Metastatic Cancer
    "196": "metastatic_cancer",  # ICD-9
    "199": "metastatic_cancer",  # ICD-9
    "C77": "metastatic_cancer",  # ICD-10
    "C80": "metastatic_cancer",  # ICD-10

    # AIDS
    "042": "aids",  # ICD-9
    "B20": "aids",  # ICD-10
}

# CCI Weights
cci_weights = {
    "myocardial_infarction": 1,
    "congestive_heart_failure": 1,
    "peripheral_vascular_disease": 1,
    "cerebrovascular_disease": 1,
    "dementia": 1,
    "chronic_pulmonary_disease": 1,
    "rheumatic_disease": 1,
    "peptic_ulcer_disease": 1,
    "mild_liver_disease": 1,
    "diabetes_without_complications": 1,
    "diabetes_with_complications": 2,
    "hemiplegia_or_paraplegia": 2,
    "renal_disease": 2,
    "cancer": 2,
    "moderate_or_severe_liver_disease": 3,
    "metastatic_cancer": 6,
    "aids": 6,
}

In [6]:
# Ensure ICD codes are strings
diagnoses_cleaned["icd_code"] = diagnoses_cleaned["icd_code"].astype(str)

# Map ICD codes to comorbidities
diagnoses_cleaned["comorbidity"] = diagnoses_cleaned["icd_code"].map(icd_to_comorbidity)

# Group comorbidities by admission
admission_comorbidities = (
    diagnoses_cleaned.groupby("hadm_id")["comorbidity"]
    .apply(set)  # Use a set to avoid duplicate comorbidities
    .reset_index()
)

admission_comorbidities.rename(columns={"comorbidity": "comorbidities"}, inplace=True)

# Function to calculate CCI score
def calculate_cci(comorbidities, weights):
    return sum(weights.get(comorbidity, 0) for comorbidity in comorbidities)

# Apply the function to compute CCI scores
admission_comorbidities["cci_score"] = admission_comorbidities["comorbidities"].apply(
    lambda comorbidities: calculate_cci(comorbidities, cci_weights)
)

# Merge CCI scores into admissions dataset
admissions_cleaned = admissions_cleaned.merge(
    admission_comorbidities[["hadm_id", "cci_score"]], on="hadm_id", how="left"
)

# Fill missing values with 0
admissions_cleaned["cci_score"] = admissions_cleaned["cci_score"].fillna(0).astype("int")

print(admissions_cleaned["cci_score"].describe())
print(admissions_cleaned["cci_score"].value_counts())

count    546028.000000
mean          0.104535
std           0.558763
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           8.000000
Name: cci_score, dtype: float64
cci_score
0    510134
1     29895
6      3522
2      2239
7       200
3        26
8        12
Name: count, dtype: int64


In [30]:
cci_proportions = admissions_cleaned['cci_score'].value_counts(normalize=True)
print(cci_proportions)

cci_score
0    0.934263
1    0.054749
6    0.006450
2    0.004102
7    0.000366
3    0.000048
8    0.000022
Name: proportion, dtype: float64


### X2​: Length of Stay (LOS) (Numerical) — Longer stays can reflect severity of illness, which might increase readmission risk.


In [8]:
admissions_cleaned["length_of_stay"] = (
    pd.to_datetime(admissions_cleaned["dischtime"]) - pd.to_datetime(admissions_cleaned["admittime"])
).dt.days.clip(lower=0)

print(admissions_cleaned["length_of_stay"].describe())
print(admissions_cleaned["length_of_stay"].value_counts())

count    546028.000000
mean          4.221353
std           7.201081
min           0.000000
25%           1.000000
50%           2.000000
75%           5.000000
max         515.000000
Name: length_of_stay, dtype: float64
length_of_stay
0      120812
1       93497
2       75746
3       56912
4       45216
        ...  
321         1
515         1
295         1
152         1
184         1
Name: count, Length: 195, dtype: int64


### X3​: Age (Categorical) — Older age is often associated with a higher risk of readmission.


In [9]:
# Define age groups
def categorize_age(age):
    if age < 18:
        return "Youth"
    elif 18 <= age <= 35:
        return "Young"
    elif 36 <= age <= 60:
        return "Middle"
    else:
        return "Old"

# Apply categorization
admissions_cleaned["age_group"] = admissions_cleaned["age"].apply(categorize_age)

In [31]:
print(admissions_cleaned["age_group"].describe())
print(admissions_cleaned["age_group"].value_counts())
print(admissions_cleaned["age_group"].value_counts(normalize=True))

count     546038
unique         3
top          Old
freq      277130
Name: age_group, dtype: object
age_group
Old       277130
Middle    187306
Young      81602
Name: count, dtype: int64
age_group
Old       0.507529
Middle    0.343027
Young     0.149444
Name: proportion, dtype: float64


### X4: Discharge Disposition (Categorical) — Whether the patient was discharged to home, a skilled nursing facility, or another setting can influence readmission likelihood.


In [11]:
admissions_cleaned['discharge_location'].value_counts()

discharge_location
HOME                            194204
HOME HEALTH CARE                 99305
SKILLED NURSING FACILITY         52657
REHAB                            13845
DIED                             11721
CHRONIC/LONG TERM ACUTE CARE      8125
HOSPICE                           5397
AGAINST ADVICE                    3393
PSYCH FACILITY                    2965
ACUTE HOSPITAL                    2334
OTHER FACILITY                    1592
ASSISTED LIVING                    622
HEALTHCARE FACILITY                 50
Name: count, dtype: int64

In [32]:
# Copy 'discharge_location' to 'discharge_disposition'
admissions_cleaned["discharge_disposition"] = admissions_cleaned["discharge_location"]

# Add "unknown" as a new category
admissions_cleaned["discharge_disposition"] = admissions_cleaned["discharge_disposition"].cat.add_categories(["UNKNOWN"])

# Fill missing values with "unknown"
admissions_cleaned["discharge_disposition"] = admissions_cleaned["discharge_disposition"].fillna("UNKNOWN").astype("category")

# Check the distribution of discharge dispositions
print(admissions_cleaned["discharge_disposition"].describe())
print(admissions_cleaned["discharge_disposition"].value_counts())
print(admissions_cleaned["discharge_disposition"].value_counts(normalize=True))

count     546038
unique        14
top         HOME
freq      194205
Name: discharge_disposition, dtype: object
discharge_disposition
HOME                            194205
UNKNOWN                         149825
HOME HEALTH CARE                 99305
SKILLED NURSING FACILITY         52658
REHAB                            13845
DIED                             11722
CHRONIC/LONG TERM ACUTE CARE      8125
HOSPICE                           5397
AGAINST ADVICE                    3393
PSYCH FACILITY                    2965
ACUTE HOSPITAL                    2334
OTHER FACILITY                    1592
ASSISTED LIVING                    622
HEALTHCARE FACILITY                 50
Name: count, dtype: int64
discharge_disposition
HOME                            0.355662
UNKNOWN                         0.274386
HOME HEALTH CARE                0.181865
SKILLED NURSING FACILITY        0.096437
REHAB                           0.025355
DIED                            0.021467
CHRONIC/LONG TERM ACUTE CAR

### X5​: Major Diagnostic Categories (MCD) (Categorical) — Grouped rimary diagnoses into clinically meaningful categories based on affected body systems or etiology, providing a high-level summary of the patient's condition. Certain diagnoses are more prone to readmissions.


In [13]:
import re

# Define a function to map ICD codes to MDC categories
def icd_to_mdc(icd_code):
    if pd.isnull(icd_code):
        return 'Unknown'  # Handle missing values

    # Convert the ICD code to a string and clean it
    icd_code = str(icd_code).strip().upper()
    icd_code = icd_code.replace('.', '')  # Remove decimal points for consistency

    # Handle ICD-9 Codes (numeric)
    if icd_code.isdigit():
        icd_num = int(icd_code)  # Convert to integer for comparison
        if 1 <= icd_num <= 139:
            return 'Infectious and Parasitic Diseases'
        elif 140 <= icd_num <= 239:
            return 'Neoplasms'
        elif 240 <= icd_num <= 279:
            return 'Endocrine, Nutritional, and Metabolic Diseases'
        elif 280 <= icd_num <= 289:
            return 'Diseases of the Blood and Blood-forming Organs'
        elif 290 <= icd_num <= 319:
            return 'Mental Disorders'
        elif 320 <= icd_num <= 389:
            return 'Diseases of the Nervous System and Sense Organs'
        elif 390 <= icd_num <= 459:
            return 'Diseases of the Circulatory System'
        elif 460 <= icd_num <= 519:
            return 'Diseases of the Respiratory System'
        elif 520 <= icd_num <= 579:
            return 'Diseases of the Digestive System'
        elif 580 <= icd_num <= 629:
            return 'Diseases of the Genitourinary System'
        elif 630 <= icd_num <= 679:
            return 'Complications of Pregnancy, Childbirth, and the Puerperium'
        elif 680 <= icd_num <= 709:
            return 'Diseases of the Skin and Subcutaneous Tissue'
        elif 710 <= icd_num <= 739:
            return 'Diseases of the Musculoskeletal System and Connective Tissue'
        elif 740 <= icd_num <= 759:
            return 'Congenital Anomalies'
        elif 760 <= icd_num <= 779:
            return 'Certain Conditions Originating in the Perinatal Period'
        elif 780 <= icd_num <= 799:
            return 'Symptoms, Signs, and Ill-defined Conditions'
        elif 800 <= icd_num <= 999:
            return 'Injury and Poisoning'
        else:
            return 'Other'

    # Handle ICD-10 Codes (alphanumeric)
    elif re.match(r'^[A-Z]', icd_code):  # Starts with a letter
        icd_char = icd_code[0]
        if icd_char in ['A', 'B']:
            return 'Certain Infectious and Parasitic Diseases'
        elif icd_char == 'C' or (icd_char == 'D' and len(icd_code) > 1 and icd_code[1].isdigit() and int(icd_code[1]) <= 4):
            return 'Neoplasms'
        elif icd_char == 'D':
            return 'Diseases of the Blood and Blood-forming Organs and Immune Disorders'
        elif icd_char == 'E':
            return 'Endocrine, Nutritional, and Metabolic Diseases'
        elif icd_char == 'F':
            return 'Mental and Behavioural Disorders'
        elif icd_char == 'G':
            return 'Diseases of the Nervous System'
        elif icd_char == 'H':
            if icd_code.startswith('H0') or icd_code.startswith('H1'):
                return 'Diseases of the Eye and Adnexa'
            else:
                return 'Diseases of the Ear and Mastoid Process'
        elif icd_char == 'I':
            return 'Diseases of the Circulatory System'
        elif icd_char == 'J':
            return 'Diseases of the Respiratory System'
        elif icd_char == 'K':
            return 'Diseases of the Digestive System'
        elif icd_char == 'L':
            return 'Diseases of the Skin and Subcutaneous Tissue'
        elif icd_char == 'M':
            return 'Diseases of the Musculoskeletal System and Connective Tissue'
        elif icd_char == 'N':
            return 'Diseases of the Genitourinary System'
        elif icd_char == 'O':
            return 'Pregnancy, Childbirth, and the Puerperium'
        elif icd_char == 'P':
            return 'Certain Conditions Originating in the Perinatal Period'
        elif icd_char == 'Q':
            return 'Congenital Malformations and Chromosomal Abnormalities'
        elif icd_char == 'R':
            return 'Symptoms, Signs, and Abnormal Clinical and Laboratory Findings'
        elif icd_char in ['S', 'T']:
            return 'Injury, Poisoning, and Certain Other Consequences of External Causes'
        elif icd_char in ['V', 'W', 'X', 'Y']:
            return 'External Causes of Morbidity and Mortality'
        elif icd_char == 'Z':
            return 'Factors Influencing Health Status and Contact with Health Services'
        else:
            return 'Other'

    # If the ICD code does not match any known pattern
    return 'Other'

# Extract primary diagnoses and map ICD codes to MDC categories
primary_diagnoses = diagnoses_cleaned[diagnoses_cleaned["seq_num"] == 1][["hadm_id", "icd_code"]]
primary_diagnoses["mdc_category"] = primary_diagnoses["icd_code"].apply(icd_to_mdc)

# Merge MDC categories with admissions data
admissions_cleaned = admissions_cleaned.merge(primary_diagnoses[["hadm_id", "mdc_category"]], on="hadm_id", how="left")

# Convert the mdc_category column to a categorical type
admissions_cleaned["mdc_category"] = admissions_cleaned["mdc_category"].astype("category")

# Add "Unknown" as a category if not already present
if 'Unknown' not in admissions_cleaned["mdc_category"].cat.categories:
    admissions_cleaned["mdc_category"] = admissions_cleaned["mdc_category"].cat.add_categories(["Unknown"])

# Fill missing values for mdc_category with "Unknown"
admissions_cleaned["mdc_category"] = admissions_cleaned["mdc_category"].fillna("Unknown")

In [33]:
print(admissions_cleaned["mdc_category"].value_counts())
print(admissions_cleaned["mdc_category"].value_counts(normalize=True))
print(admissions_cleaned["mdc_category"].describe())

mdc_category
Other                                                                   266190
Diseases of the Circulatory System                                       45956
Diseases of the Digestive System                                         31424
Injury, Poisoning, and Certain Other Consequences of External Causes     29989
Mental and Behavioural Disorders                                         19295
Symptoms, Signs, and Abnormal Clinical and Laboratory Findings           18459
Neoplasms                                                                17886
Diseases of the Respiratory System                                       16944
Diseases of the Musculoskeletal System and Connective Tissue             12789
Certain Infectious and Parasitic Diseases                                12472
Pregnancy, Childbirth, and the Puerperium                                11870
Diseases of the Genitourinary System                                     10796
Endocrine, Nutritional, and Metabolic D

### X6: Use of Mechanical Ventilation (Binary/Categorical) — Patients who required ventilation may have a higher risk of complications post-discharge.


In [37]:
# Define mechanical ventilation procedure codes
ventilation_codes = [
    "96.70", "96.71", "96.72",  # ICD-9 codes
    "5A1935Z", "5A1945Z", "5A1955Z"  # ICD-10 codes
]

# Flag admissions with mechanical ventilation
procedures_cleaned["mechanical_ventilation"] = procedures_cleaned["icd_code"].isin(ventilation_codes).astype(int)

# Group by hadm_id to determine if ventilation occurred for each admission
ventilation_status = procedures_cleaned.groupby("hadm_id")["mechanical_ventilation"].max().reset_index()

# Rename column for clarity
ventilation_status.rename(columns={"mechanical_ventilation": "ventilation_used"}, inplace=True)

# Merge ventilation status with the admissions dataset
admissions_cleaned = admissions_cleaned.merge(ventilation_status, on="hadm_id", how="left")

# Fill missing values with 0 (no ventilation)
admissions_cleaned["ventilation_used"] = admissions_cleaned["ventilation_used"].fillna(0).astype("int")

# Verify the distribution of ventilation usage
print(admissions_cleaned["ventilation_used"].value_counts())
print(admissions_cleaned["ventilation_used"].value_counts(normalize=True))


ventilation_used
0    535561
1     10477
Name: count, dtype: int64
ventilation_used
0    0.980813
1    0.019187
Name: proportion, dtype: float64


### X7​: Medication Count at Discharge (Numerical) — A higher number of medications may indicate complex health issues.


In [16]:
# Filter prescriptions that overlap with the discharge time
prescriptions_cleaned["active_at_discharge"] = (
    (prescriptions_cleaned["stoptime"].isnull()) |  # Ongoing medication
    (prescriptions_cleaned["stoptime"] >= prescriptions_cleaned["starttime"])
)

# Ensure only relevant prescriptions are retained
active_prescriptions = prescriptions_cleaned[prescriptions_cleaned["active_at_discharge"]]

# Count unique medications (drug names) per hospital admission
medication_counts = (
    active_prescriptions.groupby("hadm_id")["drug"]
    .nunique()  # Count unique drugs
    .reset_index()
)

# Rename column for clarity
medication_counts.rename(columns={"drug": "medication_count"}, inplace=True)

# Merge medication counts with admissions data
admissions_cleaned = admissions_cleaned.merge(medication_counts, on="hadm_id", how="left")

# Fill missing values with 0 (no medications)
admissions_cleaned["medication_count"] = admissions_cleaned["medication_count"].fillna(0).astype("int")  # Default for counts

# Verify the distribution of medication counts
print(admissions_cleaned["medication_count"].describe())
print(admissions_cleaned["medication_count"].value_counts())

count    546038.000000
mean         20.105134
std          15.878812
min           0.000000
25%          10.000000
50%          18.000000
75%          27.000000
max         195.000000
Name: medication_count, dtype: float64
medication_count
0      82731
16     19485
15     19397
14     19292
17     19149
       ...  
181        1
192        1
175        1
149        1
157        1
Name: count, Length: 168, dtype: int64


### X8​: Serum Creatinine (Numerical) — Reflects kidney function; abnormalities can suggest chronic conditions that influence readmission risk.


In [17]:
# Define itemids for serum creatinine (based on `d_labitems.csv`)
serum_creatinine_itemids = [50912]  # Example itemid for serum creatinine

# Filter labevents for serum creatinine tests
serum_creatinine = labevents_cleaned[
    labevents_cleaned["itemid"].isin(serum_creatinine_itemids)
]

# Merge serum creatinine with discharge time to filter valid values
serum_creatinine = serum_creatinine.merge(
    admissions_cleaned[["hadm_id", "dischtime"]],
    on="hadm_id",
    how="inner"
)

# Keep only lab values recorded before discharge
serum_creatinine = serum_creatinine[
    serum_creatinine["charttime"] <= serum_creatinine["dischtime"]
]

# Sort by hadm_id and charttime, keeping the most recent value
serum_creatinine = serum_creatinine.sort_values(by=["hadm_id", "charttime"], ascending=[True, False])
most_recent_creatinine = serum_creatinine.groupby("hadm_id").first().reset_index()

# Select relevant columns
most_recent_creatinine = most_recent_creatinine[["hadm_id", "valuenum"]]
most_recent_creatinine.rename(columns={"valuenum": "serum_creatinine"}, inplace=True)

# Merge serum creatinine values into admissions data
admissions_cleaned = admissions_cleaned.merge(
    most_recent_creatinine, on="hadm_id", how="left"
)

# Verify the distribution of serum creatinine values
print(admissions_cleaned["serum_creatinine"].describe())

count    415674.000000
mean          1.209527
std           1.311566
min           0.000000
25%           0.700000
50%           0.900000
75%           1.200000
max          29.500000
Name: serum_creatinine, dtype: float64


### X9​: Prior Admissions within 1 Year (Categorical) —  Patients with frequent hospitalizations may have chronic conditions or complications that increase the likelihood of readmission.

In [18]:
# Sort by patient ID and admission time
admissions_cleaned = admissions_cleaned.sort_values(by=["subject_id", "admittime"]).reset_index(drop=True)

# Calculate the time difference from the previous admission
admissions_cleaned["time_since_last_admission"] = (
    admissions_cleaned.groupby("subject_id")["admittime"].diff().dt.days
)

# Count prior admissions within 1 year
admissions_cleaned["prior_admissions"] = (
    admissions_cleaned.groupby("subject_id")["time_since_last_admission"]
    .apply(lambda x: x.between(1, 365).cumsum())
    .reset_index(drop=True)  # Aligns the grouped result with the original DataFrame
)

# Fill missing values with 0 (no prior admissions)
admissions_cleaned["prior_admissions"] = admissions_cleaned["prior_admissions"].fillna(0).astype("int")  # Default for counts

# Define admission frequency groups
def categorize_admission_freq(age):
    if age == 0:
        return "No prior admissions (0 admissions)" # No prior admissions
    elif age <= 2:
        return "Low frequency (1-2 admissions)" # 1-2 admissions
    elif age <= 4:
        return "Moderate frequency (3-4 admissions)" # 3-4 admissions
    else:
        return "High frequency (5 or more admissions)" # 5 or more admissions

# Apply categorization
admissions_cleaned["prior_admissions_1yr"] = admissions_cleaned["prior_admissions"].apply(categorize_admission_freq)

# Verify the distribution of prior admissions within 1 year
print(admissions_cleaned["prior_admissions_1yr"].describe())
print(admissions_cleaned["prior_admissions_1yr"].value_counts())


count                                 546038
unique                                     4
top       No prior admissions (0 admissions)
freq                                  275756
Name: prior_admissions_1yr, dtype: object
prior_admissions_1yr
No prior admissions (0 admissions)       275756
Low frequency (1-2 admissions)           137810
High frequency (5 or more admissions)     84453
Moderate frequency (3-4 admissions)       48019
Name: count, dtype: int64


In [39]:
print(admissions_cleaned["prior_admissions_1yr"].value_counts(normalize=True))

prior_admissions_1yr
No prior admissions (0 admissions)       0.505012
Low frequency (1-2 admissions)           0.252382
High frequency (5 or more admissions)    0.154665
Moderate frequency (3-4 admissions)      0.087941
Name: proportion, dtype: float64


### X10: Number of Lab Tests Ordered (Numerical) - A higher number of lab tests may indicate greater severity of illness or a more complicated hospital course.

In [19]:
# Count the number of lab tests per admission
lab_tests_count = labevents_cleaned.groupby("hadm_id")["itemid"].count().reset_index()
lab_tests_count.rename(columns={"itemid": "num_lab_tests"}, inplace=True)

# Merge lab tests count into admissions data
admissions_cleaned = admissions_cleaned.merge(lab_tests_count, on="hadm_id", how="left")

# Fill missing values with 0 (no lab tests)
admissions_cleaned["num_lab_tests"] = admissions_cleaned["num_lab_tests"].fillna(0).astype("int")  # Default for counts

# Verify the distribution of lab tests
print(admissions_cleaned["num_lab_tests"].describe())



count    546038.000000
mean        154.946095
std         336.576722
min           0.000000
25%          20.000000
50%          66.000000
75%         165.000000
max       21655.000000
Name: num_lab_tests, dtype: float64


### X11: Diagnosis Count (Numerical) - A higher number of diagnoses during an admission might indicate comorbidities or complications, which could increase readmission risk.

In [20]:
# Count the number of diagnoses per admission
diagnosis_count = diagnoses_cleaned.groupby("hadm_id")["icd_code"].count().reset_index()
diagnosis_count.rename(columns={"icd_code": "num_diagnoses"}, inplace=True)

# Merge diagnosis count into admissions data
admissions_cleaned = admissions_cleaned.merge(diagnosis_count, on="hadm_id", how="left")

# Fill missing values with 0 (no diagnoses)
admissions_cleaned["num_diagnoses"] = admissions_cleaned["num_diagnoses"].fillna(0).astype("int")  # Default for counts

# Verify the distribution of diagnoses
print(admissions_cleaned["num_diagnoses"].describe())


count    546038.000000
mean         11.656216
std           7.631517
min           0.000000
25%           6.000000
50%          10.000000
75%          16.000000
max          57.000000
Name: num_diagnoses, dtype: float64


### Y1: 30-day Readmission (Binary/Categorical) - The health outcome, the patients that are readmitted to the hospital within 30-days.

In [21]:
# Sort by patient ID and admission time
admissions_cleaned = admissions_cleaned.sort_values(by=["subject_id", "admittime"]).reset_index(drop=True)

# Calculate time difference to the next admission for the same patient
admissions_cleaned["time_to_next_admission"] = admissions_cleaned.groupby("subject_id")["admittime"].shift(-1) - admissions_cleaned["dischtime"]

# Convert time difference to days
admissions_cleaned["time_to_next_admission_days"] = admissions_cleaned["time_to_next_admission"].dt.days

# Determine 30-day readmission (1 if time_to_next_admission_days <= 30, else 0)
admissions_cleaned["readmitted_30_days"] = (admissions_cleaned["time_to_next_admission_days"] <= 30).astype("int")

# Fill NaN values for the last admission of each patient with 0 (no readmission)
admissions_cleaned["readmitted_30_days"] = admissions_cleaned["readmitted_30_days"].fillna(0)

# Verify the result
print(admissions_cleaned[["subject_id", "hadm_id", "admittime", "dischtime", "time_to_next_admission_days", "readmitted_30_days"]].head(10))

   subject_id   hadm_id           admittime           dischtime  \
0    10000032  22595853 2180-05-06 22:23:00 2180-05-07 17:15:00   
1    10000032  22841357 2180-06-26 18:27:00 2180-06-27 18:49:00   
2    10000032  29079034 2180-07-23 12:35:00 2180-07-25 17:55:00   
3    10000032  25742920 2180-08-05 23:44:00 2180-08-07 17:50:00   
4    10000068  25022803 2160-03-03 23:16:00 2160-03-04 06:26:00   
5    10000084  23052089 2160-11-21 01:56:00 2160-11-25 14:52:00   
6    10000084  29888819 2160-12-28 05:11:00 2160-12-28 16:07:00   
7    10000108  27250926 2163-09-27 23:17:00 2163-09-28 09:04:00   
8    10000117  22927623 2181-11-15 02:05:00 2181-11-15 14:52:00   
9    10000117  27988844 2183-09-18 18:10:00 2183-09-21 16:30:00   

   time_to_next_admission_days  readmitted_30_days  
0                         50.0                   0  
1                         25.0                   1  
2                         11.0                   1  
3                          NaN                   0

In [22]:
admissions_cleaned['readmitted_30_days'].value_counts()

readmitted_30_days
0    434993
1    111045
Name: count, dtype: int64

## Combine predictors into a single dataset for modeling

In [23]:
admissions_cleaned.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag', 'gender',
       'anchor_age', 'anchor_year', 'anchor_year_group', 'dod',
       'admittime_year', 'age', 'cci_score', 'length_of_stay', 'age_group',
       'discharge_disposition', 'mdc_category', 'ventilation_used',
       'medication_count', 'serum_creatinine', 'time_since_last_admission',
       'prior_admissions', 'prior_admissions_1yr', 'num_lab_tests',
       'num_diagnoses', 'time_to_next_admission',
       'time_to_next_admission_days', 'readmitted_30_days'],
      dtype='object')

In [27]:
# Final dataset for modeling
predictors = admissions_cleaned[[
    "subject_id", "hadm_id", "cci_score", "length_of_stay", "age_group", "discharge_disposition",
    "mdc_category", "ventilation_used", "medication_count",
    "serum_creatinine", "prior_admissions_1yr", "num_lab_tests", "num_diagnoses", "readmitted_30_days"
]]

print(predictors.describe())

         subject_id       hadm_id      cci_score  length_of_stay  \
count  5.460380e+05  5.460380e+05  546038.000000   546038.000000   
mean   1.501117e+07  2.500100e+07       0.104537        4.221307   
std    2.877700e+06  2.888712e+06       0.558764        7.201032   
min    1.000003e+07  2.000002e+07       0.000000        0.000000   
25%    1.252380e+07  2.249663e+07       0.000000        1.000000   
50%    1.501954e+07  2.500382e+07       0.000000        2.000000   
75%    1.750401e+07  2.750282e+07       0.000000        5.000000   
max    1.999999e+07  2.999994e+07       8.000000      515.000000   

       ventilation_used  medication_count  serum_creatinine  num_lab_tests  \
count     546038.000000     546038.000000     415674.000000  546038.000000   
mean           0.019187         20.105134          1.209527     154.946095   
std            0.137183         15.878812          1.311566     336.576722   
min            0.000000          0.000000          0.000000       0.000000 

In [28]:
# Ensure correct data types for predictors
predictors = predictors.astype({
    "subject_id": "int",
    "hadm_id": "int",
    "cci_score": "category",
    "length_of_stay": "int",
    "age_group": "category",
    "discharge_disposition": "category",
    "mdc_category": "category",
    "ventilation_used": "category",
    "medication_count": "int",
    "serum_creatinine": "float",
    "prior_admissions_1yr": "category",
    "num_lab_tests": "int",
    "num_diagnoses": "int",
    "readmitted_30_days": "int"
})


In [29]:
# Save the dataset for modeling
predictors.to_csv(f"{data_dir}/predictors_dataset.csv", index=False)
print("Predictors dataset saved.")

Predictors dataset saved.
