Reminder: clear all output before saving.

# Table of Contents
1. Create chartevents_trust for trust.ipynb
2. Create code_status for outcomes_ml.ipynb
3. Create icustay_detail to be used as input for other tables
4. Create demographics for race_mimic_aggressive.ipynb and mistrust_mimic_aggressive.ipynb
5. Create discharge for race_mimic_aggressive.ipynb and mistrust_mimic_aggressive.ipynb
6. Create mortality_outcomes and mortality_aggressive for outcomes_ml.ipynb and the _aggressive notebooks
7. Create noteevents_trust for trust.ipynb

In [None]:
import gc
import numpy as np
import polars as pl

In [None]:
# Directory that holds the source CSV files (CHARTEVENTS.csv, ADMISSIONS.csv, ...);
# every parquet file produced by this notebook is written into the same directory.
DATA_LOCATION = 'Mimic3_Data'

In [None]:
# Lazy scans of the two item tables shared by the cells below; nothing is read
# from disk until a downstream cell calls .collect().
# VALUE is read as a string column regardless of what schema inference would pick.
chartevents = (
    pl.scan_csv(
        f'{DATA_LOCATION}/CHARTEVENTS.csv',
        schema_overrides={'VALUE': pl.String()},
        infer_schema_length=20000,
        ignore_errors=True,
    )
    .select(['HADM_ID', 'ITEMID', 'VALUE'])
    .unique()
)
d_items = (
    pl.scan_csv(
        f'{DATA_LOCATION}/D_ITEMS.csv',
        infer_schema_length=20000,
        ignore_errors=True,
    )
    .select(['ITEMID', 'LABEL'])
    .unique()
)


In [None]:
# Interpersonal-interaction item labels to look for in chartevents.
# The literal holds one label per line; indentation and trailing padding are
# stripped when the list below is built.

relevant_labels = '''
    Family Communication
    Follows Commands
    Education Barrier
    Education Learner
    Education Method
    Education Readiness
    Education Topic #1
    Education Topic #2
    Pain
    Pain Level
    Pain Level (Rest)
    Pain Assess Method
    Restraint
    Restraint Type
    Restraint (Non-violent)
    Restraint Ordered (Non-violent)
    Restraint Location
    Reason For Restraint
    Spiritual Support
    Support Systems
    State
    Behavior
    Behavioral State
    Reason For Restraint
    Stress
    Safety
    Safety Measures_U_1
    Family
    Patient/Family Informed
    Pt./Family Informed
    Health Care Proxy
    BATH                
    bath                
    Bath                
    Bed Bath            
    bed bath            
    bed bath            
    Bedbath             
    CHG Bath            
    Skin Care           
    Judgement           
    Family Meeting held 
    Emotional / physical / sexual harm by partner or close relation
    Verbal Response
    Side Rails
    Orientation
    RSBI Deferred
    Richmond-RAS Scale
    Riker-SAS Scale
    Status and Comfort
    Teaching directed toward
    Consults
    Social work consult
    Sitter
    security
    safety
    headache
    hairwashed
    observer
'''

# One entry per non-blank line, in the order written above (duplicates are kept).
labels_only = [
    stripped
    for stripped in (raw_line.strip() for raw_line in relevant_labels.split('\n'))
    if stripped
]

In [None]:
# Build chartevents_trust.parquet (input for trust.ipynb): the HADM_ID / LABEL / VALUE
# of every chart event whose item label contains one of the phrases in labels_only
# (substring match, ASCII case-insensitive).
label_matches_phrase = pl.col('LABEL').str.contains_any(labels_only, ascii_case_insensitive=True)
d_items_trust = d_items.filter(label_matches_phrase)
chartevents_trust = (
    chartevents
    .join(d_items_trust, on='ITEMID', how='inner')
    .select(['HADM_ID', 'LABEL', 'VALUE'])
    .collect()
)
display(chartevents_trust.head())
chartevents_trust.write_parquet(f'{DATA_LOCATION}/chartevents_trust.parquet')
# Drop the materialised frame before the next cell builds its own.
del chartevents_trust
gc.collect()

In [None]:
# Build code_status.parquet: chart events whose item label is exactly 'Code Status',
# carrying HADM_ID, ITEMID and VALUE from chartevents plus the matching LABEL.
d_items_race = d_items.filter(pl.col('LABEL').eq('Code Status'))
code_status = chartevents.join(d_items_race, how='inner', on='ITEMID')
code_status_df = code_status.collect()
display(code_status_df.head())
code_status_df.write_parquet(f'{DATA_LOCATION}/code_status.parquet')
# Drop the materialised frame before the next cell builds its own.
del code_status_df
gc.collect()

In [None]:
# Build icustay_detail.parquet: ICU stays joined to their admission and patient rows,
# restricted to admissions flagged as having chartevents data, plus a derived AGE column.
icustays = (
    pl.scan_csv(f'{DATA_LOCATION}/ICUSTAYS.csv')
    .select(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'INTIME', 'OUTTIME'])
)
admissions = (
    pl.scan_csv(f'{DATA_LOCATION}/ADMISSIONS.csv')
    .select(['HADM_ID', 'ETHNICITY', 'HAS_CHARTEVENTS_DATA'])
)
patients = (
    pl.scan_csv(f'{DATA_LOCATION}/PATIENTS.csv')
    .select(['SUBJECT_ID', 'GENDER', 'DOB'])
)

icustay_detail = (
    icustays
    .join(admissions, on='HADM_ID', how='inner')
    .join(patients, on='SUBJECT_ID', how='inner')
    .filter(pl.col('HAS_CHARTEVENTS_DATA') == 1)
    .select([
        'SUBJECT_ID',
        'HADM_ID',
        'ICUSTAY_ID',
        'GENDER',
        'ETHNICITY',
        'INTIME',
        'OUTTIME',
        'DOB',
    ])
    # The three timestamp strings are parsed to the Date dtype, so only the
    # calendar day is kept.
    .with_columns(
        pl.col('INTIME', 'OUTTIME', 'DOB').str.to_date('%Y-%m-%d %H:%M:%S')
    )
)

# Whole days between date of birth and ICU admission.
days_since_birth = (
    pl.col('INTIME').cast(pl.Datetime) - pl.col('DOB').cast(pl.Datetime)
).dt.total_days()

icustay_detail_df = icustay_detail.collect()
icustay_detail_df = icustay_detail_df.with_columns(
    (days_since_birth / 365.25)
    .floor()
    .clip(0, 90)  # ages above 89 are obfuscated, so just keep them at 90
    .alias('AGE')
)
display(icustay_detail_df.head())
icustay_detail_df.write_parquet(f'{DATA_LOCATION}/icustay_detail.parquet')
# Drop the materialised frame before the next cell builds its own.
del icustay_detail_df
gc.collect()

In [None]:
# Build demographics.parquet: the distinct demographic rows
# (SUBJECT_ID, HADM_ID, GENDER, ETHNICITY, AGE) found in icustay_detail.parquet.
demographics = (
    pl.scan_parquet(f'{DATA_LOCATION}/icustay_detail.parquet')
    .select(['SUBJECT_ID', 'HADM_ID', 'GENDER', 'ETHNICITY', 'AGE'])
    .unique()
    .collect()
)
display(demographics.head())
demographics.write_parquet(f'{DATA_LOCATION}/demographics.parquet')
# Drop the materialised frame before the next cell builds its own.
del demographics
gc.collect()

In [None]:
# Build discharge.parquet: the distinct discharge-related columns of ADMISSIONS.csv.
discharge = (
    pl.scan_csv(f'{DATA_LOCATION}/ADMISSIONS.csv')
    .select([
        'SUBJECT_ID',
        'HADM_ID',
        'ETHNICITY',
        'DISCHARGE_LOCATION',
        'ADMITTIME',
        'DISCHTIME',
    ])
    .unique()
    .collect()
)
display(discharge.head())
discharge.write_parquet(f'{DATA_LOCATION}/discharge.parquet')
# Drop the materialised frame before the next cell builds its own.
del discharge
gc.collect()

In [None]:
# Build mortality_outcomes.parquet (for outcomes_ml.ipynb): the distinct
# HADM_ID / HOSPITAL_EXPIRE_FLAG pairs from ADMISSIONS.csv.
mortality_outcomes = (
    pl.scan_csv(f'{DATA_LOCATION}/ADMISSIONS.csv')
    .select(['HADM_ID', 'HOSPITAL_EXPIRE_FLAG'])
    .unique()
    .collect()
)
display(mortality_outcomes.head())
mortality_outcomes.write_parquet(f'{DATA_LOCATION}/mortality_outcomes.parquet')
del mortality_outcomes
gc.collect()

# Build mortality_aggressive.parquet (for the other notebooks): the distinct
# SUBJECT_ID / DOD pairs from PATIENTS.csv.
mortality_aggressive = (
    pl.scan_csv(f'{DATA_LOCATION}/PATIENTS.csv')
    .select(['SUBJECT_ID', 'DOD'])
    .unique()
    .collect()
)
display(mortality_aggressive.head())
mortality_aggressive.write_parquet(f'{DATA_LOCATION}/mortality_aggressive.parquet')
del mortality_aggressive
gc.collect()

In [None]:
# Build noteevents_trust.parquet (input for trust.ipynb): the selected note columns
# for every NOTEEVENTS row whose ISERROR is null. TEXT is read as a string column.
noteevents_trust = (
    pl.scan_csv(
        f'{DATA_LOCATION}/NOTEEVENTS.csv',
        schema_overrides={'TEXT': pl.String()},
        infer_schema_length=20000,
        ignore_errors=True,
    )
    .select(['HADM_ID', 'CATEGORY', 'TEXT', 'CHARTDATE', 'CHARTTIME', 'ISERROR'])
    .filter(pl.col('ISERROR').is_null())
    .collect()
)
display(noteevents_trust.head())
noteevents_trust.write_parquet(f'{DATA_LOCATION}/noteevents_trust.parquet')
# Drop the materialised frame once it has been written.
del noteevents_trust
gc.collect()