In [2]:
import pandas as pd 
import psycopg2
import getpass


connection = psycopg2.connect(
    user = 'postgres',
    database="mimic",
    password=getpass.getpass("Enter postgres password"),
    #password="EMm=N*]9}4yE", 
    host="127.0.0.1", 
    port = "5433",
    options=f'-c search_path=mimiciii')

##############################################################################
####################### CONNECT TO MIMIC #####################################

#paste in input the directory of MIMIC
# C:\Users\Maria\Desktop\Work\data\MIMICIII\mimic3.db

#MIMIC_PATH = pathlib.PureWindowsPath(input('Please specify input folder of data: '))


#connection = sqlite3.connect(MIMIC_PATH)


# Create our test query
test_query = """
SELECT subject_id, hadm_id, admittime, dischtime, admission_type, diagnosis
FROM admissions
"""

# Run the query and assign the results to a variable
test = pd.read_sql_query(test_query,connection)

print("TEST:", test.head())

if isinstance(test, pd.DataFrame):
    print("Good to go!")
else:
    print("This is not a dataframe! something is wrong, check before you continue with the queries") 

TEST:    subject_id  hadm_id           admittime           dischtime admission_type  \
0          22   165315 2196-04-09 12:26:00 2196-04-10 15:54:00      EMERGENCY   
1          23   152223 2153-09-03 07:15:00 2153-09-08 19:10:00       ELECTIVE   
2          23   124321 2157-10-18 19:34:00 2157-10-25 14:00:00      EMERGENCY   
3          24   161859 2139-06-06 16:14:00 2139-06-09 12:48:00      EMERGENCY   
4          25   129635 2160-11-02 02:06:00 2160-11-05 14:55:00      EMERGENCY   

                                           diagnosis  
0                            BENZODIAZEPINE OVERDOSE  
1  CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...  
2                                         BRAIN MASS  
3                     INTERIOR MYOCARDIAL INFARCTION  
4                            ACUTE CORONARY SYNDROME  
Good to go!


## Define the mortality cohort

In [3]:
######################################################################################
######################################################################################

#  COHORT SELECTION

######################################################################################
######################## query admissions, patients ##################################


# define theta_age, theta_history, theta_length_of_stay

# For demographic information, patient’s age, gender, marital status, ethnicity, and insurance information are considered. 
# Only adult patients are enrolled in this study. 
# Hence, age was split into 5 groups (18, 25), (25, 45), (45, 65), (65, 89), (89,). 
# For admission-related information, admission type is included as features.
# Choose ICU patients admitted for the first time in ICU, exclude later ICU admissions
# define desired duration of stay, theta_icu_stay_min and theta_icu_stay_max in hours
#  
# theta_age = input("Define the age limit for the cohort. For adults choose 18")
# theta_history = input("Define how far back to look in the patients history past. Input number in years")
# theta_length_of_stay = input("Define hospital admissions whose length of stay is less than x day to be excluded")

icu_demographics = pd.read_sql("""

SELECT 
icustays.subject_id, 
icustays.hadm_id, 
icustays.icustay_id, 
patients.DOB, 
patients.DOD, 
patients.gender, 
admissions.ethnicity, 
admissions.diagnosis, 
icustays.intime 
, cast(icustays.outtime as date) - cast(icustays.intime as date)  as icu_length_of_stay
, (cast(icustays.INTIME as date) - cast(patients.DOB as date))/ 364.242 as age
, (cast(admissions.deathtime as date) - cast(icustays.intime as date)) as death_after_icu_hours
, admissions.hospital_expire_flag , icustays.outtime 
, RANK() OVER (PARTITION BY icustays.subject_id ORDER BY icustays.intime) AS icustay_id_order 
, CASE when admissions.deathtime between icustays.intime and icustays.outtime THEN 1 ELSE 0 END AS mort_icu
, CASE when admissions.deathtime between admissions.admittime and admissions.dischtime THEN 1 ELSE 0 END AS mort_hosp


FROM icustays
INNER JOIN patients 
  ON icustays.subject_id = patients.subject_id
INNER JOIN admissions
    ON admissions.subject_id = icustays.subject_id
    AND admissions.hadm_id = icustays.hadm_id

LIMIT 1000


""", con= connection)


In [4]:
icu_demographics

Unnamed: 0,subject_id,hadm_id,icustay_id,dob,dod,gender,ethnicity,diagnosis,intime,icu_length_of_stay,age,death_after_icu_hours,hospital_expire_flag,outtime,icustay_id_order,mort_icu,mort_hosp
0,2,163353,243653,2138-07-17,NaT,M,ASIAN,NEWBORN,2138-07-17 21:20:07,0.0,0.000000,,0,2138-07-17 23:32:21,1,0,0
1,3,145834,211552,2025-04-11,2102-06-14,M,WHITE,HYPOTENSION,2101-10-20 19:10:11,6.0,76.734698,,0,2101-10-26 20:43:09,1,0,0
2,4,185777,294638,2143-05-12,NaT,F,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-16 00:29:31,1.0,47.976345,,0,2191-03-17 16:46:31,1,0,0
3,5,178980,214757,2103-02-02,NaT,M,ASIAN,NEWBORN,2103-02-02 06:04:24,0.0,0.000000,,0,2103-02-02 08:06:00,1,0,0
4,6,107064,228232,2109-06-21,NaT,F,WHITE,CHRONIC RENAL FAILURE/SDA,2175-05-30 21:30:54,4.0,66.120876,,0,2175-06-03 13:39:54,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,782,163679,214032,2095-04-08,2179-07-19,M,WHITE,UPPER GASTROINTESTINAL BLEED,2176-03-04 14:58:07,1.0,81.127382,,0,2176-03-05 12:49:44,2,0,0
996,783,111007,238510,2100-09-09,NaT,F,WHITE,NEWBORN,2100-09-09 13:31:08,0.0,0.000000,,0,2100-09-09 15:00:30,1,0,0
997,784,187825,206987,2131-08-01,NaT,F,WHITE,ABDOMINAL PAIN,2200-06-04 18:34:00,27.0,69.031029,,0,2200-07-01 17:18:00,1,0,0
998,785,192508,228499,2062-04-30,NaT,F,WHITE,ABDOMINAL AORTIC ANEURYSM\ABDOMINAL AORTIC ANE...,2144-02-25 10:09:53,8.0,82.047101,,0,2144-03-04 13:30:27,1,0,0


In [5]:
print("Number of patients hospitalized: ", len(set((icu_demographics.subject_id))))


Number of patients hospitalized:  739


In [6]:
icu_demographics.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,dob,dod,gender,ethnicity,diagnosis,intime,icu_length_of_stay,age,death_after_icu_hours,hospital_expire_flag,outtime,icustay_id_order,mort_icu,mort_hosp
0,2,163353,243653,2138-07-17,NaT,M,ASIAN,NEWBORN,2138-07-17 21:20:07,0.0,0.0,,0,2138-07-17 23:32:21,1,0,0
1,3,145834,211552,2025-04-11,2102-06-14,M,WHITE,HYPOTENSION,2101-10-20 19:10:11,6.0,76.734698,,0,2101-10-26 20:43:09,1,0,0
2,4,185777,294638,2143-05-12,NaT,F,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-16 00:29:31,1.0,47.976345,,0,2191-03-17 16:46:31,1,0,0
3,5,178980,214757,2103-02-02,NaT,M,ASIAN,NEWBORN,2103-02-02 06:04:24,0.0,0.0,,0,2103-02-02 08:06:00,1,0,0
4,6,107064,228232,2109-06-21,NaT,F,WHITE,CHRONIC RENAL FAILURE/SDA,2175-05-30 21:30:54,4.0,66.120876,,0,2175-06-03 13:39:54,1,0,0


In [7]:
# include patients only on the 1st icu stay, older than 18 and with a length of stay from 1 to 10 days 

icu_demographics_filter = icu_demographics[(icu_demographics.icustay_id_order == 1) & (icu_demographics.age >= 18 )]
print("Number of patients hospitalized: ", len(set((icu_demographics_filter.subject_id))))

icu_demographics_filter = icu_demographics_filter[(icu_demographics_filter.icu_length_of_stay >= 0.5) & (icu_demographics_filter.icu_length_of_stay <= 10)]
print("Number of patients hospitalized: ", len(set((icu_demographics_filter.subject_id))))


Number of patients hospitalized:  571
Number of patients hospitalized:  512


In [8]:
icu_demographics_filter = icu_demographics_filter.drop(["icustay_id_order"], axis=1)

In [9]:
print("Number of patients with in hospital mortality: ", icu_demographics_filter.mort_hosp.value_counts())
print("Number of patients with in icu mortality: ", icu_demographics_filter.mort_icu.value_counts())

Number of patients with in hospital mortality:  0    468
1     44
Name: mort_hosp, dtype: int64
Number of patients with in icu mortality:  0    483
1     29
Name: mort_icu, dtype: int64


icu_demographics_filter.hadm_id.value_counts().sort_values()

In [10]:
subject_id_set = set(icu_demographics_filter['subject_id'])
hadm_id_set = set(icu_demographics_filter['hadm_id'])

In [11]:
# put labels 

import pandas as pd, numpy as np, datetime as dt
def str2time(val):
    try:
        return dt.datetime.strptime(val, '%Y-%m-%d %H:%M:%S')
    except:
        return pd.NaT

# filter patients with in hospital mortality 
# convert dates to datetime format
#icu_demographics_filter['intime'] = icu_demographics_filter['intime'].apply(str2time)
#icu_demographics_filter['outtime'] = icu_demographics_filter['outtime'].apply(str2time)



In [12]:
###

## Labs and vitals 

In [13]:
# From MIMIC extract paper
def get_variable_mapping(mimic_mapping_filename):
    # Read in the second level mapping of the itemids
    var_map = pd.read_csv(mimic_mapping_filename, index_col=None)
    var_map = var_map.loc[(var_map['LEVEL2'] != '') & (var_map['COUNT']>0) & (var_map['STATUS'] == 'ready')]
    var_map['ITEMID'] = var_map['ITEMID'].astype(int)
    # renaming to match the mimic tables
    var_map.rename(columns={'ITEMID': 'itemid'}, inplace=True)
    var_map = var_map[['LEVEL2', 'itemid', 'LEVEL1', 'LINKSTO']].set_index('itemid')
    

    return var_map

var_map = get_variable_mapping("itemid_to_variable_map.csv")
var_map

Unnamed: 0_level_0,LEVEL2,LEVEL1,LINKSTO
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50861,Alanine aminotransferase,Alanine aminotransferase,labevents
769,Alanine aminotransferase,Alanine aminotransferase,chartevents
220644,Alanine aminotransferase,Alanine aminotransferase,chartevents
50862,Albumin,Albumin,labevents
772,Albumin,Albumin,chartevents
...,...,...,...
40473,Urine output,Urine output,outputevents
40715,Urine output,Urine output,outputevents
43175,Urine output,Urine output,outputevents
226559,Urine output,Urine output (foley),outputevents


In [14]:
chartitems_to_keep = set(var_map.loc[var_map['LINKSTO'] == 'chartevents'].index)
#chartitems_to_keep = set([ str(i) for i in chartitems_to_keep ])

labitems_to_keep = set(var_map.loc[var_map['LINKSTO'] == 'labevents'].index)
#labitems_to_keep = set([ str(i) for i in labitems_to_keep ])


subject_id_set = set(icu_demographics_filter['subject_id'])
hadm_id_set = set(icu_demographics_filter['hadm_id'])
icustay_id_set = set(icu_demographics_filter['icustay_id'])

In [15]:
query = \
"""
select chartevents.subject_id, icustays.hadm_id, chartevents.icustay_id, chartevents.charttime, chartevents.itemid, chartevents.value, valueuom
FROM icustays 
INNER JOIN chartevents  ON icustays.icustay_id = chartevents.icustay_id
where chartevents.icustay_id in %(icu_ids)s
and chartevents.itemid in %(charts_to_keep)s
and chartevents.charttime between intime and outtime -- in between the icu stay
and chartevents.error is distinct from 1 -- filter wrong charts
and chartevents.valuenum is not null
UNION ALL
select distinct icustays.subject_id, icustays.hadm_id, icustays.icustay_id, labevents.charttime, labevents.itemid, labevents.value, valueuom
FROM icustays 
INNER JOIN labevents ON icustays.hadm_id = labevents.hadm_id
where icustays.icustay_id in %(icu_ids)s
and labevents.itemid in %(labs_to_keep)s
and labevents.charttime between (intime - interval '6' hour) and outtime
and labevents.valuenum > 0 -- filter wrong lab values
;
"""
labs_vitals= pd.read_sql_query(query, connection, params={'icu_ids': tuple(icustay_id_set), 'charts_to_keep': tuple(chartitems_to_keep), 'labs_to_keep': tuple(labitems_to_keep)})


### Preprocess labs and vitals 

#### hourly buckets

In [16]:
# the value is str, convert to numeric
labs_vitals['value'] = pd.to_numeric(labs_vitals['value'], 'coerce')
#ID_COLS = ['subject_id', 'hadm_id', 'icustay_id']
#ID_COLS = ['subject_id', 'hadm_id']
#ITEM_COLS = ['itemid', 'label', 'LEVEL1', 'LEVEL2']
#X = X.astype({k: int for k in ID_COLS})


# join and add in labs_vital the icu intime and outime, to separate in hourly buckets
icu_demographics_filter = icu_demographics_filter.set_index("icustay_id")
labs_vitals = labs_vitals.set_index('icustay_id').join(icu_demographics_filter[['intime', 'outtime']])


to_hours = lambda x: max(0, x.days*24 + x.seconds // 3600)

#divide into hourly buckers from intime to chartime 
#labs_vitals['hourly_buckets'] = (labs_vitals['charttime']-labs_vitals['intime'])/pd.Timedelta(minutes=60)
#labs_vitals['hourly_buckets'] = (labs_vitals['charttime']-labs_vitals['intime'])/3600
#labs_vitals['hourly_buckets'] = labs_vitals['hourly_buckets'].apply(lambda x: np.max(x))

labs_vitals['hours_in'] = (labs_vitals['charttime'] - labs_vitals['intime']).apply(to_hours)

In [17]:
labs_vitals.hours_in.min()

0

#### aggregate same item_ids according to the extract_mimic_paper

In [18]:
labs_vitals = labs_vitals.sort_values(by=['subject_id', 'charttime'])

# to filter the itemids table
itemids = set(labs_vitals.itemid.astype(str))

In [19]:
labs_vitals_copy = labs_vitals.copy(deep=True)

In [20]:
query_d_items = \
"""
SELECT itemid, label, dbsource, linksto, category, unitname
FROM d_items
WHERE itemid in ({itemids})
;
""".format(itemids=','.join(itemids))
items_ids = pd.read_sql_query(query_d_items, connection).set_index('itemid')


In [21]:
labs_vitals.drop(columns=['charttime', 'intime', 'outtime'], inplace=True)
labs_vitals.set_index('itemid', append=True, inplace=True)

#labs_vitals = labs_vitals.join(var_map).join(items_ids).set_index(['label', 'LEVEL1', 'LEVEL2'], append=True)
#standardize_units(X, name_col='LEVEL1', inplace=True)
#X = apply_variable_limits(X, var_ranges, 'LEVEL2')
labs_vitals = labs_vitals.join(var_map).join(items_ids).set_index(['label', 'LEVEL2'], append=True)

In [22]:
labs_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,subject_id,hadm_id,value,valueuom,hours_in,LEVEL1,LINKSTO,dbsource,linksto,category,unitname
icustay_id,itemid,label,LEVEL2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
211552,50868,,Anion gap,3,145834,17.0,mEq/L,0,Anion gap,labevents,,,,
211552,50882,,Bicarbonate,3,145834,25.0,mEq/L,0,Bicarbonate,labevents,,,,
211552,50893,,Calcium,3,145834,8.2,mg/dL,0,Calcium (total),labevents,,,,
211552,50902,,Chloride,3,145834,99.0,mEq/L,0,Chloride,labevents,,,,
211552,50912,,Creatinine,3,145834,3.2,mg/dL,0,Creatinine,labevents,,,,


In [23]:
# get the names of the aggregated labs and vitals
labs_vitals_names = list(labs_vitals.index.get_level_values('LEVEL2'))
labs_vitals = labs_vitals.groupby(['subject_id', 'hadm_id', 'icustay_id'] + ['LEVEL2'] + ['hours_in']).agg(['mean', 'std', 'count'])

In [24]:
labs_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,count
subject_id,hadm_id,icustay_id,LEVEL2,hours_in,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,145834,211552,Alanine aminotransferase,0,25.0,0.0,2
3,145834,211552,Alanine aminotransferase,32,20.0,0.0,2
3,145834,211552,Albumin,0,1.8,0.0,2
3,145834,211552,Alkaline phosphate,0,73.0,0.0,2
3,145834,211552,Alkaline phosphate,32,89.0,0.0,2


In [25]:
labs_vitals.columns = labs_vitals.columns.droplevel(0)

labs_vitals.columns.names = ['Aggregated']

icu_demographics_filter['max_hours'] = (icu_demographics_filter['outtime'] - icu_demographics_filter['intime']).apply(to_hours)

In [26]:
labs_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Aggregated,mean,std,count
subject_id,hadm_id,icustay_id,LEVEL2,hours_in,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,145834,211552,Alanine aminotransferase,0,25.0,0.0,2
3,145834,211552,Alanine aminotransferase,32,20.0,0.0,2
3,145834,211552,Albumin,0,1.8,0.0,2
3,145834,211552,Alkaline phosphate,0,73.0,0.0,2
3,145834,211552,Alkaline phosphate,32,89.0,0.0,2


In [27]:
icu_demographics_filter.head()

Unnamed: 0_level_0,subject_id,hadm_id,dob,dod,gender,ethnicity,diagnosis,intime,icu_length_of_stay,age,death_after_icu_hours,hospital_expire_flag,outtime,mort_icu,mort_hosp,max_hours
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
211552,3,145834,2025-04-11,2102-06-14,M,WHITE,HYPOTENSION,2101-10-20 19:10:11,6.0,76.734698,,0,2101-10-26 20:43:09,0,0,145
294638,4,185777,2143-05-12,NaT,F,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-16 00:29:31,1.0,47.976345,,0,2191-03-17 16:46:31,0,0,40
228232,6,107064,2109-06-21,NaT,F,WHITE,CHRONIC RENAL FAILURE/SDA,2175-05-30 21:30:54,4.0,66.120876,,0,2175-06-03 13:39:54,0,0,88
220597,9,150750,2108-01-26,2149-11-14,M,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,2149-11-09 13:07:02,5.0,41.90346,5.0,1,2149-11-14 20:52:14,1,1,127
229441,11,194540,2128-02-22,2178-11-14,F,WHITE,BRAIN MASS,2178-04-16 06:19:32,1.0,50.28525,,0,2178-04-17 20:21:05,0,0,38


#### Pivot the table

In [28]:
#missing_hours_fill = range_unnest(icu_demographics_filter, 'max_hours', out_col_name='hours_in', reset_index=True)
#missing_hours_fill['tmp'] = np.NaN


#fill_df = icu_demographics_filter.reset_index()[['subject_id', 'hadm_id', 'icustay_id']].join(missing_hours_fill.set_index('icustay_id'), on='icustay_id')
#fill_df.set_index(['subject_id', 'hadm_id', 'icustay_id'] + ['hours_in'], inplace=True)

# Pivot table droups NaN columns so you lose any uniformly NaN.
X = labs_vitals.unstack(level = ['LEVEL2'])
X.columns = X.columns.reorder_levels(order=['LEVEL2'] + ['Aggregated'])
   
#X = X.reindex(fill_df.index)


X = X.sort_index(axis=0).sort_index(axis=1)


In [29]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,Alanine aminotransferase,Alanine aminotransferase,Alanine aminotransferase,Albumin,Albumin,Albumin,Albumin ascites,Albumin ascites,Albumin ascites,Albumin pleural,...,White blood cell count,White blood cell count urine,White blood cell count urine,White blood cell count urine,pH,pH,pH,pH urine,pH urine,pH urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregated,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,2.0,25.0,0.0,2.0,1.8,0.0,,,,,...,4.012837,,,,9.0,7.40,0.147733,1.0,5.0,
3,145834,211552,1,,,,,,,,,,,...,,,,,,,,,,
3,145834,211552,2,,,,,,,,,,,...,,,,,3.0,7.26,0.000000,,,
3,145834,211552,3,,,,,,,,,,,...,,,,,,,,,,
3,145834,211552,4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,117381,234784,154,,,,,,,,,,,...,,,,,,,,,,
786,117381,234784,155,,,,,,,,,,,...,,,,,,,,,,
786,117381,234784,156,,,,,,,,,,,...,,,,,,,,,,
786,117381,234784,157,,,,,,,,,,,...,,,,,,,,,,


In [30]:
icu_demographics_filter

Unnamed: 0_level_0,subject_id,hadm_id,dob,dod,gender,ethnicity,diagnosis,intime,icu_length_of_stay,age,death_after_icu_hours,hospital_expire_flag,outtime,mort_icu,mort_hosp,max_hours
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
211552,3,145834,2025-04-11,2102-06-14,M,WHITE,HYPOTENSION,2101-10-20 19:10:11,6.0,76.734698,,0,2101-10-26 20:43:09,0,0,145
294638,4,185777,2143-05-12,NaT,F,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-16 00:29:31,1.0,47.976345,,0,2191-03-17 16:46:31,0,0,40
228232,6,107064,2109-06-21,NaT,F,WHITE,CHRONIC RENAL FAILURE/SDA,2175-05-30 21:30:54,4.0,66.120876,,0,2175-06-03 13:39:54,0,0,88
220597,9,150750,2108-01-26,2149-11-14,M,UNKNOWN/NOT SPECIFIED,HEMORRHAGIC CVA,2149-11-09 13:07:02,5.0,41.903460,5.0,1,2149-11-14 20:52:14,1,1,127
229441,11,194540,2128-02-22,2178-11-14,F,WHITE,BRAIN MASS,2178-04-16 06:19:32,1.0,50.285250,,0,2178-04-17 20:21:05,0,0,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275387,779,197527,2077-03-13,NaT,M,WHITE,UPPER GASTROINTESTINAL BLEED,2152-10-25 13:42:25,2.0,75.825962,,0,2152-10-27 16:55:02,0,0,51
234479,780,140935,2102-03-21,2195-09-02,F,OTHER,CHEST PAIN,2188-05-08 05:10:00,1.0,86.371149,,0,2188-05-09 18:15:00,0,0,37
264637,782,125662,2095-04-08,2179-07-19,M,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,2176-02-12 12:45:03,4.0,81.069728,,0,2176-02-16 14:51:11,0,0,98
228499,785,192508,2062-04-30,NaT,F,WHITE,ABDOMINAL AORTIC ANEURYSM\ABDOMINAL AORTIC ANE...,2144-02-25 10:09:53,8.0,82.047101,,0,2144-03-04 13:30:27,0,0,195


#### Drop columns few recordings

In [76]:
threshold = 0.9
columns_to_drop = []
for column_name in X.columns:
    #print(column_name)
    if column_name[1] == 'mean':
        #get the only the means of the vital_labs
        if X[column_name].isnull().mean() > threshold:
            columns_to_drop.append(column_name[0])

X = X.drop(columns=columns_to_drop)

#### Apply variable limits - outliers

In [None]:
def apply_variable_limits(df, var_ranges, var_names_index_col='LEVEL2'):
    idx_vals        = df.index.get_level_values(var_names_index_col)
    non_null_idx    = ~df.value.isnull()
    var_names       = set(idx_vals)
    var_range_names = set(var_ranges.index.values)

    for var_name in var_names:
        var_name_lower = var_name.lower()
        if var_name_lower not in var_range_names:
            print("No known ranges for %s" % var_name)
            continue

        outlier_low_val, outlier_high_val, valid_low_val, valid_high_val = [
            var_ranges.loc[var_name_lower, x] for x in ('OUTLIER_LOW','OUTLIER_HIGH','VALID_LOW','VALID_HIGH')
        ]

        running_idx = non_null_idx & (idx_vals == var_name)

        outlier_low_idx  = (df.value < outlier_low_val)
        outlier_high_idx = (df.value > outlier_high_val)
        valid_low_idx    = ~outlier_low_idx & (df.value < valid_low_val)
        valid_high_idx   = ~outlier_high_idx & (df.value > valid_high_val)

        var_outlier_idx   = running_idx & (outlier_low_idx | outlier_high_idx)
        var_valid_low_idx = running_idx & valid_low_idx
        var_valid_high_idx = running_idx & valid_high_idx

        df.loc[var_outlier_idx, 'value'] = np.nan
        df.loc[var_valid_low_idx, 'value'] = valid_low_val
        df.loc[var_valid_high_idx, 'value'] = valid_high_val

        n_outlier = sum(var_outlier_idx)
        n_valid_low = sum(var_valid_low_idx)
        n_valid_high = sum(var_valid_high_idx)
        if n_outlier + n_valid_low + n_valid_high > 0:
            print(
                "%s had %d / %d rows cleaned:\n"
                "  %d rows were strict outliers, set to np.nan\n"
                "  %d rows were low valid outliers, set to %.2f\n"
                "  %d rows were high valid outliers, set to %.2f\n"
                "" % (
                    var_name,
                    n_outlier + n_valid_low + n_valid_high, sum(running_idx),
                    n_outlier, n_valid_low, valid_low_val, n_valid_high, valid_high_val
                )
            )

    return df


## Vasopressors 

In [90]:
### from metavision
## from https://gitlab.doc.ic.ac.uk/AIClinician/AIClinician/-/blob/master/AIClinician_Data_extract_MIMIC3_140219.ipynb
query = """
select inputevents_mv.subject_id, inputevents_mv.hadm_id, inputevents_mv.icustay_id, inputevents_mv.itemid, inputevents_mv.starttime, inputevents_mv.endtime, -- rate, -- ,rateuom,
case when itemid in (30120,221906,30047) and rateuom='mcg/kg/min' then round(cast(rate as numeric),3)  -- norad
when itemid in (30120,221906,30047) and rateuom='mcg/min' then round(cast(rate/80 as numeric),3)  -- norad
when itemid in (30119,221289) and rateuom='mcg/kg/min' then round(cast(rate as numeric),3) -- epi
when itemid in (30119,221289) and rateuom='mcg/min' then round(cast(rate/80 as numeric),3) -- epi
when itemid in (30051,222315) and rate > 0.2 then round(cast(rate*5/60  as numeric),3) -- vasopressin, in U/h
when itemid in (30051,222315) and rateuom='units/min' then round(cast(rate*5 as numeric),3) -- vasopressin
when itemid in (30051,222315) and rateuom='units/hour' then round(cast(rate*5/60 as numeric),3) -- vasopressin
when itemid in (30128,221749,30127) and rateuom='mcg/kg/min' then round(cast(rate*0.45 as numeric),3) -- phenyl
when itemid in (30128,221749,30127) and rateuom='mcg/min' then round(cast(rate*0.45 / 80 as numeric),3) -- phenyl
when itemid in (221662,30043,30307) and rateuom='mcg/kg/min' then round(cast(rate*0.01 as numeric),3)  -- dopa
when itemid in (221662,30043,30307) and rateuom='mcg/min' then round(cast(rate*0.01/80 as numeric),3) else null end as rate_std-- dopa
, case when inputevents_mv.subject_id in %(subject_ids)s then 1 else 0 end as has_vasopressors-- binary indicator of whether patients received vassos
from inputevents_mv
INNER JOIN icustays  ON icustays.icustay_id = inputevents_mv.icustay_id
where itemid in (30128,30120,30051,221749,221906,30119,30047,30127,221289,222315,221662,30043,30307) and rate is not null and statusdescription <> 'Rewritten'
and inputevents_mv.icustay_id in %(icu_ids)s

--and inputevents_mv.starttime between icustays.intime and icustays.outtime -- in between the icu stay
order by icustay_id, itemid, starttime

"""

voso_mv = pd.read_sql_query(query,connection, params={'subject_ids': tuple(subject_id_set), 'icu_ids': tuple(icustay_id_set)})

In [91]:
voso_mv

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,starttime,endtime,rate_std,has_vasopressors


In [85]:
## carevure
## from https://gitlab.doc.ic.ac.uk/AIClinician/AIClinician/-/blob/master/AIClinician_Data_extract_MIMIC3_140219.ipynb

query = """
select inputevents_cv.subject_id, inputevents_cv.hadm_id, inputevents_cv.icustay_id,  inputevents_cv.itemid, inputevents_cv.charttime, inputevents_cv.rate, -- rate, -- rateuom,

case when itemid in (30120,221906,30047) and rateuom='mcgkgmin' then round(cast(rate as numeric),3) -- norad
when itemid in (30120,221906,30047) and rateuom='mcgmin' then round(cast(rate/80 as numeric),3)  -- norad
when itemid in (30119,221289) and rateuom='mcgkgmin' then round(cast(rate as numeric),3) -- epi
when itemid in (30119,221289) and rateuom='mcgmin' then round(cast(rate/80 as numeric),3) -- epi
when itemid in (30051,222315) and rate > 0.2 then round(cast(rate*5/60  as numeric),3) -- vasopressin, in U/h
when itemid in (30051,222315) and rateuom='Umin' and rate < 0.2 then round(cast(rate*5  as numeric),3) -- vasopressin
when itemid in (30051,222315) and rateuom='Uhr' then round(cast(rate*5/60  as numeric),3) -- vasopressin
when itemid in (30128,221749,30127) and rateuom='mcgkgmin' then round(cast(rate*0.45  as numeric),3) -- phenyl
when itemid in (30128,221749,30127) and rateuom='mcgmin' then round(cast(rate*0.45 / 80  as numeric),3) -- phenyl
when itemid in (221662,30043,30307) and rateuom='mcgkgmin' then round(cast(rate*0.01   as numeric),3) -- dopa
when itemid in (221662,30043,30307) and rateuom='mcgmin' then round(cast(rate*0.01/80  as numeric),3) else null end as rate_std-- dopa
, case when inputevents_cv.subject_id in %(subject_ids)s then 1 else 0 end as has_vasopressors-- binary indicator of whether patients received vassos

-- case when rateuom='mcgkgmin' then 1 when rateuom='mcgmin' then 2 end as uom
from inputevents_cv
INNER JOIN icustays  ON icustays.icustay_id = inputevents_cv.icustay_id


where itemid in (30128,30120,30051,221749,221906,30119,30047,30127,221289,222315,221662,30043,30307) and rate is not null
and inputevents_cv.icustay_id in %(icu_ids)s

and inputevents_cv.charttime between icustays.intime and icustays.outtime -- in between the icu stay


order by inputevents_cv.icustay_id, inputevents_cv.itemid, inputevents_cv.charttime

"""
vaso_cv = pd.read_sql_query(query,connection, params={'subject_ids': tuple(subject_id_set), 'icu_ids': tuple(icustay_id_set)})

In [86]:
len(set(vaso_cv.subject_id))

178

In [87]:
vaso_cv

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,charttime,rate,rate_std,has_vasopressors
0,603,104325.0,200168,30128,2112-09-02 17:00:00,0.3,0.135,1
1,603,104325.0,200168,30128,2112-09-02 17:15:00,0.0,0.000,1
2,603,104325.0,200168,30128,2112-09-02 17:30:00,0.0,0.000,1
3,603,104325.0,200168,30128,2112-09-02 17:45:00,0.0,0.000,1
4,603,104325.0,200168,30128,2112-09-02 18:00:00,0.0,0.000,1
...,...,...,...,...,...,...,...,...
8416,265,101608.0,299976,30128,2149-02-20 17:00:00,0.3,0.135,1
8417,265,101608.0,299976,30128,2149-02-20 17:30:00,0.3,0.135,1
8418,265,101608.0,299976,30128,2149-02-20 18:00:00,0.3,0.135,1
8419,265,101608.0,299976,30128,2149-02-20 18:30:00,0.3,0.135,1


## Mechanical Ventilation

In [None]:
# Identify The presence of a mechanical ventilation using settings
# from https://github.com/MIT-LCP/mimic-code/blob/main/mimic-iii/concepts/durations/ventilation_classification.sql
query = """
select
  ce.subject_id, ce.hadm_id, ce.icustay_id, ce.charttime
  -- case statement determining whether it is an instance of mech vent
  , max(
    case
      when itemid is null or value is null then 0 -- can't have null values
      when itemid = 720 and value != 'Other/Remarks' THEN 1  -- VentTypeRecorded
      when itemid = 223848 and value != 'Other' THEN 1
      when itemid = 223849 then 1 -- ventilator mode
      when itemid = 467 and value = 'Ventilator' THEN 1 -- O2 delivery device == ventilator
      when itemid in
        (
        445, 448, 449, 450, 1340, 1486, 1600, 224687 -- minute volume
        , 639, 654, 681, 682, 683, 684,224685,224684,224686 -- tidal volume
        , 218,436,535,444,459,224697,224695,224696,224746,224747 -- High/Low/Peak/Mean/Neg insp force ("RespPressure")
        , 221,1,1211,1655,2000,226873,224738,224419,224750,227187 -- Insp pressure
        , 543 -- PlateauPressure
        , 5865,5866,224707,224709,224705,224706 -- APRV pressure
        , 60,437,505,506,686,220339,224700 -- PEEP
        , 3459 -- high pressure relief
        , 501,502,503,224702 -- PCV
        , 223,667,668,669,670,671,672 -- TCPCV
        , 224701 -- PSVlevel
        )
        THEN 1
      else 0
    end
    ) as MechVent
    , max(
      case
        -- initiation of oxygen therapy indicates the ventilation has ended
        when itemid = 226732 and value in
        (
          'Nasal cannula', -- 153714 observations
          'Face tent', -- 24601 observations
          'Aerosol-cool', -- 24560 observations
          'Trach mask ', -- 16435 observations
          'High flow neb', -- 10785 observations
          'Non-rebreather', -- 5182 observations
          'Venti mask ', -- 1947 observations
          'Medium conc mask ', -- 1888 observations
          'T-piece', -- 1135 observations
          'High flow nasal cannula', -- 925 observations
          'Ultrasonic neb', -- 9 observations
          'Vapomist' -- 3 observations
        ) then 1
        when itemid = 467 and value in
        (
          'Cannula', -- 278252 observations
          'Nasal Cannula', -- 248299 observations
          -- 'None', -- 95498 observations
          'Face Tent', -- 35766 observations
          'Aerosol-Cool', -- 33919 observations
          'Trach Mask', -- 32655 observations
          'Hi Flow Neb', -- 14070 observations
          'Non-Rebreather', -- 10856 observations
          'Venti Mask', -- 4279 observations
          'Medium Conc Mask', -- 2114 observations
          'Vapotherm', -- 1655 observations
          'T-Piece', -- 779 observations
          'Hood', -- 670 observations
          'Hut', -- 150 observations
          'TranstrachealCat', -- 78 observations
          'Heated Neb', -- 37 observations
          'Ultrasonic Neb' -- 2 observations
        ) then 1
      else 0
      end
    ) as OxygenTherapy
    , max(
      case when itemid is null or value is null then 0
        -- extubated indicates ventilation event has ended
        when itemid = 640 and value = 'Extubated' then 1
        when itemid = 640 and value = 'Self Extubation' then 1
      else 0
      end
      )
      as Extubated
    , max(
      case when itemid is null or value is null then 0
        when itemid = 640 and value = 'Self Extubation' then 1
      else 0
      end
      )
      as SelfExtubated
from chartevents ce
INNER JOIN icustays  ON icustays.icustay_id = ce.icustay_id
where ce.value is not null
and ce.icustay_id in %(icu_ids)s
and ce.charttime between icustays.intime and icustays.outtime -- in between the icu stay

-- exclude rows marked as error
and (ce.error != 1 or ce.error IS NULL)
and itemid in
(
    -- the below are settings used to indicate ventilation
      720, 223849 -- vent mode
    , 223848 -- vent type
    , 445, 448, 449, 450, 1340, 1486, 1600, 224687 -- minute volume
    , 639, 654, 681, 682, 683, 684,224685,224684,224686 -- tidal volume
    , 218,436,535,444,224697,224695,224696,224746,224747 -- High/Low/Peak/Mean ("RespPressure")
    , 221,1,1211,1655,2000,226873,224738,224419,224750,227187 -- Insp pressure
    , 543 -- PlateauPressure
    , 5865,5866,224707,224709,224705,224706 -- APRV pressure
    , 60,437,505,506,686,220339,224700 -- PEEP
    , 3459 -- high pressure relief
    , 501,502,503,224702 -- PCV
    , 223,667,668,669,670,671,672 -- TCPCV
    , 224701 -- PSVlevel

    -- the below are settings used to indicate extubation
    , 640 -- extubated

    -- the below indicate oxygen/NIV, i.e. the end of a mechanical vent event
    , 468 -- O2 Delivery Device#2
    , 469 -- O2 Delivery Mode
    , 470 -- O2 Flow (lpm)
    , 471 -- O2 Flow (lpm) #2
    , 227287 -- O2 Flow (additional cannula)
    , 226732 -- O2 Delivery Device(s)
    , 223834 -- O2 Flow

    -- used in both oxygen + vent calculation
    , 467 -- O2 Delivery Device
)
group by ce.subject_id, ce.hadm_id, ce.icustay_id, ce.charttime
UNION DISTINCT
-- add in the extubation flags from procedureevents_mv
-- note that we only need the start time for the extubation
-- (extubation is always charted as ending 1 minute after it started)
select
  procedureevents_mv.subject_id, procedureevents_mv.hadm_id, procedureevents_mv.icustay_id, starttime as charttime
  , 0 as MechVent
  , 0 as OxygenTherapy
  , 1 as Extubated
  , case when itemid = 225468 then 1 else 0 end as SelfExtubated
from procedureevents_mv
INNER JOIN icustays  ON icustays.icustay_id = procedureevents_mv.icustay_id
where itemid in
(
  227194 -- "Extubation"
, 225468 -- "Unplanned Extubation (patient-initiated)"
, 225477 -- "Unplanned Extubation (non-patient initiated)"
)
and procedureevents_mv.icustay_id in %(icu_ids)s
and procedureevents_mv.starttime between icustays.intime and icustays.outtime -- in between the icu stay
;"""

vt = pd.read_sql_query(query, connection, params={'icu_ids': tuple(icustay_id_set)})

In [81]:
## from https://gitlab.doc.ic.ac.uk/AIClinician/AIClinician/-/blob/master/AIClinician_Data_extract_MIMIC3_140219.ipynb

query = """

select
    ce.subject_id, ce.hadm_id, ce.icustay_id, ce.charttime    -- case statement determining whether it is an instance of mech vent
    , max(
      case
        when itemid is null or value is null then 0 -- can't have null values
        when itemid = 720 and value != 'Other/Remarks' THEN 1  -- VentTypeRecorded
        when itemid = 467 and value = 'Ventilator' THEN 1 -- O2 delivery device == ventilator
        when itemid in
          (
          445, 448, 449, 450, 1340, 1486, 1600, 224687 -- minute volume
          , 639, 654, 681, 682, 683, 684,224685,224684,224686 -- tidal volume
          , 218,436,535,444,459,224697,224695,224696,224746,224747 -- High/Low/Peak/Mean/Neg insp force ("RespPressure")
          , 221,1,1211,1655,2000,226873,224738,224419,224750,227187 -- Insp pressure
          , 543 -- PlateauPressure
          , 5865,5866,224707,224709,224705,224706 -- APRV pressure
          , 60,437,505,506,686,220339,224700 -- PEEP
          , 3459 -- high pressure relief
          , 501,502,503,224702 -- PCV
          , 223,667,668,669,670,671,672 -- TCPCV
          , 157,158,1852,3398,3399,3400,3401,3402,3403,3404,8382,227809,227810 -- ETT
          , 224701 -- PSVlevel
          )
          THEN 1
        else 0
      end
      ) as MechVent
      , max(
        case when itemid is null or value is null then 0
          when itemid = 640 and value = 'Extubated' then 1
          when itemid = 640 and value = 'Self Extubation' then 1
        else 0
        end
        )
        as Extubated
      , max(
        case when itemid is null or value is null then 0
          when itemid = 640 and value = 'Self Extubation' then 1
        else 0
        end
        )
        as SelfExtubated


  from chartevents ce
  INNER JOIN icustays  ON icustays.icustay_id = ce.icustay_id
  where ce.value is not null
  and ce.icustay_id in %(icu_ids)s
  and ce.charttime between icustays.intime and icustays.outtime -- in between the icu stay
  and ce.itemid in
  (
      640 -- extubated
      , 720 -- vent type
      , 467 -- O2 delivery device
      , 445, 448, 449, 450, 1340, 1486, 1600, 224687 -- minute volume
      , 639, 654, 681, 682, 683, 684,224685,224684,224686 -- tidal volume
      , 218,436,535,444,459,224697,224695,224696,224746,224747 -- High/Low/Peak/Mean/Neg insp force ("RespPressure")
      , 221,1,1211,1655,2000,226873,224738,224419,224750,227187 -- Insp pressure
      , 543 -- PlateauPressure
      , 5865,5866,224707,224709,224705,224706 -- APRV pressure
      , 60,437,505,506,686,220339,224700 -- PEEP
      , 3459 -- high pressure relief
      , 501,502,503,224702 -- PCV
      , 223,667,668,669,670,671,672 -- TCPCV
      , 157,158,1852,3398,3399,3400,3401,3402,3403,3404,8382,227809,227810 -- ETT
      , 224701 -- PSVlevel
  )
  group by ce.subject_id, ce.hadm_id, ce.icustay_id, ce.charttime


"""
mech_vent = pd.read_sql_query(query,connection, params={'icu_ids': tuple(icustay_id_set)})

In [82]:
mech_vent

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,mechvent,extubated,selfextubated
0,3,145834,211552,2101-10-20 19:15:00,0,0,0
1,3,145834,211552,2101-10-20 20:00:00,1,0,0
2,3,145834,211552,2101-10-20 21:15:00,0,0,0
3,3,145834,211552,2101-10-20 23:15:00,1,0,0
4,3,145834,211552,2101-10-21 02:00:00,1,0,0
...,...,...,...,...,...,...,...
10683,786,117381,234784,2116-01-04 20:00:00,0,0,0
10684,786,117381,234784,2116-01-05 00:00:00,0,0,0
10685,786,117381,234784,2116-01-05 08:00:00,0,0,0
10686,786,117381,234784,2116-01-05 09:00:00,0,0,0


## Crystalloid and colloid bolus fluids

In [135]:
## colloid
## from https://github.com/MIT-LCP/mimic-code/blob/main/mimic-iii/concepts/fluid_balance/crystalloid_bolus.sql
## Fluid bolus therapy is widely administered to patients with undifferentiated hypotension and for patients with severe sepsis
query = """

-- received colloid before admission
-- 226365  --  OR Colloid Intake
-- 226376  --  PACU Colloid Intake

with t1 as
(
  select
    mv.subject_id
  , mv.hadm_id  
  , mv.icustay_id
  , mv.starttime as charttime
  -- standardize the units to millilitres
  -- also metavision has floating point precision.. but we only care down to the mL
  , round(case
      when mv.amountuom = 'L'
        then mv.amount * 1000.0
      when mv.amountuom = 'ml'
        then mv.amount
    else null end) as amount
  from inputevents_mv mv
  INNER JOIN icustays ON icustays.icustay_id = mv.icustay_id
  where mv.itemid in
  (
    220864, --	Albumin 5%	7466 132 7466
    220862, --	Albumin 25%	9851 174 9851
    225174, --	Hetastarch (Hespan) 6%	82 1 82
    225795, --	Dextran 40	38 3 38
    225796  --  Dextran 70
    -- below ITEMIDs not in use
   -- 220861 | Albumin (Human) 20%
   -- 220863 | Albumin (Human) 4%
  )
  and mv.statusdescription != 'Rewritten'
  and mv.starttime between intime and outtime -- in between the icu stay
  and
  -- in MetaVision, these ITEMIDs never appear with a null rate
  -- so it is sufficient to check the rate is > 100
    (
      (mv.rateuom = 'mL/hour' and mv.rate > 100)
      OR (mv.rateuom = 'mL/min' and mv.rate > (100/60.0))
      OR (mv.rateuom = 'mL/kg/hour' and (mv.rate*mv.patientweight) > 100)
    )
)
, t2 as
(
  select
    cv.subject_id
  , cv.hadm_id  
  , cv.icustay_id
  , cv.charttime
  -- carevue always has units in millilitres (or null)
  , round(cv.amount) as amount
  from inputevents_cv cv
  INNER JOIN icustays  ON icustays.icustay_id = cv.icustay_id

  where cv.itemid in
  (
   30008 --	Albumin 5%
  ,30009 --	Albumin 25%
  ,42832 --	albumin 12.5%
  ,40548 --	ALBUMIN
  ,45403 --	albumin
  ,44203 --	Albumin 12.5%
  ,30181 -- Serum Albumin 5%
  ,46564 -- Albumin
  ,43237 -- 25% Albumin
  ,43353 -- Albumin (human) 25%

  ,30012 --	Hespan
  ,46313 --	6% Hespan

  ,30011 -- Dextran 40
  ,30016 -- Dextrose 10%
  ,42975 --	DEXTRAN DRIP
  ,42944 --	dextran
  ,46336 --	10% Dextran 40/D5W
  ,46729 --	Dextran
  ,40033 --	DEXTRAN
  ,45410 --	10% Dextran 40
  ,42731 -- Dextran40 10%
  )
  and cv.amount > 100
  and cv.amount < 2000
  and cv.charttime between intime and outtime -- in between the icu stay

)

-- some colloids are charted in chartevents
, t3 as
(
  select
    ce.subject_id
  , ce.hadm_id 
  , ce.icustay_id
  , ce.charttime
  -- carevue always has units in millilitres (or null)
  , round(ce.valuenum) as amount
  from chartevents ce
  INNER JOIN icustays  ON icustays.icustay_id = ce.icustay_id
  where ce.itemid in
  (
      2510 --	DEXTRAN LML 10%
    , 3087 --	DEXTRAN 40  10%
    , 6937 --	Dextran
    , 3087 -- DEXTRAN 40  10%
    , 3088 --	DEXTRAN 40%
  )
  and ce.valuenum is not null
  and ce.valuenum > 100
  and ce.valuenum < 2000
  and ce.charttime between intime and outtime -- in between the icu stay

)

select
    subject_id
  ,  hadm_id
  ,  icustay_id
  , charttime
  , sum(amount) as colloid_bolus
from t1
-- just because the rate was high enough, does *not* mean the final amount was
where amount > 100
--and icustay_id in %(icu_ids)s
group by t1.subject_id, t1.hadm_id, t1.icustay_id, t1.charttime
UNION ALL
select
    subject_id
  ,  hadm_id
  ,  icustay_id
  , charttime
  , sum(amount) as colloid_bolus
from t2
--where icustay_id in %(icu_ids)s
group by t2.subject_id, t2.hadm_id, t2.icustay_id, t2.charttime
UNION ALL 
select
    subject_id
  ,  hadm_id
  ,  icustay_id
  , charttime
  , sum(amount) as colloid_bolus
from t3
--where icustay_id in %(icu_ids)s
group by t3.subject_id, t3.hadm_id, t3.icustay_id, t3.charttime
order by subject_id, hadm_id, icustay_id, charttime;
"""
colloid_bolus= pd.read_sql(query, connection)
#, params={'icu_ids': tuple(icustay_id_set)}

In [124]:
colloid_bolus

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,colloid_bolus
0,20,157681.0,264490,2183-04-28 23:30:00,500.0
1,36,122659.0,211200,2131-05-18 23:00:00,500.0
2,114,178393.0,258626,2146-08-30 05:00:00,500.0
3,114,178393.0,258626,2146-08-30 12:00:00,500.0
4,115,114585.0,232514,2194-10-17 16:00:00,500.0
...,...,...,...,...,...
12174,99982,151454.0,221194,2156-11-29 21:35:00,500.0
12175,99982,151454.0,221194,2156-11-29 22:16:00,500.0
12176,99982,151454.0,221194,2156-11-29 22:56:00,500.0
12177,99982,151454.0,221194,2156-12-04 11:26:00,250.0


In [137]:
# apply mask to receive patients with the queried disease and the specific hadm_id (visit in icu) that the disease was registered. 
colloid_bolus_f = colloid_bolus[colloid_bolus.icustay_id.isin(list(icustay_id_set))]

In [138]:
colloid_bolus_f

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,colloid_bolus
0,20,157681.0,264490,2183-04-28 23:30:00,500.0
2,114,178393.0,258626,2146-08-30 05:00:00,500.0
3,114,178393.0,258626,2146-08-30 12:00:00,500.0
4,115,114585.0,232514,2194-10-17 16:00:00,500.0
5,115,114585.0,232514,2194-10-17 22:00:00,500.0
18,151,151029.0,263211,2145-05-11 02:00:00,500.0
29,245,105501.0,216674,2172-07-09 09:00:00,250.0
30,245,105501.0,216674,2172-07-09 15:00:00,250.0
31,245,105501.0,216674,2172-07-10 11:30:00,250.0
34,353,159730.0,265453,2148-06-26 22:00:00,500.0


In [144]:
colloid_bolus_f[colloid_bolus_f.icustay_id == 226014]

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,colloid_bolus
89,728,174088.0,226014,2154-03-15 22:00:00,250.0
90,728,174088.0,226014,2154-03-15 23:00:00,250.0


In [150]:
## crystalloid
## Crystalloid fluids are a subset of intravenous solutions that are frequently used in the clinical setting. 
# Crystalloid fluids are the first choice for fluid resuscitation in the presence of hypovolemia, hemorrhage, 
# sepsis, and dehydration

query = """
with t1 as
(
  select
    mv.subject_id
  ,  mv.hadm_id
  ,  mv.icustay_id
  , mv.starttime as charttime
  -- standardize the units to millilitres
  -- also metavision has floating point precision.. but we only care down to the mL
  , round(case
      when mv.amountuom = 'L'
        then mv.amount * 1000.0
      when mv.amountuom = 'ml'
        then mv.amount
    else null end) as amount
  from inputevents_mv mv
  INNER JOIN icustays ON icustays.icustay_id = mv.icustay_id
  where mv.itemid in
  (
    -- 225943 Solution
    225158, -- NaCl 0.9%
    225828, -- LR
    225944, -- Sterile Water
    225797, -- Free Water
	  225159, -- NaCl 0.45%
	  -- 225161, -- NaCl 3% (Hypertonic Saline)
	  225823, -- D5 1/2NS
	  225825, -- D5NS
	  225827, -- D5LR
	  225941, -- D5 1/4NS
	  226089 -- Piggyback
  )
  and mv.statusdescription != 'Rewritten'
  and mv.starttime between intime and outtime -- in between the icu stay
  and
  -- in MetaVision, these ITEMIDs appear with a null rate IFF endtime=starttime + 1 minute
  -- so it is sufficient to:
  --    (1) check the rate is > 240 if it exists or
  --    (2) ensure the rate is null and amount > 240 ml
    (
      (mv.rate is not null and mv.rateuom = 'mL/hour' and mv.rate > 248)
      OR (mv.rate is not null and mv.rateuom = 'mL/min' and mv.rate > (248/60.0))
      OR (mv.rate is null and mv.amountuom = 'L' and mv.amount > 0.248)
      OR (mv.rate is null and mv.amountuom = 'ml' and mv.amount > 248)
    )
)
, t2 as
(
  select
    cv.subject_id
  , cv.hadm_id  
  , cv.icustay_id
  , cv.charttime
  -- carevue always has units in millilitres
  , round(cv.amount) as amount
  from inputevents_cv cv
  INNER JOIN icustays  ON icustays.icustay_id = cv.icustay_id
  where cv.itemid in
  (
    30015 -- "D5/.45NS" -- mixed colloids and crystalloids
  , 30018 --	.9% Normal Saline
  , 30020 -- .45% Normal Saline
  , 30021 --	Lactated Ringers
  , 30058 --	Free Water Bolus
  , 30060 -- D5NS
  , 30061 -- D5RL
  , 30063 --	IV Piggyback
  , 30065 --	Sterile Water
  -- , 30143 -- 3% Normal Saline
  , 30159 -- D5 Ringers Lact.
  , 30160 -- D5 Normal Saline
  , 30169 --	Sterile H20_GU
  , 30190 -- NS .9%
  , 40850 --	ns bolus
  , 41491 --	fluid bolus
  , 42639 --	bolus
  , 42187 --	free h20
  , 43819 --	1:1 NS Repletion.
  , 41430 --	free water boluses
  , 40712 --	free H20
  , 44160 --	BOLUS
  , 42383 --	cc for cc replace
  , 42297 --	Fluid bolus
  , 42453 --	Fluid Bolus
  , 40872 --	free water
  , 41915 --	FREE WATER
  , 41490 --	NS bolus
  , 46501 --	H2O Bolus
  , 45045 --	WaterBolus
  , 41984 --	FREE H20
  , 41371 --	ns fluid bolus
  , 41582 --	free h20 bolus
  , 41322 --	rl bolus
  , 40778 --	Free H2O
  , 41896 --	ivf boluses
  , 41428 --	ns .9% bolus
  , 43936 --	FREE WATER BOLUSES
  , 44200 --	FLUID BOLUS
  , 41619 --	frfee water boluses
  , 40424 --	free H2O
  , 41457 --	Free H20 intake
  , 41581 --	Water bolus
  , 42844 --	NS fluid bolus
  , 42429 --	Free water
  , 41356 --	IV Bolus
  , 40532 --	FREE H2O
  , 42548 --	NS Bolus
  , 44184 --	LR Bolus
  , 44521 --	LR bolus
  , 44741 --	NS FLUID BOLUS
  , 44126 --	fl bolus
  , 44110 --	RL BOLUS
  , 44633 --	ns boluses
  , 44983 --	Bolus NS
  , 44815 --	LR BOLUS
  , 43986 --	iv bolus
  , 45079 --	500 cc ns bolus
  , 46781 --	lr bolus
  , 45155 --	ns cc/cc replacement
  , 43909 --	H20 BOlus
  , 41467 --	NS IV bolus
  , 44367 --	LR
  , 41743 --	water bolus
  , 40423 --	Bolus
  , 44263 --	fluid bolus ns
  , 42749 --	fluid bolus NS
  , 45480 --	500cc ns bolus
  , 44491 --	.9NS bolus
  , 41695 --	NS fluid boluses
  , 46169 --	free water bolus.
  , 41580 --	free h2o bolus
  , 41392 --	ns b
  , 45989 --	NS Fluid Bolus
  , 45137 --	NS cc/cc
  , 45154 --	Free H20 bolus
  , 44053 --	normal saline bolus
  , 41416 --	free h2o boluses
  , 44761 --	Free H20
  , 41237 --	ns fluid boluses
  , 44426 --	bolus ns
  , 43975 --	FREE H20 BOLUSES
  , 44894 --	N/s 500 ml bolus
  , 41380 --	nsbolus
  , 42671 --	free h2o
  )
  and cv.amount > 248
  and cv.amount <= 2000
  and cv.amountuom = 'ml'
  and cv.charttime between intime and outtime -- in between the icu stay

)
select
    subject_id
  , hadm_id 
  , icustay_id
  , charttime
  , sum(amount) as crystalloid_bolus
from t1
-- just because the rate was high enough, does *not* mean the final amount was
where amount > 248
group by t1.subject_id, t1.hadm_id, t1.icustay_id, t1.charttime
UNION
select
    subject_id
  , hadm_id 
  ,  icustay_id
  , charttime
  , sum(amount) as crystalloid_bolus
from t2
group by t2.subject_id, t2.hadm_id, t2.icustay_id, t2.charttime
order by subject_id, hadm_id, icustay_id, charttime;

"""
crystalloid_bolus= pd.read_sql(query, connection)




In [151]:
# apply mask to receive patients with the queried disease and the specific hadm_id (visit in icu) that the disease was registered. 
crystalloid_bolus_f = crystalloid_bolus[crystalloid_bolus.icustay_id.isin(list(icustay_id_set))]

In [152]:
crystalloid_bolus_f

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,crystalloid_bolus
0,3,145834.0,211552,2101-10-21 11:00:00,500.0
1,3,145834.0,211552,2101-10-21 14:00:00,820.0
2,3,145834.0,211552,2101-10-21 17:30:00,500.0
3,3,145834.0,211552,2101-10-21 20:00:00,500.0
4,3,145834.0,211552,2101-10-21 22:30:00,500.0
...,...,...,...,...,...
2589,786,117381.0,234784,2116-01-02 12:00:00,250.0
2590,786,117381.0,234784,2116-01-02 14:00:00,250.0
2591,786,117381.0,234784,2116-01-02 16:00:00,250.0
2592,786,117381.0,234784,2116-01-02 20:00:00,250.0


## Antiobiotics

In [162]:
query = """

with t1 as
(
  select
    subject_id, hadm_id, icustay_id, startdate, enddate, drug, drug_name_generic
    , route
    , case
      when lower(drug) like '%adoxa%' then 1
      when lower(drug) like '%ala-tet%' then 1
      when lower(drug) like '%alodox%' then 1
      when lower(drug) like '%amikacin%' then 1
      when lower(drug) like '%amikin%' then 1
      when lower(drug) like '%amoxicillin%' then 1
      when lower(drug) like '%amoxicillin%clavulanate%' then 1
      when lower(drug) like '%clavulanate%' then 1
      when lower(drug) like '%ampicillin%' then 1
      when lower(drug) like '%augmentin%' then 1
      when lower(drug) like '%avelox%' then 1
      when lower(drug) like '%avidoxy%' then 1
      when lower(drug) like '%azactam%' then 1
      when lower(drug) like '%azithromycin%' then 1
      when lower(drug) like '%aztreonam%' then 1
      when lower(drug) like '%axetil%' then 1
      when lower(drug) like '%bactocill%' then 1
      when lower(drug) like '%bactrim%' then 1
      when lower(drug) like '%bethkis%' then 1
      when lower(drug) like '%biaxin%' then 1
      when lower(drug) like '%bicillin l-a%' then 1
      when lower(drug) like '%cayston%' then 1
      when lower(drug) like '%cefazolin%' then 1
      when lower(drug) like '%cedax%' then 1
      when lower(drug) like '%cefoxitin%' then 1
      when lower(drug) like '%ceftazidime%' then 1
      when lower(drug) like '%cefaclor%' then 1
      when lower(drug) like '%cefadroxil%' then 1
      when lower(drug) like '%cefdinir%' then 1
      when lower(drug) like '%cefditoren%' then 1
      when lower(drug) like '%cefepime%' then 1
      when lower(drug) like '%cefotetan%' then 1
      when lower(drug) like '%cefotaxime%' then 1
      when lower(drug) like '%cefpodoxime%' then 1
      when lower(drug) like '%cefprozil%' then 1
      when lower(drug) like '%ceftibuten%' then 1
      when lower(drug) like '%ceftin%' then 1
      when lower(drug) like '%cefuroxime %' then 1
      when lower(drug) like '%cefuroxime%' then 1
      when lower(drug) like '%cephalexin%' then 1
      when lower(drug) like '%chloramphenicol%' then 1
      when lower(drug) like '%cipro%' then 1
      when lower(drug) like '%ciprofloxacin%' then 1
      when lower(drug) like '%claforan%' then 1
      when lower(drug) like '%clarithromycin%' then 1
      when lower(drug) like '%cleocin%' then 1
      when lower(drug) like '%clindamycin%' then 1
      when lower(drug) like '%cubicin%' then 1
      when lower(drug) like '%dicloxacillin%' then 1
      when lower(drug) like '%doryx%' then 1
      when lower(drug) like '%doxycycline%' then 1
      when lower(drug) like '%duricef%' then 1
      when lower(drug) like '%dynacin%' then 1
      when lower(drug) like '%ery-tab%' then 1
      when lower(drug) like '%eryped%' then 1
      when lower(drug) like '%eryc%' then 1
      when lower(drug) like '%erythrocin%' then 1
      when lower(drug) like '%erythromycin%' then 1
      when lower(drug) like '%factive%' then 1
      when lower(drug) like '%flagyl%' then 1
      when lower(drug) like '%fortaz%' then 1
      when lower(drug) like '%furadantin%' then 1
      when lower(drug) like '%garamycin%' then 1
      when lower(drug) like '%gentamicin%' then 1
      when lower(drug) like '%kanamycin%' then 1
      when lower(drug) like '%keflex%' then 1
      when lower(drug) like '%ketek%' then 1
      when lower(drug) like '%levaquin%' then 1
      when lower(drug) like '%levofloxacin%' then 1
      when lower(drug) like '%lincocin%' then 1
      when lower(drug) like '%macrobid%' then 1
      when lower(drug) like '%macrodantin%' then 1
      when lower(drug) like '%maxipime%' then 1
      when lower(drug) like '%mefoxin%' then 1
      when lower(drug) like '%metronidazole%' then 1
      when lower(drug) like '%minocin%' then 1
      when lower(drug) like '%minocycline%' then 1
      when lower(drug) like '%monodox%' then 1
      when lower(drug) like '%monurol%' then 1
      when lower(drug) like '%morgidox%' then 1
      when lower(drug) like '%moxatag%' then 1
      when lower(drug) like '%moxifloxacin%' then 1
      when lower(drug) like '%myrac%' then 1
      when lower(drug) like '%nafcillin sodium%' then 1
      when lower(drug) like '%nicazel doxy 30%' then 1
      when lower(drug) like '%nitrofurantoin%' then 1
      when lower(drug) like '%noroxin%' then 1
      when lower(drug) like '%ocudox%' then 1
      when lower(drug) like '%ofloxacin%' then 1
      when lower(drug) like '%omnicef%' then 1
      when lower(drug) like '%oracea%' then 1
      when lower(drug) like '%oraxyl%' then 1
      when lower(drug) like '%oxacillin%' then 1
      when lower(drug) like '%pc pen vk%' then 1
      when lower(drug) like '%pce dispertab%' then 1
      when lower(drug) like '%panixine%' then 1
      when lower(drug) like '%pediazole%' then 1
      when lower(drug) like '%penicillin%' then 1
      when lower(drug) like '%periostat%' then 1
      when lower(drug) like '%pfizerpen%' then 1
      when lower(drug) like '%piperacillin%' then 1
      when lower(drug) like '%tazobactam%' then 1
      when lower(drug) like '%primsol%' then 1
      when lower(drug) like '%proquin%' then 1
      when lower(drug) like '%raniclor%' then 1
      when lower(drug) like '%rifadin%' then 1
      when lower(drug) like '%rifampin%' then 1
      when lower(drug) like '%rocephin%' then 1
      when lower(drug) like '%smz-tmp%' then 1
      when lower(drug) like '%septra%' then 1
      when lower(drug) like '%septra ds%' then 1
      when lower(drug) like '%septra%' then 1
      when lower(drug) like '%solodyn%' then 1
      when lower(drug) like '%spectracef%' then 1
      when lower(drug) like '%streptomycin sulfate%' then 1
      when lower(drug) like '%sulfadiazine%' then 1
      when lower(drug) like '%sulfamethoxazole%' then 1
      when lower(drug) like '%trimethoprim%' then 1
      when lower(drug) like '%sulfatrim%' then 1
      when lower(drug) like '%sulfisoxazole%' then 1
      when lower(drug) like '%suprax%' then 1
      when lower(drug) like '%synercid%' then 1
      when lower(drug) like '%tazicef%' then 1
      when lower(drug) like '%tetracycline%' then 1
      when lower(drug) like '%timentin%' then 1
      when lower(drug) like '%tobi%' then 1
      when lower(drug) like '%tobramycin%' then 1
      when lower(drug) like '%trimethoprim%' then 1
      when lower(drug) like '%unasyn%' then 1
      when lower(drug) like '%vancocin%' then 1
      when lower(drug) like '%vancomycin%' then 1
      when lower(drug) like '%vantin%' then 1
      when lower(drug) like '%vibativ%' then 1
      when lower(drug) like '%vibra-tabs%' then 1
      when lower(drug) like '%vibramycin%' then 1
      when lower(drug) like '%zinacef%' then 1
      when lower(drug) like '%zithromax%' then 1
      when lower(drug) like '%zmax%' then 1
      when lower(drug) like '%zosyn%' then 1
      when lower(drug) like '%zyvox%' then 1
    else 0
    end as antibiotic
  from prescriptions
  where drug_type in ('MAIN','ADDITIVE')
  -- we exclude routes via the eye, ears, or topically
  and route not in ('OU','OS','OD','AU','AS','AD', 'TP')
  and lower(route) not like '%ear%'
  and lower(route) not like '%eye%'
  -- we exclude certain types of antibiotics: topical creams, gels, desens, etc
  and lower(drug) not like '%cream%'
  and lower(drug) not like '%desensitization%'
  and lower(drug) not like '%ophth oint%'
  and lower(drug) not like '%gel%'
  -- other routes not sure about...
  -- for sure keep: ('IV','PO','PO/NG','ORAL', 'IV DRIP', 'IV BOLUS')
  -- ? VT, PB, PR, PL, NS, NG, NEB, NAS, LOCK, J TUBE, IVT
  -- ? IT, IRR, IP, IO, INHALATION, IN, IM
  -- ? IJ, IH, G TUBE, DIALYS
  -- ?? enemas??
)
select t1.subject_id, t1.hadm_id, t1.icustay_id, t1.startdate, t1.enddate, t1.drug, t1.drug_name_generic
    , t1.route, t1.antibiotic
from t1
INNER JOIN icustays  ON icustays.icustay_id = t1.icustay_id
where antibiotic = 1
and startdate between intime and outtime -- in between the icu stay

--group by drug --, drug_name_generic
--order by numobs desc;


"""

antiobiotics= pd.read_sql(query, connection)

In [165]:
# apply mask to receive patients with the queried disease and the specific hadm_id (visit in icu) that the disease was registered. 
antiobiotics_f = antiobiotics[antiobiotics.icustay_id.isin(list(icustay_id_set))]
antiobiotics_f

Unnamed: 0,subject_id,hadm_id,icustay_id,startdate,enddate,drug,drug_name_generic,route,antibiotic
0,13,143045,263738,2167-01-09,2167-01-11,Vancomycin HCl,,IV,1
1,9,150750,220597,2149-11-10,2149-11-11,Levofloxacin,Levofloxacin,NG,1
2,4,185777,294638,2191-03-17,2191-03-18,Vancomycin HCl,,IV,1
3,12,112213,232669,2104-08-11,2104-08-11,Metronidazole,,IV,1
4,12,112213,232669,2104-08-11,2104-08-12,Levofloxacin,,IV,1
...,...,...,...,...,...,...,...,...,...
1536,771,173072,203392,2166-03-04,2166-03-04,Clindamycin,,IV,1
1551,771,173072,203392,2166-03-04,2166-03-05,Metronidazole,Metronidazole,PO,1
1552,771,173072,203392,2166-03-04,2166-03-05,Levofloxacin,Levofloxacin,PO,1
1553,771,173072,203392,2166-03-04,2166-03-05,Vancomycin HCl,,IV,1


## Diagnoses

In [166]:
diagnoses = pd.read_sql("""

WITH filter_diagnoses AS 
(
    SELECT diagnoses_icd.subject_id, diagnoses_icd.hadm_id, diagnoses_icd.icd9_code, d_icd_diagnoses.short_title as icd9_title
    FROM diagnoses_icd 
    INNER JOIN d_icd_diagnoses 
        ON diagnoses_icd.icd9_code = d_icd_diagnoses.icd9_code
    WHERE diagnoses_icd.subject_id IN %(subject_id_set)s
    AND diagnoses_icd.hadm_id IN %(hadm_id_set)s
)


SELECT *
FROM filter_diagnoses
ORDER BY subject_id

""", con= connection, params={'subject_id_set': tuple(subject_id_set), 'hadm_id_set': tuple(hadm_id_set)})

In [167]:
diagnoses

Unnamed: 0,subject_id,hadm_id,icd9_code,icd9_title
0,3,145834,5849,Acute kidney failure NOS
1,3,145834,4275,Cardiac arrest
2,3,145834,4254,Prim cardiomyopathy NEC
3,3,145834,6826,Cellulitis of leg
4,3,145834,4280,CHF NOS
...,...,...,...,...
4203,786,117381,42731,Atrial fibrillation
4204,786,117381,2875,Thrombocytopenia NOS
4205,786,117381,2851,Ac posthemorrhag anemia
4206,786,117381,2767,Hyperpotassemia


In [None]:
itemid_to_variable_map = pd.read_csv("itemid_to_variable_map.csv")
item_ids_mapping = itemid_to_variable_map.groupby("LEVEL2")['ITEMID'].apply(list).to_dict()
print(len(item_ids_mapping))
remove_list = ['Blood culture',  'Cardiac Index', 'Cardiac Murmur', 'Cholesterol Pleural', 'Code Status','Consciousness Level',
'Ectopy Frequency','Ectopy Type','Fall Risk','Glascow coma scale eye opening','Glascow coma scale motor response','Glascow coma scale verbal response','Glucose urine','Heart Rhythm','Lung Sounds',
'Orientation','Pacemaker','Pupillary response left','Pupillary response right','Pupillary size left','Pupillary size right','Riker-SAS Scale','Service Type','Skin Color','Skin Integrity',
'Total Protein Body Fluid','Total Protein Joint Fluid','Trach Size','Urine Appearance','Urine Color','Urine output','Ventilator Mode','Ventilator Type', "Calcium ionized"]

item_ids_mapping_red = dict([(key, val) for key, val in item_ids_mapping.items() if key not in remove_list])

print(len(item_ids_mapping_red))
item_ids_mapping_red["Calcium Ionized"] = [3766, 50808, 816, 225667]

import itertools
vitals_labs_to_keep_list = list(itertools.chain(*item_ids_mapping_red.values()))

In [None]:
def get_values_by_name_from_df_column_or_index(data_df, colname):
    """ Easily get values for named field, whether a column or an index
    Returns
    -------
    values : 1D array
    """
    try:
        values = data_df[colname]
    except KeyError as e:
        if colname in data_df.index.names:
            values = data_df.index.get_level_values(colname)
        else:
            raise e
    return 


    
UNIT_CONVERSIONS = [
    ('weight',                   'oz',  None,             lambda x: x/16.*0.45359237),
    ('weight',                   'lbs', None,             lambda x: x*0.45359237),
    ('fraction inspired oxygen', None,  lambda x: x > 1,  lambda x: x/100.),
    ('oxygen saturation',        None,  lambda x: x <= 1, lambda x: x*100.),
    ('temperature',              'f',   lambda x: x > 79, lambda x: (x - 32) * 5./9),
    ('height',                   'in',  None,             lambda x: x*2.54),
]
def standardize_units(X, name_col='itemid', unit_col='valueuom', value_col='value', inplace=True):
    if not inplace: X = X.copy()
    name_col_vals = get_values_by_name_from_df_column_or_index(X, name_col)
    unit_col_vals = get_values_by_name_from_df_column_or_index(X, unit_col)

    try:
        name_col_vals = name_col_vals.str
        unit_col_vals = unit_col_vals.str
    except:
        print("Can't call *.str")
        print(name_col_vals)
        print(unit_col_vals)
        raise

    #name_filter, unit_filter = [
    #    (lambda n: col.contains(n, case=False, na=False)) for col in (name_col_vals, unit_col_vals)
    #]
    # TODO(mmd): Why does the above not work, but the below does?
    name_filter = lambda n: name_col_vals.contains(n, case=False, na=False)
    unit_filter = lambda n: unit_col_vals.contains(n, case=False, na=False)

    for name, unit, rng_check_fn, convert_fn in UNIT_CONVERSIONS:
        name_filter_idx = name_filter(name)
        needs_conversion_filter_idx = name_filter_idx & False

        if unit is not None: needs_conversion_filter_idx |= name_filter(unit) | unit_filter(unit)
        if rng_check_fn is not None: needs_conversion_filter_idx |= rng_check_fn(X[value_col])

        idx = name_filter_idx & needs_conversion_filter_idx

        X.loc[idx, value_col] = convert_fn(X[value_col][idx])

    return X



In [None]:
def range_unnest(df, col, out_col_name=None, reset_index=False):
    assert len(df.index.names) == 1, "Does not support multi-index."
    if out_col_name is None: out_col_name = col

    col_flat = pd.DataFrame(
        [[i, x] for i, y in df[col].iteritems() for x in range(y+1)],
        columns=[df.index.names[0], out_col_name]
    )

    if not reset_index: col_flat = col_flat.set_index(df.index.names[0])
    return col_flat


In [None]:
import matplotlib.pyplot as plt 
def plot_variable_histograms(col_names, df):
    # Plot some of the data, just to make sure it looks ok
    for c, vals in df.iteritems():
        n = vals.dropna().count()
        if n < 2: continue

        # get median, variance, skewness
        med = vals.dropna().median()
        var = vals.dropna().var()
        skew = vals.dropna().skew()

        # plot
        fig = plt.figure(figsize=(13, 6))
        plt.subplots(figsize=(13,6))
        vals.dropna().plot.hist(bins=100, label='HIST (n={})'.format(n))

        # fake plots for KS test, median, etc
        plt.plot([], label=' ',color='lightgray')
        plt.plot([], label='Median: {}'.format(format(med,'.2f')),
                 color='lightgray')
        plt.plot([], label='Variance: {}'.format(format(var,'.2f')),
                 color='lightgray')
        plt.plot([], label='Skew: {}'.format(format(skew,'.2f')),
                 color='light:gray')

        # add title, labels etc.
        plt.title('{} measurements in ICU '.format(str(c)))
        plt.xlabel(str(c))
        plt.legend(loc="upper left", bbox_to_anchor=(1,1),fontsize=12)
        plt.xlim(0, vals.quantile(0.99))
        #fig.savefig(os.path.join(outPath, (str(c) + '_HIST_.png')), bbox_inches='tight')

In [None]:
#var_names = list(X.columns.get_level_values('LEVEL2'))
#plot_variable_histograms(var_names, X)
var_names = ['White blood cell count']
#plot_variable_histograms(var_names, X)

    #############
# Print the total proportions!
rows, vars = X.shape
print('')
for l, vals in X.iteritems():
    ratio = 1.0 * vals.dropna().count() / rows
    print(str(l) + ': ' + str(round(ratio, 3)*100) + '% present')
"""
#############
# Print the per subject proportions!
df = X.groupby(['subject_id']).count()
for k in [1, 2, 3]:
    print('% of subjects had at least ' + str(k) + ' present')
    d = df > k
    d = d.sum(axis=0)
    d = d / len(df)
    d = d.reset_index()
    for index, row in d.iterrows():
        print(str(index) + ': ' + str(round(row[0], 3)*100) + '%')
    print('\n')
"""

print('Done!')