## Descriptions:
Imputation of missing ESI and HW
- Impute HW using some demographics variables
- Get first values of vital signs to impute ESI

**Inputs**:  
- `cohort_demoHW` contains all processed variables in demographics (including ESI) and height and weight
- `flowsheet` contains vital signs


**Outputs**: 
* `cohort_imputed_ESIHW` 

### Importing R libraries

In [3]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
library(mice)

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

"package 'mice' was built under R version 4.0.5"

Attaching package: 'mice'


The following object is masked from 'package:stats':

    filter


The following objects are masked from 'package:base':

    cbind, rbind




In [5]:
# Input locations: raw data directory and engineered-feature directory
datadir <- "../../DataTD"
featuredir <- "../../OutputTD/2_features"

# Load the demographics + height/weight table
cohort <- read.csv(file.path(featuredir, "cohort_demoHW.csv"))

# Uniqueness checks on MRNs (anon_id) and CSNs (pat_enc_csn_id_coded):
# total rows, distinct anon_id, distinct CSN, distinct (anon_id, CSN) pairs
nrow(cohort) # 45986
n_distinct(cohort$anon_id) # 31484
n_distinct(cohort$pat_enc_csn_id_coded) # 31484
nrow(distinct(cohort, anon_id, pat_enc_csn_id_coded)) # 31484

colnames(cohort)

### Imputation for Height and Weight using just the demographics set

In [None]:
# Impute missing Height and Weight from demographics with mice.
# this takes awhile (~4min)
# cohort_demo_clean <- read.csv(file.path(datadir, "cohort_demo_clean.csv"))

# `cohort_demo_clean` was never assigned in this notebook (its read.csv above is
# commented out), so this cell failed on a fresh top-to-bottom run.
# NOTE(review): it is taken here from `cohort` (loaded from cohort_demoHW.csv) --
# confirm this is the intended source. The check below stops with an error if
# the columns needed for the imputation model are not present.
stopifnot(all(
  c("gender", "race", "age", "medis", "Height", "Weight") %in% colnames(cohort)
))
cohort_demo_clean <- cohort

# Variables used in the imputation model
demo_clean <- cohort_demo_clean %>%
  select(gender, race, age, medis, Height, Weight)

# Show the missing-data pattern before imputing
md.pattern(demo_clean)

# 3 imputed datasets, 50 iterations, predictive mean matching, fixed seed.
# `method` is spelled out instead of relying on partial matching of `meth`.
# NOTE(review): if gender/race/medis are read as character rather than factor,
# check demo_mice$loggedEvents to see whether mice dropped them as predictors.
demo_mice <- mice(demo_clean, m = 3, maxit = 50, method = "pmm", seed = 123)

# Keep the 2nd completed dataset
demo_imp2 <- complete(demo_mice, 2)

In [None]:
# Per-column summary of the completed dataset; use it to check whether any NA
# remain in Height and Weight after imputation
summary(demo_imp2)

In [None]:
# The imputed table and the original table are bound column-wise below, so
# their row counts (and row order) must agree -- inspect both first
nrow(demo_imp2)
head(demo_imp2, n = 1)

nrow(cohort_demo_clean)
head(cohort_demo_clean, n = 1)

# Keep the original Height/Weight next to the imputed versions (suffix _i),
# just to compare them
demo_imp2name <- select(demo_imp2, Height_i = Height, Weight_i = Weight)
cohort_demo_imputed_all <- cohort_demo_clean %>% bind_cols(demo_imp2name)
head(cohort_demo_imputed_all, n = 1)
colnames(cohort_demo_imputed_all)

In [None]:
# "_all" = original and imputed Height/Weight side by side in one table;
# missingness indicators are not included yet
cohort_demo_imputed_all <- select(
  cohort_demo_imputed_all,
  anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded,
  label_max24, admit_time, ESI,
  gender, race, age, medis, English,
  Height, Height_i, Weight, Weight_i
)

In [None]:
# Flag rows whose original Height / Weight was missing (1 = missing, 0 = observed),
# then drop the original columns so only the imputed versions remain
cohort_demo_imputed <- cohort_demo_imputed_all %>%
  mutate(
    delta_H = ifelse(!is.na(Height), 0, 1),
    delta_W = ifelse(!is.na(Weight), 0, 1)
  ) %>%
  select(-Height, -Weight)

cohort_demo_imputed %>%
  select(delta_H, delta_W) %>%
  summary()
head(cohort_demo_imputed, n = 1)

In [None]:
# missing ESI: the summary reports the NA count of every column, including ESI
summary(cohort_demo_imputed)

# Frequency table of each categorical demographic.
# lapply() always returns one table per column; apply() on a data.frame first
# coerces it to a matrix and returns either a list or a matrix depending on
# whether the tables happen to have the same length.
lapply(cohort_demo_clean[c("gender", "race")], table)

In [None]:
# Save the demographics table with imputed Height/Weight and their
# missingness indicators
hw_out_path <- file.path(datadir, "cohort_demo_imputedHW.csv")
write.csv(cohort_demo_imputed, file = hw_out_path, row.names = FALSE)

### ESI -- from acuity_level encounter table
* use first vital values from vitals_clean to impute ESI
* this requires the vitals_clean dataset to be done
* before: use cohort with at least 1 vs --> change: use cohort with a complete set of vs --> simple and complex models
* vs1st_complete.csv is already updated

In [None]:
# Count of each label within each ESI level, with p = percent of that ESI level.
# `cohort_demo` is not defined at this point of the notebook, and its later
# definition no longer has an ESI column, so the table built above is used.
# NOTE(review): confirm cohort_demo_imputed is the intended input.
cohort_demo_imputed %>%
  count(ESI, label_max24) %>%
  group_by(ESI) %>%
  mutate(p = round(100 * n / sum(n), 2)) %>%
  ungroup() # do not leave the result grouped
# will need to impute with the first sets of vital signs

In [None]:
# cohort_demo_imputedHW %>% group_by(esi) %>% count(label_max24) # --> convert NA
# Count of each (ESI, label) combination, with percent of all rows.
# `cohort_demo` is not defined at this point of the notebook, and its later
# definition no longer has an ESI column, so the table built above is used.
# NOTE(review): confirm cohort_demo_imputed is the intended input.
cohort_demo_imputed %>%
  count(ESI, label_max24, name = "n_esi") %>%
  mutate(percent = 100 * n_esi / sum(n_esi)) %>%
  select(ESI, label = label_max24, n_esi, percent)

#### NEED vs1st_complete dataset, get from vitals notebook

In [None]:
# use the vs1st_complete.csv --> 41654, better cohort for imputation
# if use vs1st.csv with GCS we have 43320, but remove these so only 43291
vs1st_path <- file.path(datadir, "vs1st.csv")
vs1st <- read.csv(vs1st_path)

# vs1st <- read.csv(file.path(datadir, "vs1st_complete.csv"))
nrow(vs1st)

# Not the demo table read from demographics: this is the file written above
# from cohort_demo_imputed (imputed Height/Weight plus indicators)
demo_path <- file.path(datadir, "cohort_demo_imputedHW.csv")
demo <- read.csv(demo_path)
nrow(demo)

In [None]:
# vs1wide <- vs1st %>% spread(features, first_val) %>% select(-GCS) # then need to filter out all NA as below
# vs1wide <- vs1wide %>% filter_at(vars(DBP, Pulse, RR, SBP, SpO2, Temp), any_vars(!is.na(.)))

# Drop the GCS rows (not many), then reshape to one column per vital sign
# so the table can be used for imputation
vs1wide <- vs1st %>%
  filter(features != "GCS") %>%
  spread(features, first_val)
head(vs1wide)
nrow(vs1wide) # 43291 --> now 41654 for complete set of VS with final cohort

# Rows where every column is NA
vs1wide %>%
  filter_all(all_vars(is.na(.))) %>%
  nrow()

# Complete cases: rows with no NA in any column
vs1wide %>%
  drop_na() %>%
  nrow()

In [None]:
# List the columns of demo before it is joined to the wide vitals table
colnames(demo)

In [None]:
# Attach demographics to every row of the wide first-vitals table.
# No `by` is given, so dplyr joins on all column names the two tables share and
# reports them in a message -- NOTE(review): check that message lists only the
# intended key columns.
vs1demo <- vs1wide %>% left_join(demo)
nrow(vs1demo)
head(vs1demo)
colnames(vs1demo)

In [None]:
# Per-column summary of the joined table, including NA counts
summary(vs1demo) # missing 1665 ESI

In [None]:
# Impute missing ESI from demographics, imputed height/weight and the first
# vital-sign values.
# this chunk takes a while (~ 5min)
# m refers to the number of imputed datasets. Five is the default --> takes too long
# method = "pmm" refers to the imputation method, predictive mean matching
# complete(, 2) returns the 2nd completed data set
vs1demo_imp <- vs1demo %>%
  select(
    ESI, gender, race, age, medis, Height_i, Weight_i,
    SBP, DBP, Pulse, RR, SpO2, Temp
  )

# Show the missing-data pattern before imputing
md.pattern(vs1demo_imp)

# `method` is spelled out instead of relying on partial matching of `meth`
vs1demo_mice <- mice(vs1demo_imp, m = 3, maxit = 50, method = "pmm", seed = 123)
vs1demo_imp2 <- complete(vs1demo_mice, 2)

In [None]:
# Per-column summary of the completed dataset; use it to check whether any NA
# remain after imputation
summary(vs1demo_imp2)

In [None]:
# Take only the imputed ESI from the completed dataset, renamed with an _i suffix
# not doing this: SBP_i=SBP, DBP_i=DBP, Pulse_i=Pulse, RR_i=RR, SpO2_i=SpO2, Temp_i=Temp
vs1demo_imp_name <- select(vs1demo_imp2, ESI_i = ESI)

# Put the imputed ESI next to the original data (bound by row position) and
# flag rows whose original ESI was missing (1 = missing, 0 = observed)
vs1demo_all <- vs1demo %>%
  bind_cols(vs1demo_imp_name) %>%
  mutate(delta_ESI = ifelse(!is.na(ESI), 0, 1))
colnames(vs1demo_all)

In [None]:
# Dimensions of the combined table, then the first rows whose ESI was imputed
dim(vs1demo_all)
vs1demo_all %>%
  filter(delta_ESI == 1) %>%
  head()

In [None]:
# Final column order: identifiers and label, imputed ESI with its indicator,
# demographics, imputed height/weight with their indicators, first vital signs
cohort_demo <- select(
  vs1demo_all,
  anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, label_max24, admit_time,
  ESI_i, delta_ESI,
  gender, race, age, medis, English,
  Height_i, delta_H, Weight_i, delta_W,
  SBP, DBP, Pulse, RR, SpO2, Temp
)
dim(cohort_demo)

### One hot coding for gender and race
* Gender: simply 1 for female and 0 for male
* Race: one-hot coding as usual

In [None]:
# Binary gender: 0 for "Male", 1 otherwise; race becomes a factor for one-hot coding.
# NOTE(review): every non-"Male" value is coded 1 -- confirm gender only takes
# the values Male/Female.
cohort_demo <- cohort_demo %>%
  mutate(
    gender = ifelse(gender != "Male", 1, 0),
    race = as.factor(race)
  )
cohort_demo %>%
  select(gender, race) %>%
  summary()

In [None]:
# One-hot coding for race: replace the race factor with one indicator column
# per level.
# dummyVars() is a caret function, but library(caret) is commented out in the
# import cell, so the unqualified call failed in a fresh session. Calling it
# through the caret namespace only needs caret to be installed, not attached.
dummy <- caret::dummyVars(~ race, data = cohort_demo) # for more variables: ~ gender + race
race_1hot <- data.frame(predict(dummy, newdata = cohort_demo))
cohort_demo <- cohort_demo %>%
  select(-race) %>%
  bind_cols(race_1hot)
ncol(cohort_demo)

In [None]:
# Per-column summary of the table after gender and race coding
summary(cohort_demo)

In [None]:
# Row count and a preview of the first three rows before saving
cohort_demo %>% nrow()
cohort_demo %>% head(n = 3)

In [None]:
# save file: all ESI (and first_val of vital signs imputed)
demo_final_path <- file.path(datadir, "cohort_demo_final.csv")
write.csv(cohort_demo, demo_final_path, row.names = FALSE)

# this is the correct new cohort with at least one component of vital signs
# cohort_has_vs <- cohort_demo %>% select(anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, label_max24, admit_time)

# nrow(cohort_has_vs)
# write.csv(cohort_has_vs, file.path(datadir, "cohort_has_vs.csv"), row.names=FALSE)

### Get the dataset for simple models:
A cohort with complete set of VS, with labels (43008 --> 41654), with the following features
* get back the demographics (with imputed ESI using 1st set of vs)
* vital signs (first values and summary statistics) only (no GCS)

In [None]:
# Load the wide vital-sign summary table
vsum_path <- file.path(datadir, "vitalsum_wide.csv")
vsum_wide <- read.csv(vsum_path)
nrow(vsum_wide)

In [None]:
# Reload the final demographics table written earlier in this notebook
demos_path <- file.path(datadir, "cohort_demo_final.csv")
demos <- read.csv(demos_path)
nrow(demos)
colnames(demos)

In [None]:
# Add the demographic features to the vital-sign summaries.
# The three columns below are dropped from demos before joining. No `by` is
# given, so the join uses every column name the two tables share.
demos <- select(demos, -inpatient_data_id_coded, -label_max24, -admit_time)
data_simple <- vsum_wide %>% left_join(demos)
dim(data_simple)
n_distinct(data_simple$pat_enc_csn_id_coded)
colnames(data_simple)
summary(data_simple)

In [None]:
# update cohort with labels to include only patients with a complete set of VS
# cohort <- read.csv("./Data/cohort_labels.csv")
# NOTE: this replaces the `cohort` object loaded at the top of the notebook
labels_path <- file.path(datadir, "labels_with_death_delta.csv")
cohort <- read.csv(labels_path)

dim(cohort)
tail(cohort, n = 10)

In [None]:
# Restrict the label table to the encounters present in data_simple:
# take its (anon_id, CSN) keys and pull in the matching label rows
encounter_keys <- select(data_simple, anon_id, pat_enc_csn_id_coded)
cohort_final <- left_join(encounter_keys, cohort) # %>%
#                 select(-c(int64_field_0))

dim(cohort_final)
head(cohort_final, n = 1)

In [None]:
# Column names and per-column summary of the final labelled cohort
cohort_final %>% colnames()
cohort_final %>% summary()

In [None]:
# Update data_simple to include the labels: the labelled cohort is the left
# table, joined on every column name it shares with data_simple
data_simple <- cohort_final %>% left_join(data_simple)
dim(data_simple)
colnames(data_simple)

In [None]:
# need to check the label

In [None]:
# write.csv(cohort_final, file.path(datadir, 'cohort_final.csv'), row.names = FALSE)
# write.csv(data_simple, file.path(datadir, "data_simple.csv"), row.names=FALSE)

### EXTRA

In [None]:
# did not miss any BP
# Load the added vital-sign rows and count them per display name
added_vs_path <- file.path(datadir, "added_vs.csv")
added_vs <- read.csv(added_vs_path)
nrow(added_vs)
count(added_vs, row_disp_name)
head(added_vs)