### Descriptions:

JOIN all features with values together first, in the *long* format, to be merged with feature counts

**Inputs:** 
- `6_5_coh2_vitals`: has all processed vital signs (not the summary stats)
- `6_6_coh3_labs`: has all processed labs
- `6_8_coh4_all_imputedHWESI`: has demographics & imputed HW (cohort4_all) & imputed ESI (cohort3 and finalized w/ cohort4)
- `6_4_cohort4`: cohort3 with labels and not in old cohort

**Outputs:**
- `6_9_coh4_feature_values`: used cohort4, only new cohort, no overlapping!

### Importing R libraries

In [22]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

### Call back all datasets: 
* demographic with imputed HW and ESI
* vitals, cohort with at least a complete set of vs only
* labs, cohort with at least a lab result
* cohort_final

In [23]:
# read inputs
datadir6 = "../../DataTD/shc2021"
outdir = "../../OutputTD/shc2021"

vitals0 <- read.csv(file.path(outdir, '5_coh2_vitals.csv'))
nrow(vitals0) #1418197

labs0 <- read.csv(file.path(outdir, '6_coh3_labs.csv'))
nrow(labs0) #1887672

demos <- read.csv(file.path(outdir, '8_coh4_all_imputedHWESI.csv'))
nrow(demos) # all cohort4 52532

# only new cohort with new features (door-to-dispo time and note counts x3)
cohort4 <- read.csv(file.path(outdir, '11_coh4_noOR_noDKA_edtime_notecount.csv'))
nrow(cohort4) # 51846

In [24]:
names(cohort4)

In [25]:
# these come from different cohort
length(setdiff(vitals0$pat_enc_csn_id_coded, cohort4$pat_enc_csn_id_coded)) # 11094 different cohort
length(setdiff(cohort4$pat_enc_csn_id_coded, vitals0$pat_enc_csn_id_coded)) # 0

length(setdiff(vitals0$pat_enc_csn_id_coded, labs0$pat_enc_csn_id_coded)) # 5109 different cohort
length(setdiff(cohort4$pat_enc_csn_id_coded, labs0$pat_enc_csn_id_coded)) # 2593 less than before,, good

In [26]:
nrow(vitals0 %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct())
length(unique(vitals0$pat_enc_csn_id_coded)) # 2nd cohort, larger 45613

nrow(demos %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct())
length(unique(demos$pat_enc_csn_id_coded)) # 3nd cohort, 44258, using only those with a complete set of VS for ESI imputation

nrow(labs0 %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct()) # 41627
length(unique(labs0$pat_enc_csn_id_coded)) # 41627 (similar before, labs have less than cohort)

nrow(cohort4 %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct()) # 43980
length(unique(cohort4$pat_enc_csn_id_coded)) # 41627 (similar before, labs have less than cohort)

length(setdiff(demos$pat_enc_csn_id_coded, cohort4$pat_enc_csn_id_coded)) # 278
length(setdiff(vitals0$pat_enc_csn_id_coded, cohort4$pat_enc_csn_id_coded)) # 1633
length(setdiff(labs0$pat_enc_csn_id_coded, cohort4$pat_enc_csn_id_coded)) # 261

In [27]:
head(cohort4, 1)
head(demos, 1)
head(vitals0, 1)
head(labs0, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label,hours_in_ed,door_to_dispo_all_notes,door_to_dispo_rn_notes,door_to_dispo_md_notes
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00,0,0,0,0,0,0,20.37,33,23,8


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC1000116,131066472308,19328596,2015-01-28 00:46:00,0,3,0,1,38,1,0,154,0,73.05,0,0,0,0,1,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,template,features,units,recorded_time,feature_type,values
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,JC1000116,131066472308,19328596,2015-01-28 00:46:00+00:00,0,ICU Intake & Output,DBP,,2015-01-27 07:30:00+00:00,vitals,57


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,base_name,ord_value,values,reference_low,reference_high,reference_unit,result_in_range_yn,result_flag,result_time,feature_type
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,JC1749400,131065392332,19198682,2015-01-13 13:49:00+00:00,1,pH_a,PHA,7.32,7.32,7.35,7.45,,,Low,2015-01-13 09:21:00+00:00,labs


In [28]:
colnames(cohort4)
colnames(demos)
colnames(vitals0)
colnames(labs0)

# cohort <- cohort %>% mutate(admit_time = ymd_hms(admit_time)) 

In [31]:
# cohort <- cohort4 %>% select(anon_id, pat_enc_csn_id_coded, admit_time, first_label, death_24hr_recent_label, death_24hr_max_label)
cohort <- cohort4 %>% select(anon_id, pat_enc_csn_id_coded, admit_time, first_label,
                             death_3hr_recent_label, death_6hr_recent_label, death_9hr_recent_label,
                             death_12hr_recent_label, death_24hr_recent_label)
nrow(cohort)

In [32]:
time_count <- cohort4 %>% select(anon_id, pat_enc_csn_id_coded, ed_hour = hours_in_ed, 
                                 rn_note=door_to_dispo_rn_notes, md_note=door_to_dispo_md_notes,
                                 all_note=door_to_dispo_all_notes) %>%
                mutate(feature_type = 'edtime_notecount')
nrow(time_count)
sum(is.na(time_count)) # na was converted to 0, no notes
summary(time_count)

# turn to long format
tc_long <- gather(time_count, features, values, ed_hour:all_note, factor_key=TRUE) %>%
                mutate(feature_type = "time_count")
nrow(tc_long)
nrow(time_count) * 4

   anon_id          pat_enc_csn_id_coded    ed_hour           rn_note       
 Length:51846       Min.   :1.311e+11    Min.   :  0.020   Min.   :  0.000  
 Class :character   1st Qu.:1.312e+11    1st Qu.:  2.300   1st Qu.:  1.000  
 Mode  :character   Median :1.313e+11    Median :  3.500   Median :  3.000  
                    Mean   :1.312e+11    Mean   :  4.816   Mean   :  4.449  
                    3rd Qu.:1.313e+11    3rd Qu.:  5.250   3rd Qu.:  5.000  
                    Max.   :1.313e+11    Max.   :190.430   Max.   :181.000  
    md_note           all_note      feature_type      
 Min.   :  0.000   Min.   :  0.00   Length:51846      
 1st Qu.:  1.000   1st Qu.:  4.00   Class :character  
 Median :  3.000   Median :  7.00   Mode  :character  
 Mean   :  4.466   Mean   : 10.02                     
 3rd Qu.:  6.000   3rd Qu.: 12.00                     
 Max.   :156.000   Max.   :265.00                     

In [33]:
# turn the wide format to long for demographics
demo_long <- gather(demos, features, values, ESI_i:race.White, factor_key=TRUE) %>%
                mutate(feature_type = "demo") %>% 
                select(-admit_time, -label, -inpatient_data_id_coded) %>% #mutate(admit_time = ymd_hms(admit_time)) %>% 
             right_join(cohort)

sum(is.na(demos))
nrow(demo_long) # = nrow(cohort) *17 = 881382
nrow(cohort) * 17

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [34]:
head(demo_long, 2)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,features,values,feature_type,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label
Unnamed: 0_level_1,<chr>,<dbl>,<fct>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC1000116,131066472308,ESI_i,3,demo,2015-01-28 00:46:00,0,0,0,0,0,0
2,JC1000116,131295313275,ESI_i,3,demo,2020-09-29 22:45:00,0,0,0,0,0,0


In [35]:
# drop admit_time from vitals and labs because they might be in different formats utc -- mess up the merge
vitals <- vitals0 %>% select(anon_id, pat_enc_csn_id_coded, features, values, feature_type, 
                             time = recorded_time) %>% #mutate(admit_time = ymd_hms(admit_time)) %>%
                         right_join(cohort) #%>% drop_na()
labs <- labs0 %>% select(anon_id, pat_enc_csn_id_coded, features, values, feature_type, 
                         time = result_time) %>% #mutate(admit_time = ymd_hms(admit_time)) %>% 
                    right_join(cohort) %>% drop_na() # due to csn without labs in cohort4

Joining, by = c("anon_id", "pat_enc_csn_id_coded")

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [36]:
length(unique(demo_long$pat_enc_csn_id_coded))
length(unique(vitals$pat_enc_csn_id_coded))
length(unique(labs$pat_enc_csn_id_coded)) # 49253 due to drop NA
length(unique(tc_long$pat_enc_csn_id_coded)) # 51846

head(demo_long, n=1)
head(vitals, n=1)
head(labs, n=1)
head(tc_long, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,features,values,feature_type,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label
Unnamed: 0_level_1,<chr>,<dbl>,<fct>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC1000116,131066472308,ESI_i,3,demo,2015-01-28 00:46:00,0,0,0,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,features,values,feature_type,time,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC1000116,131066472308,DBP,57,vitals,2015-01-27 07:30:00+00:00,2015-01-28 00:46:00,0,0,0,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,features,values,feature_type,time,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC559806,131126000000.0,TCO2_a,30,labs,2015-10-27 06:24:00+00:00,2015-10-27 06:57:00,1,1,1,1,1,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<fct>,<dbl>
1,JC1000116,131066472308,time_count,ed_hour,20.37


In [37]:
summary(tc_long$values)
summary(demo_long$values)
summary(vitals$values)
summary(labs$values) # negatives

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   2.000   4.000   5.938   7.000 265.000 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    0.00    0.00   18.19    1.00  322.00 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   36.70   72.00   72.28  102.00  419.00 

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
  -30.00     3.40    12.70    38.75    38.00 11900.00 

In [38]:
# combine demos, vitals and labs, long format, with "time"
feats <- bind_rows(demo_long, tc_long, vitals, labs)
feats <- as.data.frame(unclass(feats))

nrow(feats) #3711563
nrow(feats %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct()) # 51846
nrow(feats %>% select(pat_enc_csn_id_coded) %>% distinct()) # 51846

feats %>% count(feature_type) # shouldn't have NA
summary(feats$values) 

feature_type,n
<chr>,<int>
demo,881382
labs,1610383
time_count,207384
vitals,1167952


    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
  -30.00     1.02    17.00    42.43    72.00 11900.00 

In [39]:
feats %>% group_by(feature_type, features) %>% count()

feature_type,features,n
<chr>,<chr>,<int>
demo,age,51846
demo,delta_ESI,51846
demo,delta_H,51846
demo,delta_W,51846
demo,English,51846
demo,ESI_i,51846
demo,gender,51846
demo,Height_i,51846
demo,insurance,51846
demo,race.Asian,51846


In [40]:
names(feats)

In [41]:
# rearrange columns
cohort_feats <- feats %>% select(anon_id, pat_enc_csn_id_coded, admit_time, first_label, 
                                 death_3hr_recent_label, death_6hr_recent_label, death_9hr_recent_label, 
                                 death_12hr_recent_label, death_24hr_recent_label,
                                 feature_type, features, values, time)
nrow(cohort_feats) # 3867101 
length(unique(cohort_feats$pat_enc_csn_id_coded)) # 51846

In [42]:
head(cohort_feats, 1)
tail(cohort_feats, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label,feature_type,features,values,time
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>
1,JC1000116,131066472308,2015-01-28 00:46:00,0,0,0,0,0,0,demo,ESI_i,3,


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label,feature_type,features,values,time
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>
3867101,JC2553680,131280000000.0,2019-12-07 07:22:00,0,0,0,0,0,0,labs,O2sat_v,99.5,2019-12-07 07:03:00+00:00


In [43]:
# only have new cohort
write.csv(cohort_feats, file.path(outdir, "12_coh4_noOR_noDKA_feature_values.csv"), row.names=FALSE)