### Descriptions:

Join all feature values (demographics, vitals, labs) into a single *long*-format table first, so that it can later be merged with the feature counts.

**Inputs:** 
- `6_5_coh2_vitals`: has all processed vital signs (not the summary stats)
- `6_6_coh3_labs`: has all processed labs
- `6_8_coh4_all_imputedHWESI`: has demographics & imputed HW (cohort4_all) & imputed ESI (cohort3 and finalized w/ cohort4)
- `6_7_cohort4`: cohort3 with labels and not in old cohort

**Outputs:**
- `6_9_coh4_feature_values`: feature values for cohort4 only (the new cohort; no overlap with the old cohort)

### Importing R libraries

In [16]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Notebook display limits: show up to 250 rows and 30 columns of a table.
options(
  repr.matrix.max.rows = 250,
  repr.matrix.max.cols = 30
)

### Load all datasets:
* demographic with imputed HW and ESI
* vitals, cohort with at least a complete set of vs only
* labs, cohort with at least a lab result
* cohort_final

In [17]:
# Load the four input tables from the validation output directory and print
# the row count of each one right after it is read.
# NOTE(review): `datadir6` is not referenced anywhere in the visible code.
datadir6 <- "../../DataTD/validation"
valdir <- "../../OutputTD/6_validation"

# Processed vital signs.
vitals0 <- read.csv(file.path(valdir, "6_5_coh2_vitals.csv"))
nrow(vitals0)

# Processed labs.
labs0 <- read.csv(file.path(valdir, "6_6_coh3_labs.csv"))
nrow(labs0)

# Demographics with imputed height/weight and ESI.
demos <- read.csv(file.path(valdir, "6_8_coh4_all_imputedHWESI.csv"))
nrow(demos) # all cohort

# Cohort table restricted to the new cohort.
cohort4 <- read.csv(file.path(valdir, "6_7_cohort4.csv"))
nrow(cohort4) # 16484

In [18]:
# Encounter-ID coverage between tables. Each expression prints how many
# distinct IDs appear in the first table but not in the second.
# Trailing numbers are counts noted by the author from a previous run.
vitals0$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 1633
cohort4$pat_enc_csn_id_coded %>%
  setdiff(vitals0$pat_enc_csn_id_coded) %>%
  length() # 0

vitals0$pat_enc_csn_id_coded %>%
  setdiff(labs0$pat_enc_csn_id_coded) %>%
  length() # 3986
cohort4$pat_enc_csn_id_coded %>%
  setdiff(labs0$pat_enc_csn_id_coded) %>%
  length() # 2614

In [19]:
# For each table, print (a) the number of distinct patient/encounter pairs and
# (b) the number of distinct encounter IDs; the two should agree when every
# encounter belongs to exactly one patient.
# NOTE(review): the trailing numbers were recorded in an earlier run and may
# not match the current cohort (e.g. cohort4 has 16484 rows per the read step).
vitals0 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow()
vitals0$pat_enc_csn_id_coded %>% unique() %>% length() # 2nd cohort, larger 45613

demos %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow()
demos$pat_enc_csn_id_coded %>% unique() %>% length() # 3rd cohort, 44258, using only those with a complete set of VS for ESI imputation

labs0 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow() # 41627
labs0$pat_enc_csn_id_coded %>% unique() %>% length() # 41627 (similar before, labs have less than cohort)

cohort4 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow() # 43980
cohort4$pat_enc_csn_id_coded %>% unique() %>% length() # 41627 (similar before, labs have less than cohort)

# Encounter IDs present in each feature table but absent from cohort4.
demos$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 278
vitals0$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 1633
labs0$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 261

In [20]:
# Show the first row of every input table to compare their columns and types.
head(cohort4, n = 1)
head(demos, n = 1)
head(vitals0, n = 1)
head(labs0, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max,previous_icu_visit
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>
1,JC1000116,131295313275,57868578,2020-09-29 22:45:00+00:00,0,0,0,1,0,0,0,0,0,0,0,0,0,False


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,3,0,1,52,1,1,165,0,81,0,0,0,0,0,0,0,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,template,features,units,recorded_time,feature_type,values
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,JC1000116,131295313275,57868578,2020-09-29 22:45:00+00:00,0,Custom Formula Data,Temp,,2020-09-29 15:55:00+00:00,vitals,36.6


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,base_name,ord_value,values,reference_low,reference_high,reference_unit,result_in_range_yn,result_flag,result_time,feature_type
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>
1,JC2013943,131288000000.0,53346407,2020-04-11 22:13:00+00:00,0,O2sat_a,O2SATA,98,98,95,98,%,,,2020-04-11 21:48:00+00:00,labs


In [21]:
# Print the column names of each input table.
names(cohort4)
names(demos)
names(vitals0)
names(labs0)

# Disabled: parse admit_time into a date-time.
# cohort <- cohort %>% mutate(admit_time = ymd_hms(admit_time)) 

In [22]:
# Reduce cohort4 to the identifier columns, the admit time and the three
# label columns that are carried through to the final feature table.
cohort <- select(
  cohort4,
  anon_id, pat_enc_csn_id_coded, admit_time,
  first_label, death_24hr_recent_label, death_24hr_max_label
)

In [23]:
# Reshape demographics from wide to long: every column from ESI_i through
# race.White becomes one (features, values) row per encounter, tagged with
# feature_type = "demo".
# demos' own admit_time is dropped before the join so that the admit_time
# column of the result comes from `cohort`.
# gather() is kept here because it stacks the data column by column, which
# fixes the row order of the combined output.
demo_long <- demos %>%
  gather(features, values, ESI_i:race.White, factor_key = TRUE) %>%
  mutate(feature_type = "demo") %>%
  select(-admit_time) %>%
  # Join keys are stated explicitly (same keys the implicit join resolved to)
  # so that a column later shared by both tables cannot silently become a key.
  # right_join keeps every cohort encounter, i.e. only the new cohort.
  right_join(cohort, by = c("anon_id", "pat_enc_csn_id_coded"))

nrow(demo_long) # recorded run: 16484 encounters * 17 features = 280228

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [24]:
# Bring vitals and labs into the same long layout as the demographics:
# identifier columns, feature name/value/type, and a common `time` column
# (recorded_time for vitals, result_time for labs), then restrict both to the
# encounters in `cohort`.
# Join keys are stated explicitly (same keys the implicit join resolved to) so
# that a column later shared by both tables cannot silently become a key.
vitals <- vitals0 %>%
  select(
    anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, admit_time, label,
    features, values, feature_type,
    time = recorded_time
  ) %>%
  right_join(cohort, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time"))

# Cohort encounters without any lab come out of the right join as rows of NAs;
# drop_na() removes them. Note that it drops a row with NA in ANY column.
labs <- labs0 %>%
  select(
    anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, admit_time, label,
    features, values, feature_type,
    time = result_time
  ) %>%
  right_join(cohort, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time")) %>%
  drop_na() # due to csn without labs in cohort4

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time")

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time")



In [25]:
# Number of distinct encounters left in each long table after the joins.
demo_long$pat_enc_csn_id_coded %>% unique() %>% length()
vitals$pat_enc_csn_id_coded %>% unique() %>% length()
labs$pat_enc_csn_id_coded %>% unique() %>% length() # 41366 due to drop NA

# First row of each long table, to confirm the layouts line up.
head(demo_long, n = 1)
head(vitals, n = 1)
head(labs, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,label,features,values,feature_type,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<fct>,<dbl>,<chr>,<chr>,<int>,<int>,<int>
1,JC1000116,131295313275,57868578,0,ESI_i,3,demo,2020-09-29 22:45:00+00:00,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type,time,first_label,death_24hr_recent_label,death_24hr_max_label
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<int>,<int>,<int>
1,JC1000116,131295313275,57868578,2020-09-29 22:45:00+00:00,0,Temp,36.6,vitals,2020-09-29 15:55:00+00:00,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type,time,first_label,death_24hr_recent_label,death_24hr_max_label
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<int>,<int>,<int>
1,JC2013943,131288000000.0,53346407,2020-04-11 22:13:00+00:00,0,O2sat_a,98,labs,2020-04-11 21:48:00+00:00,0,0,0


In [26]:
# Stack demographics, vitals and labs into one long table. Demographic rows
# have no `time` column, so bind_rows() leaves it missing for them.
# unclass() + as.data.frame() turns the result into a plain data.frame.
feats <- as.data.frame(unclass(bind_rows(demo_long, vitals, labs)))

# Size checks: total rows, distinct patient/encounter pairs, distinct encounters.
nrow(feats) #3034259
feats %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow()
feats %>% distinct(pat_enc_csn_id_coded) %>% nrow() # 41627

# Rows per feature type, then the distribution of the feature values.
count(feats, feature_type) # shouldn't have NA
summary(feats$values) 

feature_type,n
<chr>,<int>
demo,280228
labs,506845
vitals,388607


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -31.20    1.00   20.00   45.27   78.00 7000.00 

In [27]:
# Row count for every (feature_type, features) combination.
count(group_by(feats, feature_type, features))

feature_type,features,n
<chr>,<chr>,<int>
demo,age,16484
demo,delta_ESI,16484
demo,delta_H,16484
demo,delta_W,16484
demo,English,16484
demo,ESI_i,16484
demo,gender,16484
demo,Height_i,16484
demo,insurance,16484
demo,race.Asian,16484


In [28]:
# Final column layout: identifiers, admit time, labels, then the feature
# description, its value and its timestamp. Other columns are dropped.
cohort4_feats <- feats %>%
  select(
    anon_id, pat_enc_csn_id_coded, admit_time,
    first_label, death_24hr_recent_label, death_24hr_max_label,
    feature_type, features, values, time
  )
nrow(cohort4_feats) # 1175680 
cohort4_feats$pat_enc_csn_id_coded %>% unique() %>% length() # 16484

In [29]:
# Inspect the first and last three rows of the final table.
head(cohort4_feats, n = 3)
tail(cohort4_feats, n = 3)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,feature_type,features,values,time
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>
1,JC1000116,131295313275,2020-09-29 22:45:00+00:00,0,0,0,demo,ESI_i,3,
2,JC1000939,131295018112,2020-08-22 11:41:00+00:00,0,1,1,demo,ESI_i,2,
3,JC1001688,131288774622,2020-06-19 21:36:00+00:00,0,0,0,demo,ESI_i,3,


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,feature_type,features,values,time
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>
1175678,JC2701145,131311244469,2021-06-03 08:02:00+00:00,1,0,1,labs,O2sat_v,35.0,2021-06-02 23:52:00+00:00
1175679,JC543041,131314663470,2021-07-07 21:26:00+00:00,1,1,1,labs,O2sat_v,71.0,2021-07-07 20:01:00+00:00
1175680,JC1804671,131317693203,2021-08-19 11:24:00+00:00,0,0,0,labs,O2sat_v,58.2,2021-08-19 09:15:00+00:00


In [30]:
# Save the long-format feature table (new cohort only), without row names.
write.csv(
  x = cohort4_feats,
  file = file.path(valdir, "6_9_coh4_feature_values.csv"),
  row.names = FALSE
)

In [35]:
# Load the feature-value table written by the earlier 2_features step, then
# print its row count and its number of distinct encounters.
featuredir <- "../../OutputTD/2_features"
cohort4_0_feats <- read.csv(file.path(featuredir, "2_7_coh4_feature_values.csv"))
nrow(cohort4_0_feats) # 3085046
cohort4_0_feats$pat_enc_csn_id_coded %>% unique() %>% length()

In [36]:
# Sanity check: the new cohort must not overlap with the old one.
# Fixed: the first comparison referenced `cohort4_0`, which is never defined;
# the old-cohort table loaded above is `cohort4_0_feats`.
# With no overlap, each setdiff equals the number of distinct encounters in
# its first argument.
length(setdiff(cohort4_feats$pat_enc_csn_id_coded, cohort4_0_feats$pat_enc_csn_id_coded))
length(setdiff(cohort4_0_feats$pat_enc_csn_id_coded, cohort4_feats$pat_enc_csn_id_coded))
# Direct check: number of encounters present in both tables (expected 0).
length(intersect(cohort4_feats$pat_enc_csn_id_coded, cohort4_0_feats$pat_enc_csn_id_coded))