### Descriptions:

Join all feature values (demographics, vitals, labs) into one table in the *long* format first, so that it can later be merged with the feature counts.

Inputs: 
- `2_3_coh2_vitals`: has all processed vital signs (not the summary stats)
- `2_4_coh3_labs`: has all processed labs
- `2_5_coh3_imputedHWESI`: has demographics, imputed height/weight (HW, imputed with 1_2_cohort) and imputed ESI (imputed with the latest cohort)
- `1_5_cohort_final`: 1_3_cohort with processed labels, restricted to encounters that have labs
- `1_4_cohort`: 1_3_cohort with labels, a superset of 1_5_cohort_final

Outputs:
- `2_7_coh5_feature_values`: feature values for the encounters in 1_5_cohort_final
- `2_7_coh4_feature_values`: feature values for the encounters in 1_4_cohort; a superset of `2_7_coh5_feature_values`

### Importing R libraries

In [25]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Set the `repr.matrix.*` display options: at most 250 rows and 30 columns.
options(
  repr.matrix.max.rows = 250,
  repr.matrix.max.cols = 30
)

### Load all datasets:
* demographics with imputed HW and ESI
* vitals: the cohort with at least one complete set of vital signs (VS)
* labs: the cohort with at least one lab result
* cohort_final

In [26]:
# Directories for the raw data, the cohort tables and the feature tables.
datadir <- "../../DataTD"
cohortdir <- "../../OutputTD/1_cohort"
featuredir <- "../../OutputTD/2_features"


# Load each input CSV and echo its row count right after reading it.

# processed vital signs
vitals0 <- read.csv(file = file.path(featuredir, "2_3_coh2_vitals.csv"))
nrow(vitals0)

# processed labs
labs0 <- read.csv(file = file.path(featuredir, "2_4_coh3_labs.csv"))
nrow(labs0)

# demographics with imputed height/weight and ESI
demos <- read.csv(file = file.path(featuredir, "2_5_coh3_imputedHWESI.csv"))
nrow(demos)

# cohort with labels (superset of cohort5)
cohort4 <- read.csv(file = file.path(cohortdir, "1_4_cohort.csv"))
nrow(cohort4)

# final cohort
cohort5 <- read.csv(file = file.path(cohortdir, "1_5_cohort_final.csv"))
nrow(cohort5)

In [3]:
# Encounter-id overlap between the feature tables and the cohorts.
# Trailing numbers are the counts noted from an earlier run.

# vitals vs. cohort4
vitals0$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 1633
cohort4$pat_enc_csn_id_coded %>%
  setdiff(vitals0$pat_enc_csn_id_coded) %>%
  length() # 0

# encounters without labs; cohort4 vs. cohort5
vitals0$pat_enc_csn_id_coded %>%
  setdiff(labs0$pat_enc_csn_id_coded) %>%
  length() # 3986
cohort4$pat_enc_csn_id_coded %>%
  setdiff(labs0$pat_enc_csn_id_coded) %>%
  length() # 2614
cohort4$pat_enc_csn_id_coded %>%
  setdiff(cohort5$pat_enc_csn_id_coded) %>%
  length() # 2614
cohort5$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # recorded as 2614; 0 is expected if cohort5 is a subset of cohort4 -- TODO verify

In [4]:
# Distinct (anon_id, encounter) pairs vs. distinct encounter ids, per table.
# Trailing numbers are the counts noted from an earlier run.

# vitals: 2nd cohort, the largest
vitals0 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow()
vitals0$pat_enc_csn_id_coded %>% unique() %>% length() # 45613

# demographics: 3rd cohort; only encounters with a complete set of VS were
# used for the ESI imputation
demos %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow()
demos$pat_enc_csn_id_coded %>% unique() %>% length() # 44258

# labs: fewer encounters than the cohort
labs0 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow() # 41627
labs0$pat_enc_csn_id_coded %>% unique() %>% length() # 41627

# cohort4
cohort4 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow() # 43980
cohort4$pat_enc_csn_id_coded %>% unique() %>% length() # recorded as 41627 (looks copied from the labs line); 43980 expected -- TODO verify

# cohort5
cohort5 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow() # 41366
cohort5$pat_enc_csn_id_coded %>% unique() %>% length() # recorded as 43980; 41366 expected -- TODO verify

# encounters present in a feature table but absent from cohort4
demos$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 278
vitals0$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 1633
labs0$pat_enc_csn_id_coded %>%
  setdiff(cohort4$pat_enc_csn_id_coded) %>%
  length() # 261

# encounters present in a feature table but absent from cohort5
demos$pat_enc_csn_id_coded %>%
  setdiff(cohort5$pat_enc_csn_id_coded) %>%
  length() # 2892
vitals0$pat_enc_csn_id_coded %>%
  setdiff(cohort5$pat_enc_csn_id_coded) %>%
  length() # 4247
labs0$pat_enc_csn_id_coded %>%
  setdiff(cohort5$pat_enc_csn_id_coded) %>%
  length() # 261

In [5]:
# Show the first row of every input table to inspect columns and types.
head(cohort4, n = 1)
head(cohort5, n = 1)
head(demos, n = 1)
head(vitals0, n = 1)
head(labs0, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>
1,JCd97296,131176000000.0,18290644,2016-02-06 22:31:00+00:00,0,0,,0,0,0,0,0,1325,0,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0,0,0,1,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,3,0,1,52,1,1,165,0,81,0,0,0,0,0,0,0,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,template,features,units,recorded_time,feature_type,values
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00+00:00,0,Custom Formula Data,Temp,,2019-08-31 10:14:00+00:00,vitals,36.9


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,base_name,ord_value,values,reference_low,reference_high,reference_unit,result_in_range_yn,result_flag,result_time,feature_type
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0,Lactate,LACWBL,2.2,2.2,,,mmol/L,,,2015-01-04 03:18:00+00:00,labs


In [6]:
# List the column names of every input data frame.
names(cohort4)
names(cohort5)
names(demos)
names(vitals0)
names(labs0)

# Disabled: parsing admit_time into a date-time.
# cohort <- cohort %>% mutate(admit_time = ymd_hms(admit_time)) 

In [7]:
# Working cohort: encounter keys, admit time and three label columns of cohort4.
cohort <- select(
  cohort4,
  anon_id, pat_enc_csn_id_coded, admit_time,
  first_label, death_24hr_recent_label, death_24hr_max_label
)

In [8]:
# turn the wide format to long for demographics
demo_long <- gather(demos, features, values, ESI_i:race.White, factor_key=TRUE) %>%
                mutate(feature_type = "demo") %>% select(-admit_time) %>% #mutate(admit_time = ymd_hms(admit_time)) %>% 
             right_join(cohort)
                
nrow(demo_long) # --> 43980 * 17 = 747660

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [15]:
# Bring vitals and labs to a common long layout and attach the cohort labels.
# Both tables get the same columns; the measurement timestamp (recorded_time
# for vitals, result_time for labs) is renamed to `time`.
# admit_time is joined as the raw character string (no ymd_hms parsing).
# The join keys are spelled out (the same three the implicit natural join
# used) so that a new shared column cannot silently change the join.
vitals <- vitals0 %>%
  select(
    anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, admit_time, label,
    features, values, feature_type,
    time = recorded_time
  ) %>%
  right_join(
    cohort,
    by = c("anon_id", "pat_enc_csn_id_coded", "admit_time")
  ) # %>% drop_na()

labs <- labs0 %>%
  select(
    anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, admit_time, label,
    features, values, feature_type,
    time = result_time
  ) %>%
  right_join(
    cohort,
    by = c("anon_id", "pat_enc_csn_id_coded", "admit_time")
  ) %>%
  # Removes the all-NA rows created for cohort4 encounters without labs.
  # NOTE(review): drop_na() with no columns checks every column, so a lab row
  # with an NA in any other field is removed as well -- confirm this is intended.
  drop_na()

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time")

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time")



In [17]:
# Number of distinct encounters in each joined table.
demo_long$pat_enc_csn_id_coded %>% unique() %>% length()
vitals$pat_enc_csn_id_coded %>% unique() %>% length()
labs$pat_enc_csn_id_coded %>% unique() %>% length() # 41366, lower because of drop_na()

# First row of each joined table.
head(demo_long, 1)
head(vitals, 1)
head(labs, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,label,features,values,feature_type,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<fct>,<dbl>,<chr>,<chr>,<int>,<int>,<int>
1,JC29f8ad2,131274729058,40679773,0,ESI_i,3,demo,2019-08-31 12:52:00+00:00,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type,time,first_label,death_24hr_recent_label,death_24hr_max_label
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<int>,<int>,<int>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00+00:00,0,Temp,36.9,vitals,2019-08-31 10:14:00+00:00,0,0,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type,time,first_label,death_24hr_recent_label,death_24hr_max_label
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<int>,<int>,<int>
1,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0,Lactate,2.2,labs,2015-01-04 03:18:00+00:00,0,0,0


In [18]:
# combine demos, vitals and labs, long format, with "time"
feats <- bind_rows(demo_long, vitals, labs)
feats <- as.data.frame(unclass(feats))

nrow(feats) #3034259
nrow(feats %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct())
nrow(feats %>% select(pat_enc_csn_id_coded) %>% distinct()) # 41627

feats %>% count(feature_type) # shouldn't have NA
summary(feats$values) 

feature_type,n
<chr>,<int>
demo,747660
labs,1358669
vitals,978717


    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
  -30.00     1.00    19.00    44.57    75.00 11900.00 

In [19]:
# Number of rows per (feature_type, features) combination.
count(group_by(feats, feature_type, features))

feature_type,features,n
<chr>,<chr>,<int>
demo,age,43980
demo,delta_ESI,43980
demo,delta_H,43980
demo,delta_W,43980
demo,English,43980
demo,ESI_i,43980
demo,gender,43980
demo,Height_i,43980
demo,insurance,43980
demo,race.Asian,43980


In [20]:
# rearrange columns
cohort4_feats <- feats %>% select(anon_id, pat_enc_csn_id_coded, admit_time, 
                                 first_label, death_24hr_recent_label, death_24hr_max_label,
                                 feature_type, features, values, time)
nrow(cohort4_feats) #3085046 
length(unique(cohort4_feats$pat_enc_csn_id_coded)) # 43980

In [21]:
# Peek at the first and last three rows of the combined table.
head(cohort4_feats, n = 3)
tail(cohort4_feats, n = 3)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,feature_type,features,values,time
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,0,0,0,demo,ESI_i,3,
2,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,0,0,0,demo,ESI_i,3,
3,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,0,0,0,demo,ESI_i,2,


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,feature_type,features,values,time
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>
3085044,JCd473c6,131281585175,2019-12-10 23:52:00+00:00,1,0,1,labs,O2sat_v,89.6,2019-12-10 22:38:00+00:00
3085045,JCd3262e,131275001383,2019-08-15 16:54:00+00:00,1,0,1,labs,O2sat_v,89.0,2019-08-15 14:56:00+00:00
3085046,JCeb2276,131282270628,2020-02-14 04:58:00+00:00,0,0,0,labs,O2sat_v,36.3,2020-02-14 00:51:00+00:00


In [22]:
# Save the cohort4 feature table without a row-name column.
write.csv(
  cohort4_feats,
  file = file.path(featuredir, "2_7_coh4_feature_values.csv"),
  row.names = FALSE
)

In [23]:
# cohort5 is a subset of cohort4 excluding observations with no labs
cohort5_feats <- cohort4_feats %>% filter(pat_enc_csn_id_coded %in% cohort5$pat_enc_csn_id_coded)
nrow(cohort5_feats) #3012942
length(unique(cohort5_feats$pat_enc_csn_id_coded)) #41366

In [24]:
# Save the cohort5 feature table without a row-name column.
write.csv(
  cohort5_feats,
  file = file.path(featuredir, "2_7_coh5_feature_values.csv"),
  row.names = FALSE
)

In [27]:
# Read the saved cohort5 file back and count its distinct encounters.
df <- read.csv(file = file.path(featuredir, "2_7_coh5_feature_values.csv"))
df$pat_enc_csn_id_coded %>% unique() %>% length()