### Descriptions:
COHORT:
- Original cohort queried from BQ, ER patients admitted to inpatients, 2015 - 2020, `1_1_cohort`: 
- Cohort with some other criteria, age >= 18 and full code only, `1_2_cohort`: 
- Cohort after removing patients without a complete set of vital signs, `1_3_cohort`: 
- *NEW*: cohort with labs: `1_3_cohort` is used to query labs, then only encounters with at least one lab result are retained, `cohort3L_withlabs`
- Final cohort, with labels from Tiffany's, `1_4_cohort`: 

JOIN all features together first, then with the final cohort with labels:

Inputs: 
- Final cohort with labels from Tiffany
- `2_3_coh2_vitals`: has all processed vital signs (not the summary stats)
- `2_4_coh3_labs`: has all processed labs
- `2_5_coh3_imputedHWESI`: has both demographics, imputed HW (with 1_2_cohort) and imputed ESI (with the latest cohort)

Outputs: 


### Importing R libraries

In [11]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Set the repr display options for the maximum rows/columns shown per table.
options(repr.matrix.max.rows = 250, repr.matrix.max.cols = 30)

### Call back all datasets: 
* demographic with ESI
* vitals with GCS (note that this data has 43320 rows, but for ESI imputation, remove those with GCS as well, so we have 43291 left)
* labs, still with 9999999 values

In [62]:
# Read the cohort and the pre-processed feature tables, printing the row
# count of each one. Paths are relative to the notebook's working directory.
datadir <- "../../DataTD"
cohortdir <- "../../OutputTD/1_cohort"
featuredir <- "../../OutputTD/2_features"

# Cohort retained after the lab query.
cohort <- read.csv(file.path(cohortdir, "cohort3L_withlabs.csv"))
nrow(cohort)

# Processed labs.
labs0 <- read.csv(file.path(featuredir, "2_4_coh3_labs.csv"))
nrow(labs0)

# Demographics with imputed height/weight and ESI.
demos <- read.csv(file.path(featuredir, "2_5_coh3_imputedHWESI.csv"))
nrow(demos)

# Processed vitals. The row count must use `vitals0`: `vitals` is only
# created later, after the join with the cohort.
vitals0 <- read.csv(file.path(featuredir, "2_3_coh2_vitals.csv"))
nrow(vitals0)


In [61]:
# Sanity checks: number of distinct encounters in each table, counted both by
# (anon_id, pat_enc_csn_id_coded) pair and by encounter id alone.
demos %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow()
n_distinct(demos$pat_enc_csn_id_coded) # 3rd cohort, 44258

vitals0 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow()
n_distinct(vitals0$pat_enc_csn_id_coded) # 2nd cohort, larger 45613

labs0 %>% distinct(anon_id, pat_enc_csn_id_coded) %>% nrow() # 41627
n_distinct(labs0$pat_enc_csn_id_coded) # 41627 (similar before, labs have less than cohort)

# setdiff(A, B): encounter ids present in A but not in B (here B = cohort).
length(setdiff(demos$pat_enc_csn_id_coded, cohort$pat_enc_csn_id_coded)) # 2631 (original note also records 0)
length(setdiff(vitals0$pat_enc_csn_id_coded, cohort$pat_enc_csn_id_coded)) # 3986
length(setdiff(labs0$pat_enc_csn_id_coded, cohort$pat_enc_csn_id_coded)) # 0


In [65]:
# Show the first row of every input table to compare their column layouts.
head(cohort, n = 1)
head(demos, n = 1)
head(vitals0, n = 1)
head(labs0, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>
1,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,3,0,1,52,1,1,165,0,81,0,0,0,0,0,0,0,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,template,features,units,recorded_time,feature_type,values
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00+00:00,0,Custom Formula Data,Temp,,2019-08-31 10:14:00+00:00,vitals,36.9


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,base_name,ord_value,values,reference_low,reference_high,reference_unit,result_in_range_yn,result_flag,result_time,feature_type
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0,Lactate,LACWBL,2.2,2.2,,,mmol/L,,,2015-01-04 03:18:00+00:00,labs


In [73]:
# List the column names of each input table.
# `vitals` and `labs` do not exist until the join step later in the notebook,
# so the raw tables `vitals0` and `labs0` are inspected here.
colnames(cohort)
colnames(demos)
colnames(vitals0)
colnames(labs0)

# Parse admit_time from text into a date-time; the feature tables get the
# same treatment before they are joined to the cohort.
cohort <- cohort %>%
  mutate(admit_time = ymd_hms(admit_time))

In [74]:
# Reshape demographics to long format (one row per encounter and feature),
# tag the rows as "demo", and right-join onto `cohort` so every cohort
# encounter is kept.
# gather() is superseded by pivot_longer(); it is kept here because switching
# would change the row order of the resulting table.
demo_long <- demos %>%
  gather(features, values, ESI_i:race.White, factor_key = TRUE) %>%
  mutate(
    feature_type = "demo",
    admit_time = ymd_hms(admit_time)
  ) %>%
  # Explicit join keys (the same five columns the implicit natural join used)
  # so a new shared column cannot silently change the join.
  right_join(
    cohort,
    by = c(
      "anon_id", "pat_enc_csn_id_coded", "inpatient_data_id_coded",
      "admit_time", "label"
    )
  )

# Expected: n_cohort * 17 value columns (recorded output: 41627 * 17 = 707659).
nrow(demo_long)

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "inpatient_data_id_coded", "admit_time", "label")



In [75]:
# Bring vitals and labs into a common long layout: keep the encounter keys,
# the feature name/value/type, and rename each table's timestamp column to
# `time`. Each table is then right-joined onto `cohort` so every cohort
# encounter is kept.
# The join keys are spelled out (the same five columns the implicit natural
# join used) so a new shared column cannot silently change the join.
vitals <- vitals0 %>%
  select(
    anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, admit_time, label,
    features, values, feature_type, time = recorded_time
  ) %>%
  mutate(admit_time = ymd_hms(admit_time)) %>%
  right_join(
    cohort,
    by = c(
      "anon_id", "pat_enc_csn_id_coded", "inpatient_data_id_coded",
      "admit_time", "label"
    )
  )

labs <- labs0 %>%
  select(
    anon_id, pat_enc_csn_id_coded, inpatient_data_id_coded, admit_time, label,
    features, values, feature_type, time = result_time
  ) %>%
  mutate(admit_time = ymd_hms(admit_time)) %>%
  right_join(
    cohort,
    by = c(
      "anon_id", "pat_enc_csn_id_coded", "inpatient_data_id_coded",
      "admit_time", "label"
    )
  )

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "inpatient_data_id_coded", "admit_time", "label")

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "inpatient_data_id_coded", "admit_time", "label")



In [82]:
# After the joins: count the distinct encounters in each long table.
n_distinct(demo_long$pat_enc_csn_id_coded)
n_distinct(vitals$pat_enc_csn_id_coded)
n_distinct(labs$pat_enc_csn_id_coded)

# Preview the first row of each long table.
head(demo_long, 1)
head(vitals, 1)
head(labs, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<dttm>,<int>,<fct>,<dbl>,<chr>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,ESI_i,3,demo


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type,time
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<dttm>,<int>,<chr>,<dbl>,<chr>,<chr>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,Temp,36.9,vitals,2019-08-31 10:14:00+00:00


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type,time
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<dttm>,<int>,<chr>,<dbl>,<chr>,<chr>
1,JCe33305,131063880385,13777312,2015-01-04 08:11:00,0,Lactate,2.2,labs,2015-01-04 03:18:00+00:00


In [89]:
# combine demos, vitals and labs, long format, with "time"
feats <- bind_rows(demo_long, vitals, labs)
feats <- as.data.frame(unclass(feats))

nrow(feats) #3034259
nrow(feats %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct())
nrow(feats %>% select(pat_enc_csn_id_coded) %>% distinct()) # 41627

feats %>% count(feature_type)

feature_type,n
<chr>,<int>
demo,707659
labs,1367422
vitals,959178


In [90]:
# Distribution of all feature values, then row counts per feature.
summary(feats[["values"]])
feats %>%
  group_by(feature_type, features) %>%
  count()

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
  -30.00     1.03    19.00    44.74    76.00 11900.00 

feature_type,features,n
<chr>,<chr>,<int>
demo,age,41627
demo,delta_ESI,41627
demo,delta_H,41627
demo,delta_W,41627
demo,English,41627
demo,ESI_i,41627
demo,gender,41627
demo,Height_i,41627
demo,insurance,41627
demo,race.Asian,41627


In [91]:
# Preview the first six rows of the combined long table.
head(feats, n = 6L)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,features,values,feature_type,time
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<dttm>,<int>,<chr>,<dbl>,<chr>,<chr>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,ESI_i,3,demo,
2,JC29f8ad3,131278291027,42992239,2019-10-05 23:48:00,0,ESI_i,3,demo,
3,JC29f8b9c,131266787806,36261582,2019-05-05 01:07:00,0,ESI_i,2,demo,
4,JC29f8beb,131264387263,34626013,2019-03-15 03:35:00,0,ESI_i,3,demo,
5,JC29f8beb,131279241689,43527040,2019-11-27 15:29:00,0,ESI_i,3,demo,
6,JC29f8bef,131280937356,44544574,2019-11-30 10:35:00,0,ESI_i,3,demo,


In [92]:
# Save the combined long-format feature table, without row names.
write.csv(
  x = feats,
  file = file.path(featuredir, "2_7_cohort3L_features.csv"),
  row.names = FALSE
)

In [82]:
# NOTE(review): `cohort_feats` is not defined anywhere in the visible notebook
# (the combined table is named `feats`), so this cell errors in a fresh
# top-to-bottom run. Its older execution count suggests it may be a leftover
# cell. Confirm whether it should be removed or should write `feats`.
write.csv(cohort_feats, file.path(featuredir, "cohort_features.csv"), row.names=FALSE)