## Description:
Combine the demographic / imputed features and the vital-sign summary features with the cohort labels to create the dataset for the simple models.

**Inputs**:  
- `2_3_coh3_vital_stats` contains all vital related features
- `2_5_coh3_imputedHWESI` contains all demo, imputed HW and ESI
- merge with `1_5_cohort_final` -- final cohort with labels (and labs) to be consistent with the complex data

**Outputs**: 
- `2_6_simpledata`: in *wide* format

### Importing R libraries

In [17]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Notebook display limits: show up to 250 rows and 30 columns when a
# data frame / matrix is rendered as cell output.
options(
  repr.matrix.max.rows = 250,
  repr.matrix.max.cols = 30
)

### Get the dataset for simple models:

In [3]:
# Project directories, relative to this notebook.
# (datadir is not used by any active code in this section.)
datadir <- "../../DataTD"
cohortdir <- "../../OutputTD/1_cohort"
featuredir <- "../../OutputTD/2_features"

# Vital-sign summary features.
vital_stats <- read.csv(
  file = file.path(featuredir, "2_3_coh3_vital_stats.csv")
)
nrow(vital_stats) # 44258 rows recorded from an earlier run

# Demographics plus imputed height / weight / ESI.
cohort_imp <- read.csv(
  file = file.path(featuredir, "2_5_coh3_imputedHWESI.csv")
)
nrow(cohort_imp) # 44258 rows recorded from an earlier run

# Final cohort with the labels.
cohort <- read.csv(
  file = file.path(cohortdir, "1_5_cohort_final.csv")
)
nrow(cohort) # 41366 rows recorded from an earlier run

In [7]:
# Coverage checks on the encounter ID before merging.
# Rows of vital_stats whose encounter ID is absent from cohort_imp.
sum(!(vital_stats$pat_enc_csn_id_coded %in% cohort_imp$pat_enc_csn_id_coded))
# Distinct cohort encounter IDs that are missing from cohort_imp ...
length(setdiff(cohort$pat_enc_csn_id_coded, cohort_imp$pat_enc_csn_id_coded))
# ... and from vital_stats.
length(setdiff(cohort$pat_enc_csn_id_coded, vital_stats$pat_enc_csn_id_coded))

# Show the first row of each table to compare their column layouts.
head(vital_stats, n = 1)
head(cohort_imp, n = 1)
head(cohort, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,DBP_count,DBP_first_val,DBP_fldiff,DBP_IQRx,DBP_last_val,DBP_madx,DBP_maxx,DBP_meanx,DBP_medianx,DBP_minx,...,SBP_minx,SBP_mmdiff,SBP_sdx,Temp_count,Temp_first_val,Temp_fldiff,Temp_IQRx,Temp_last_val,Temp_madx,Temp_maxx,Temp_meanx,Temp_medianx,Temp_minx,Temp_mmdiff,Temp_sdx
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<int>,...,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00+00:00,0,2,74,-12,6,62,8.8956,74,68,68,62,...,118,6,4.242641,2,36.95,0,0.05,36.95,0.07413,37,36.95,36.95,36.9,0.1,0.07071068


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,3,0,1,52,1,1,165,0,81,0,0,0,0,0,0,0,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_time_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>
1,JCe33305,131063880385,13777312,2015-01-04 08:11:00+00:00,0,0,0,1,0,0,0,0,0,0,0,0,0


In [None]:
# Remove the admission / label bookkeeping columns from both feature tables;
# the labels used downstream are taken from `cohort` instead.
# (Earlier drafts parsed admit_time with ymd_hms() and copied `label` into
# label_max24; neither is needed once these columns are dropped.)
vital_stats <- vital_stats %>%
  select(-c(inpatient_data_id_coded, admit_time, label))

cohort_imp <- cohort_imp %>%
  select(-c(inpatient_data_id_coded, admit_time, label))

In [11]:
# Build the dataset for the simple models: start from the final cohort
# (IDs + the two labels) and attach the demographic / imputed features and
# the vital-sign summary features.
#
# left_join is used for both merges so that only encounters present in
# `cohort` are kept; a full_join would also retain rows that exist only in
# the feature tables.
#
# The join keys are spelled out explicitly. Relying on the implicit natural
# join would silently add any other column the tables happen to share
# (e.g. admit_time or label, if they were not dropped beforehand) to the key.
data_simple <- cohort %>%
  select(anon_id, pat_enc_csn_id_coded, first_label, death_24hr_recent_label) %>%
  left_join(cohort_imp, by = c("anon_id", "pat_enc_csn_id_coded")) %>%
  left_join(vital_stats, by = c("anon_id", "pat_enc_csn_id_coded"))

# Both counts below should equal nrow(cohort) (41366 in an earlier run);
# a larger row count would indicate duplicated keys in a feature table.
dim(data_simple)
nrow(data_simple %>% select(pat_enc_csn_id_coded) %>% distinct())
colnames(data_simple)

Joining, by = c("anon_id", "pat_enc_csn_id_coded")

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [14]:
# Per-column summary of the merged table (ranges, quartiles and means).
data_simple %>% summary()

   anon_id          pat_enc_csn_id_coded  first_label    
 Length:41366       Min.   :1.311e+11    Min.   :0.0000  
 Class :character   1st Qu.:1.312e+11    1st Qu.:0.0000  
 Mode  :character   Median :1.312e+11    Median :0.0000  
                    Mean   :1.312e+11    Mean   :0.1135  
                    3rd Qu.:1.313e+11    3rd Qu.:0.0000  
                    Max.   :1.313e+11    Max.   :1.0000  
 death_24hr_recent_label     ESI_i         delta_ESI           gender      
 Min.   :0.00000         Min.   :1.000   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:0.00000         1st Qu.:2.000   1st Qu.:0.00000   1st Qu.:0.0000  
 Median :0.00000         Median :3.000   Median :0.00000   Median :0.0000  
 Mean   :0.09771         Mean   :2.649   Mean   :0.03936   Mean   :0.4768  
 3rd Qu.:0.00000         3rd Qu.:3.000   3rd Qu.:0.00000   3rd Qu.:1.0000  
 Max.   :1.00000         Max.   :5.000   Max.   :1.00000   Max.   :1.0000  
      age          insurance         English          Height_i

In [16]:
# Save the merged wide-format table into the features output directory,
# without the row-name column.
write.csv(
  x = data_simple,
  file = file.path(featuredir, "2_6_simpledata.csv"),
  row.names = FALSE
)