## Description
Combine the demographic / imputed features with the vital-sign summary statistics and the labels to form the dataset used by the simple models.

**Inputs**:  
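- `2_3_coh3_vital_stats` contains all vital-sign-related features
- `2_5_coh3_imputedHWESI` contains all demographics, imputed height/weight (HW) and ESI
- `1_4_` -- cohort with labels from Tiffany (not read at present: the loading code below is commented out, and both inputs above already carry a `label` column)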

**Outputs**: 
- `2_4_coh3_simpledata`
- `2_4_coh_simpledata`

### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Notebook display limits: show up to 250 rows and 30 columns of a table
options(
  repr.matrix.max.rows = 250,
  repr.matrix.max.cols = 30
)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




### Get the dataset for simple models:

In [10]:
# Locations of the raw data, cohort outputs and feature outputs
# (all relative to the notebook's working directory)
datadir <- "../../DataTD"
cohortdir <- "../../OutputTD/1_cohort"
featuredir <- "../../OutputTD/2_features"

# Vital-sign summary features
vital_stats <- read.csv(file.path(featuredir, "2_3_coh3_vital_stats.csv"))
nrow(vital_stats) # 44258

# Demographics plus imputed height/weight and ESI
cohort_imp <- read.csv(file.path(featuredir, "2_5_coh3_imputedHWESI.csv"))
nrow(cohort_imp) # 44258

# Separate label file is currently not loaded:
# cohort_labels <- read.csv(file.path(featuredir, '1_4_.csv'))
# nrow(cohort_labels)

In [3]:
# Count rows of vital_stats whose encounter id does not appear in cohort_imp
sum(
  !(vital_stats$pat_enc_csn_id_coded %in% cohort_imp$pat_enc_csn_id_coded)
)

# Show the first row of each input so their columns can be compared
head(vital_stats, n = 1)
head(cohort_imp, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,DBP_count,DBP_first_val,DBP_fldiff,DBP_IQRx,DBP_last_val,DBP_madx,DBP_maxx,DBP_meanx,DBP_medianx,DBP_minx,⋯,SBP_minx,SBP_mmdiff,SBP_sdx,Temp_count,Temp_first_val,Temp_fldiff,Temp_IQRx,Temp_last_val,Temp_madx,Temp_maxx,Temp_meanx,Temp_medianx,Temp_minx,Temp_mmdiff,Temp_sdx
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<int>,⋯,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00+00:00,0,2,74,-12,6,62,8.8956,74,68,68,62,⋯,118,6,4.242641,2,36.95,0,0.05,36.95,0.07413,37,36.95,36.95,36.9,0.1,0.07071068


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,race.Asian,race.Black,race.Native.American,race.Other,race.Pacific.Islander,race.Unknown,race.White
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,3,0,1,52,1,1,165,0,81,0,0,0,0,0,0,0,1


In [4]:
# Parse admit_time into date-times in both tables. The raw CSV strings are
# formatted differently (vital_stats carries a "+00:00" suffix, cohort_imp does
# not), so the unparsed strings would not match each other in a join.
vital_stats <- vital_stats %>% mutate(admit_time = ymd_hms(admit_time))
cohort_imp <- cohort_imp %>% mutate(admit_time = ymd_hms(admit_time))

# Join on the identifier columns shared by both tables. The keys are spelled
# out instead of relying on the implicit natural join, so a column that later
# appears in both inputs cannot silently become an extra join key.
# (Alternative if admit_time ever disagrees between the tables:
#  cohort_imp %>% select(-admit_time) %>% inner_join(vital_stats).)
data_simple <- inner_join(
  cohort_imp,
  vital_stats,
  by = c(
    "anon_id",
    "pat_enc_csn_id_coded",
    "inpatient_data_id_coded",
    "admit_time",
    "label"
  )
)

# Sanity checks: joined dimensions, number of distinct encounters, column names
dim(data_simple)
n_distinct(data_simple$pat_enc_csn_id_coded) # expected 44258, as in both inputs
colnames(data_simple)

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "inpatient_data_id_coded", "admit_time", "label")



In [5]:
# Preview the first joined row
head(data_simple, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label,ESI_i,delta_ESI,gender,age,insurance,English,Height_i,delta_H,Weight_i,delta_W,⋯,SBP_minx,SBP_mmdiff,SBP_sdx,Temp_count,Temp_first_val,Temp_fldiff,Temp_IQRx,Temp_last_val,Temp_madx,Temp_maxx,Temp_meanx,Temp_medianx,Temp_minx,Temp_mmdiff,Temp_sdx
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<dttm>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,⋯,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,JC29f8ad2,131274729058,40679773,2019-08-31 12:52:00,0,3,0,1,52,1,1,165,0,81,0,⋯,118,6,4.242641,2,36.95,0,0.05,36.95,0.07413,37,36.95,36.95,36.9,0.1,0.07071068


In [6]:
# write.csv(cohort_final, file.path(datadir, 'cohort_final.csv'), row.names = FALSE)

# Save the joined dataset into the feature output directory, without row names
write.csv(
  data_simple,
  file.path(featuredir, "2_4_coh3_simpledata.csv"),
  row.names = FALSE
)

### Final cohort with simple data

In [7]:
# Per-column summary of the final joined dataset
summary(object = data_simple)

   anon_id          pat_enc_csn_id_coded inpatient_data_id_coded
 Length:44258       Min.   :1.311e+11    Min.   :13616753       
 Class :character   1st Qu.:1.312e+11    1st Qu.:19695277       
 Mode  :character   Median :1.312e+11    Median :27823880       
                    Mean   :1.312e+11    Mean   :28054495       
                    3rd Qu.:1.313e+11    3rd Qu.:34349734       
                    Max.   :1.313e+11    Max.   :48026130       
   admit_time                      label            ESI_i      
 Min.   :2015-01-01 08:24:00   Min.   :0.0000   Min.   :1.000  
 1st Qu.:2016-05-11 00:53:00   1st Qu.:0.0000   1st Qu.:2.000  
 Median :2017-12-23 00:30:30   Median :0.0000   Median :3.000  
 Mean   :2017-10-07 06:16:09   Mean   :0.1345   Mean   :2.659  
 3rd Qu.:2019-02-15 04:37:00   3rd Qu.:0.0000   3rd Qu.:3.000  
 Max.   :2020-03-31 07:00:00   Max.   :1.0000   Max.   :5.000  
   delta_ESI           gender            age          insurance     
 Min.   :0.00000   Min.   :0