### Description:
- Read in the decile-binned values of labs and vitals `2_8_binned_labs_vitals`
- Read in the data `order_code_counts` from sql order counts, combine Microbiology Culture with Microbiology 
- Combine this featurized data for full complex data modeling
- Create a smaller random sample for testing 

Inputs: 
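- `2_8_binned_labs_vitals`: lab and vital values from feature_values, binned into deciles; the decile cut points were formed using the training set (2015 - 2017)
- `2_7_feature_values`: un-binned feature values; the demographic (`demo`) rows are taken from here
- `order_code_counts`: counts of the different order types, queried via SQL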

Outputs:
- `2_9_features_all_long`: long-format table containing demographics, decile-binned vitals and labs (as counts), and order counts

In [1]:
library(data.table)
library(tidyverse)
library(lubridate)
# library(Matrix)
# library(slam)
# library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mtranspose()[39m masks [34mdata.table[39m::transpose(

In [1]:
# install.packages(c("dbplyr", "RSQLite"))

Installing packages into ‘/home/jupyter/.R/library’
(as ‘lib’ is unspecified)

also installing the dependency ‘plogr’




In [2]:
# Relative locations of the raw data and of the cohort / feature outputs.
datadir <- file.path("..", "..", "DataTD")
cohortdir <- file.path("..", "..", "OutputTD", "1_cohort")
featuredir <- file.path("..", "..", "OutputTD", "2_features")

In [58]:
# Load the final cohort and report its size, overall and split by admission
# year (before 2018, in 2018, after 2018).
cohort <- read.csv(file.path(cohortdir, "1_5_cohort_final.csv"))
nrow(cohort)

cohort %>% filter(year(admit_time) < 2018) %>% nrow() # 20762
cohort %>% filter(year(admit_time) == 2018) %>% nrow() # 8929
cohort %>% filter(year(admit_time) > 2018) %>% nrow() # 11675

In [3]:
# Load the SQL-derived order counts (long format with feature_type, features
# and values columns) and report the number of rows.
orders <- read.csv(file.path(datadir, 'order_code_counts.csv'))
nrow(orders) #5730678

In [4]:
# Preview the first rows of the order counts table.
head(orders)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I95.9,1
2,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,F41.1,1
3,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I12.9,1
4,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I99.8,1
5,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Lab,"MAGNESIUM, SERUM/PLASMA",1
6,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,T45.515A,1


In [5]:
# Number of rows per feature_type, largest first.
orders %>% group_by(feature_type) %>% count(sort=TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2680945
Meds,1747295
Lab,932873
Imaging,282646
Procedures,48863
Microbiology Culture,36447
Microbiology,1609


In [6]:
# Fold every feature_type containing "Microbiology" (e.g. "Microbiology
# Culture") into the single label "Microbiology", then re-count rows per type.
orders_micro <- orders %>%
  mutate(
    feature_type = if_else(
      str_detect(feature_type, "Microbiology"),
      "Microbiology",
      feature_type
    )
  )
orders_micro %>% group_by(feature_type) %>% count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2680945
Meds,1747295
Lab,932873
Imaging,282646
Procedures,48863
Microbiology,38056


In [7]:
# Number of distinct encounter ids (pat_enc_csn_id_coded) in the order counts.
length(unique(orders_micro$pat_enc_csn_id_coded))

### Check the featurized binned labs_vitals and combine all features

In [16]:
# check all features with values: load the un-binned feature values, then
# report the row count, the number of distinct encounters and the rows per
# feature_type
values <- read.csv(file.path(featuredir, '2_7_feature_values.csv'))
nrow(values) # 3012942
length(unique(values$pat_enc_csn_id_coded))
values %>% group_by(feature_type) %>% count()

feature_type,n
<chr>,<int>
demo,703222
labs,1358669
vitals,951051


In [56]:
# Training-set check: keep admissions from 2015-2017 and count the
# non-demographic rows.
train <- filter(values, year(admit_time) %in% c(2015, 2016, 2017))
train %>% filter(feature_type != "demo") %>% nrow() # 1240798 (check against 2.8_featurize_vitals_labs)

In [15]:
# read in binned labs and vitals, then report the row count, the number of
# distinct encounters and the rows per feature_type
binned_labs_vitals <- read.csv(file.path(featuredir, '2_8_binned_labs_vitals.csv'))
nrow(binned_labs_vitals) # 1825075
length(unique(binned_labs_vitals$pat_enc_csn_id_coded))
binned_labs_vitals %>% group_by(feature_type) %>% count()

feature_type,n
<chr>,<int>
labs_results_train,1304104
vitals_train,520971


In [9]:
# Preview the binned table; feature names look like <lab>_<bin>, e.g. ALB_3.
head(binned_labs_vitals)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALB_3,1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALK_7,1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALT_0,1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AST_1,1
5,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AnionGap_9,1
6,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,BUN_8,1


In [71]:
# Demographic features: the 'demo' rows of `values`, restricted to the six
# columns used by the combined long table.
demos <- values %>%
  filter(feature_type == "demo") %>%
  select(all_of(c(
    "anon_id", "pat_enc_csn_id_coded", "admit_time",
    "feature_type", "features", "values"
  )))
nrow(demos)
length(unique(demos$pat_enc_csn_id_coded))

In [21]:
# Sanity check: the encounters present in the binned labs/vitals and in the
# order counts should be the same, so both set differences are expected to be
# empty.
setdiff(binned_labs_vitals$pat_enc_csn_id_coded, orders_micro$pat_enc_csn_id_coded)
setdiff(orders_micro$pat_enc_csn_id_coded, binned_labs_vitals$pat_enc_csn_id_coded)

In [79]:
# Combine all features for the complex data set by stacking demographics,
# binned labs/vitals and order counts. No column is dropped here: all three
# inputs carry anon_id, pat_enc_csn_id_coded, admit_time, feature_type,
# features and values.
features_all <- bind_rows(demos, binned_labs_vitals, orders_micro)
# Expected total: 703222 (demo) + 1825075 (binned labs/vitals) + 5730678 (orders)
# = 8258975. The earlier note of 7555753 equals that total minus the demo rows.
nrow(features_all) # 8258975
length(unique(features_all$pat_enc_csn_id_coded))
# 42201 unique feature names; per the original note the correct feature count is
# 42234, because some names occur under two feature_types.
length(unique(features_all$features))
features_all %>% group_by(feature_type) %>% count(sort=TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2680945
Meds,1747295
labs_results_train,1304104
Lab,932873
demo,703222
vitals_train,520971
Imaging,282646
Procedures,48863
Microbiology,38056


In [90]:
# Preview the combined long table.
head(features_all)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3
2,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,demo,ESI_i,3
3,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,demo,ESI_i,2
4,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,demo,ESI_i,3
5,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,demo,ESI_i,3
6,JC29f8bef,131280937356,2019-11-30 10:35:00+00:00,demo,ESI_i,3


In [81]:
# Feature names that occur under exactly two different feature_types.
n2 <- features_all %>%
  distinct(feature_type, features) %>%
  group_by(features) %>%
  count(sort = TRUE) %>%
  filter(n == 2)
n2

features,n
<chr>,<int>
ARTERIAL BLOOD GAS,2
ASPERGILLUS GALACTOMANNAN,2
BETA STREP THROAT SCREEN RAPID EIA,2
BORDETELLA PERTUSSIS PCR,2
CRYPTOSPORIDIUM EIA,2
"CULTURE, THROAT",2
HELICOBACTER PYLORI AB,2
"HELICOBACTER PYLORI AG, STOOL",2
Limited Ultrasound- Abdominal Aorta (Limited Retroperitoneal),2
LIMITED ULTRASOUND- ABDOMINAL AORTA (LIMITED RETROPERITONEAL),2


In [82]:
# For the feature names listed in n2, count rows per (features, feature_type)
# to see how each name splits across its two types.
features_all %>% filter(features %in% n2$features) %>% group_by(features, feature_type) %>% count()

features,feature_type,n
<chr>,<chr>,<int>
ARTERIAL BLOOD GAS,Lab,5348
ARTERIAL BLOOD GAS,Procedures,53
ASPERGILLUS GALACTOMANNAN,Lab,73
ASPERGILLUS GALACTOMANNAN,Microbiology,18
BETA STREP THROAT SCREEN RAPID EIA,Lab,355
BETA STREP THROAT SCREEN RAPID EIA,Microbiology,3
BORDETELLA PERTUSSIS PCR,Lab,41
BORDETELLA PERTUSSIS PCR,Microbiology,27
CRYPTOSPORIDIUM EIA,Lab,1
CRYPTOSPORIDIUM EIA,Microbiology,6


In [83]:
# Row counts for every (feature_type, features) pair, most frequent first.
features_all %>% group_by(feature_type, features) %>% count(sort=TRUE)

feature_type,features,n
<chr>,<chr>,<int>
demo,age,41366
demo,delta_ESI,41366
demo,delta_H,41366
demo,delta_W,41366
demo,English,41366
demo,ESI_i,41366
demo,gender,41366
demo,Height_i,41366
demo,insurance,41366
demo,race.Asian,41366


In [84]:
# Preview the combined long table again.
head(features_all)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3
2,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,demo,ESI_i,3
3,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,demo,ESI_i,2
4,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,demo,ESI_i,3
5,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,demo,ESI_i,3
6,JC29f8bef,131280937356,2019-11-30 10:35:00+00:00,demo,ESI_i,3


In [92]:
# Row count and distribution of the `values` column in the combined table.
nrow(features_all)
summary(features_all$values)
# sum(is.na(features_all$first_label))

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   0.000    1.000    1.000    3.721    2.000 1078.000 

In [93]:
# Save the combined long-format table; row.names=FALSE keeps R's row index out of the CSV.
write.csv(features_all, file = file.path(featuredir, "2_9_features_all_long.csv"), row.names=FALSE) 

### TEST

In [106]:
# Compare features_all with the `_withyear` file, which carries an additional
# `year` column. After dropping that column the two tables are expected to hold
# the same rows.
yr <- read.csv(file.path(featuredir, '2_9_features_all_long_withyear.csv'))
yr0 <- yr %>% select(-year)
nrow(yr0)
nrow(features_all)
# identical() is an exact check (including row order); all.equal() reports
# the differences it finds instead of just FALSE.
identical(yr0, features_all)
all.equal(yr0,features_all)

In [108]:
# Put both tables into the same row order (sorted on every column) so that
# they can be compared independently of their original ordering.
yr0 <- arrange(
  yr0,
  anon_id, pat_enc_csn_id_coded, admit_time, feature_type, features, values
)
ft <- arrange(
  features_all,
  anon_id, pat_enc_csn_id_coded, admit_time, feature_type, features, values
)

In [109]:
# Repeat the equality checks on the row-sorted copies.
identical(yr0, ft)
all.equal(yr0, ft)

In [100]:
# Inner-join the with-year table to features_all on the six columns the two
# tables share. The keys are spelled out so the join cannot silently change if
# either table gains or loses a column, and so dplyr does not have to guess them.
joined <- inner_join(
  yr,
  features_all,
  by = c(
    "anon_id", "pat_enc_csn_id_coded", "admit_time",
    "feature_type", "features", "values"
  )
)

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time", "feature_type", "features", "values")



In [101]:
# Column-wise summary of the joined table.
summary(joined)

   anon_id          pat_enc_csn_id_coded  admit_time        feature_type      
 Length:8258975     Min.   :1.311e+11    Length:8258975     Length:8258975    
 Class :character   1st Qu.:1.312e+11    Class :character   Class :character  
 Mode  :character   Median :1.312e+11    Mode  :character   Mode  :character  
                    Mean   :1.312e+11                                         
                    3rd Qu.:1.313e+11                                         
                    Max.   :1.313e+11                                         
   features             values              year     
 Length:8258975     Min.   :   0.000   Min.   :2015  
 Class :character   1st Qu.:   1.000   1st Qu.:2016  
 Mode  :character   Median :   1.000   Median :2017  
                    Mean   :   3.721   Mean   :2017  
                    3rd Qu.:   2.000   3rd Qu.:2019  
                    Max.   :1078.000   Max.   :2020  

In [104]:
# Column-wise summary of the with-year table, for comparison with `joined`.
summary(yr)

   anon_id          pat_enc_csn_id_coded  admit_time        feature_type      
 Length:8258975     Min.   :1.311e+11    Length:8258975     Length:8258975    
 Class :character   1st Qu.:1.312e+11    Class :character   Class :character  
 Mode  :character   Median :1.312e+11    Mode  :character   Mode  :character  
                    Mean   :1.312e+11                                         
                    3rd Qu.:1.313e+11                                         
                    Max.   :1.313e+11                                         
   features             values              year     
 Length:8258975     Min.   :   0.000   Min.   :2015  
 Class :character   1st Qu.:   1.000   1st Qu.:2016  
 Mode  :character   Median :   1.000   Median :2017  
                    Mean   :   3.721   Mean   :2017  
                    3rd Qu.:   2.000   3rd Qu.:2019  
                    Max.   :1078.000   Max.   :2020  

In [105]:
# Column-wise summary of features_all, for comparison with the two tables above.
summary(features_all)

   anon_id          pat_enc_csn_id_coded  admit_time        feature_type      
 Length:8258975     Min.   :1.311e+11    Length:8258975     Length:8258975    
 Class :character   1st Qu.:1.312e+11    Class :character   Class :character  
 Mode  :character   Median :1.312e+11    Mode  :character   Mode  :character  
                    Mean   :1.312e+11                                         
                    3rd Qu.:1.313e+11                                         
                    Max.   :1.313e+11                                         
   features             values        
 Length:8258975     Min.   :   0.000  
 Class :character   1st Qu.:   1.000  
 Mode  :character   Median :   1.000  
                    Mean   :   3.721  
                    3rd Qu.:   2.000  
                    Max.   :1078.000  