### Description:
- Read in the decile-binned values of labs and vitals `2_8_binned_labs_vitals`
- Read in the data `order_code_counts` from sql order counts, combine Microbiology Culture with Microbiology 
- Combine this featurized data for full complex data modeling
- Create a smaller random sample for testing 

Inputs: 
- `2_8_binned_labs_vitals`: from feature_values --> binned into deciles
    - `..._train`: used the training set (2015 - 2017) for value distributions
    - `..._test`: used the training and validation set (2015 - 2018) for value distributions
- `order_code_counts`: used 1_5_cohort_final, counts for different orders queried from SQL
- `coh4_order_code_counts`: used 1_4_cohort

Outputs: have demo, vitals and labs in binned counts, and order counts, long format
- `2_9_features_all_long_year_train`: (used 1_5_cohort_final)
    - use 2015 - 2017 as trained data for value distributions to bin validation data of 2018
    - this is used to train the model and select model hyperparameters on the validation data
    - test data > 2018 (2019 and some 2020) left unused
- `2_9_features_all_long_year`: (used 1_5_cohort_final)
    - after model hyper parameters were already selected, the value distributions are done again on data 2015 - 2018
    - test data > 2018 (2019 and some 2020) are binned based on these distributions
    - binned test data is used for the final prediction and evaluation of model performance
- `..._coh4_...`: used 1_4_cohort

In [1]:
library(data.table)
library(tidyverse)
library(lubridate)
# library(Matrix)
# library(slam)
# library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Notebook display limits for data frames: up to 200 rows and 30 columns.
options(
  repr.matrix.max.rows = 200,
  repr.matrix.max.cols = 30
)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.3     ✔ purrr   0.3.4
✔ tibble  3.1.2     ✔ dplyr   1.0.6
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()   masks data.table::between()
✖ dplyr::filter()    masks stats::filter()
✖ dplyr::first()     masks data.table::first()
✖ dplyr::lag()       masks stats::lag()
✖ dplyr::last()      masks data.table::last()
✖ purrr::transpose() masks data.table::transpose()

In [1]:
# install.packages(c("dbplyr", "RSQLite"))

Installing packages into ‘/home/jupyter/.R/library’
(as ‘lib’ is unspecified)

also installing the dependency ‘plogr’




In [2]:
# Relative paths to the input data, the cohort tables and the feature tables.
datadir <- "../../DataTD"
cohortdir <- "../../OutputTD/1_cohort"
featuredir <- "../../OutputTD/2_features"

In [21]:
# Load the cohort table. This run uses 1_4_cohort; the commented-out line
# below is the 1_5_cohort_final alternative.
# cohort <- read.csv(file.path(cohortdir,  '1_5_cohort_final.csv'))
cohort <- cohortdir %>%
  file.path("1_4_cohort.csv") %>%
  read.csv()
nrow(cohort)

# Number of encounters per admission-year split. Rows whose year is NA are
# not counted, matching what filter() would keep.
with(cohort, sum(year(admit_time) < 2018, na.rm = TRUE)) # 20762
with(cohort, sum(year(admit_time) == 2018, na.rm = TRUE)) # 8929
with(cohort, sum(year(admit_time) > 2018, na.rm = TRUE)) # 11675

with(cohort, sum(year(admit_time) < 2019, na.rm = TRUE)) # 29691

In [22]:
# Load the order-code counts file for cohort 4 from the data directory.
orders <- datadir %>%
  file.path("coh4_order_code_counts.csv") %>%
  read.csv()
nrow(orders) #6086852

In [23]:
# Preview the first rows of the order counts.
orders %>% head()

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,R19.7,1
2,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,R40.4,1
3,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,Z23,1
4,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Meds,"EPINEPHRINE 1 MG/ML (1:1,000) (1ML) INJ SOLN",1
5,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,J81.0,1
6,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Meds,CEFTRIAXONE 1 GRAM/50 ML MINI-BAG PLUS,1


In [24]:
# Number of rows per feature type, largest first.
orders %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2834516
Meds,1868195
Lab,993484
Imaging,299944
Procedures,51696
Microbiology Culture,37374
Microbiology,1643


In [25]:
# Merge "Microbiology Culture" into "Microbiology": every feature type whose
# name contains "Microbiology" is relabelled; all others are kept as they are.
orders_micro <- orders %>%
  mutate(
    feature_type = if_else(
      str_detect(feature_type, "Microbiology"),
      "Microbiology",
      feature_type
    )
  )

# Re-tabulate rows per feature type to confirm the merge.
orders_micro %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2834516
Meds,1868195
Lab,993484
Imaging,299944
Procedures,51696
Microbiology,39017


In [26]:
# Number of distinct encounters that have at least one order row.
n_distinct(orders_micro$pat_enc_csn_id_coded) # 43970 there are 10 people who had no orders

In [31]:
# Encounter IDs found in one table but not the other, in both directions.
orders_micro$pat_enc_csn_id_coded %>%
  setdiff(cohort$pat_enc_csn_id_coded) # 0
cohort$pat_enc_csn_id_coded %>%
  setdiff(orders_micro$pat_enc_csn_id_coded) # 10

### Check the featurized binned labs_vitals and combine features
- Separate the trainbin and testbin for modeling

In [3]:
# Load the raw (un-binned) feature values and inspect their size, the number
# of distinct encounters and the number of rows per feature type.
values <- featuredir %>%
  file.path("2_7_coh4_feature_values.csv") %>%
  read.csv()
nrow(values) # 3012942
n_distinct(values$pat_enc_csn_id_coded)
values %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
demo,747660
labs,1358669
vitals,978717
,2614


In [33]:
# Subset the raw feature values by admission year: 2015-2017 for `trainbin`,
# 2015-2018 for `testbin`.
trainbin <- values %>%
  filter(year(admit_time) %in% 2015:2017)
testbin <- values %>%
  filter(year(admit_time) %in% 2015:2018)

# Non-demographic row counts, to check against 2.8_featurize_vitals_labs.
trainbin %>% filter(feature_type != "demo") %>% nrow() #1240798 (check against 2.8_featurize_vitals_labs)
testbin %>% filter(feature_type != "demo") %>% nrow() #1694667 (check against 2.8_featurize_vitals_labs)

In [34]:
# Load the `_train` file of binned labs and vitals, then report its size, the
# number of distinct encounters and the rows per feature type.
trainbinned_labs_vitals <- featuredir %>%
  file.path("2_8_coh4_binned_labs_vitals_train.csv") %>%
  read.csv()
nrow(trainbinned_labs_vitals) # 1825075
n_distinct(trainbinned_labs_vitals$pat_enc_csn_id_coded)
trainbinned_labs_vitals %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_train,1304104
vitals_train,541940


In [35]:
# Load the `_test` file of binned labs and vitals, then report its size, the
# number of distinct encounters and the rows per feature type.
testbinned_labs_vitals <- featuredir %>%
  file.path("2_8_coh4_binned_labs_vitals_test.csv") %>%
  read.csv()
nrow(testbinned_labs_vitals) # 1827530
n_distinct(testbinned_labs_vitals$pat_enc_csn_id_coded)
testbinned_labs_vitals %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_test,1304320
vitals_test,545118


In [36]:
# Preview the first rows of both binned tables.
trainbinned_labs_vitals %>% head()
testbinned_labs_vitals %>% head()

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALB_3,1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALK_7,1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALT_0,1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AST_1,1
5,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AnionGap_9,1
6,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,BUN_8,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_test,ALB_3,1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_test,ALK_7,1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_test,ALT_0,1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_test,AST_1,1
5,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_test,AnionGap_9,1
6,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_test,BUN_8,1


In [37]:
# Keep only the demographic rows of `values`, restricted to the six columns
# that are selected below.
demos <- values %>%
  filter(feature_type == "demo") %>%
  select(
    anon_id,
    pat_enc_csn_id_coded,
    admit_time,
    feature_type,
    features,
    values
  )
nrow(demos)
n_distinct(demos$pat_enc_csn_id_coded)

In [39]:
# Compare encounter IDs between the binned labs/vitals and the order counts,
# in both directions. The cohorts should match, so each count should be 0.
trainbinned_labs_vitals$pat_enc_csn_id_coded %>%
  setdiff(orders_micro$pat_enc_csn_id_coded) %>%
  length()
orders_micro$pat_enc_csn_id_coded %>%
  setdiff(trainbinned_labs_vitals$pat_enc_csn_id_coded) %>%
  length()

testbinned_labs_vitals$pat_enc_csn_id_coded %>%
  setdiff(orders_micro$pat_enc_csn_id_coded) %>%
  length()
orders_micro$pat_enc_csn_id_coded %>%
  setdiff(testbinned_labs_vitals$pat_enc_csn_id_coded) %>%
  length()

In [40]:
# Stack demographics, train-binned labs/vitals, test-binned labs/vitals and
# order counts into one long table for the complex data set. No column is
# dropped here: `demos` keeps `admit_time` like the other inputs.
features_all <- list(
  demos,
  trainbinned_labs_vitals,
  testbinned_labs_vitals,
  orders_micro
) %>%
  bind_rows()
nrow(features_all) # 10086505 (this include both train and test)
n_distinct(features_all$pat_enc_csn_id_coded)
n_distinct(features_all$features) #42201 vs 42207 (train vs testbin), actually 42234 is the correct number of features (as some have 2 feature_types)
features_all %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2834516
Meds,1868195
labs_results_test,1304320
labs_results_train,1304104
Lab,993484
demo,747660
vitals_test,545118
vitals_train,541940
Imaging,299944
Procedures,51696


In [41]:
# Preview the first rows of the combined feature table.
features_all %>% head()

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3
2,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,demo,ESI_i,3
3,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,demo,ESI_i,2
4,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,demo,ESI_i,3
5,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,demo,ESI_i,3
6,JC29f8bef,131280937356,2019-11-30 10:35:00+00:00,demo,ESI_i,3


In [30]:
# Feature names that occur under exactly two distinct feature types.
n2 <- features_all %>%
  select(feature_type, features) %>%
  distinct() %>%
  group_by(features) %>%
  count(sort = TRUE) %>%
  filter(n == 2)
n2 %>% head()
n2 %>% tail()

features,n
<chr>,<int>
ALB_0,2
ALB_1,2
ALB_10,2
ALB_2,2
ALB_3,2
ALB_4,2


features,n
<chr>,<int>
WBC_4,2
WBC_5,2
WBC_6,2
WBC_7,2
WBC_8,2
WBC_9,2


In [29]:
# For the feature names listed in `n2`, count rows per (feature, feature type).
features_all %>%
  filter(features %in% n2$features) %>%
  group_by(features, feature_type) %>%
  count()

features,feature_type,n
<chr>,<chr>,<int>
ALB_0,labs_results_test,2717
ALB_0,labs_results_train,2717
ALB_1,labs_results_test,3218
ALB_1,labs_results_train,3218
ALB_10,labs_results_test,1
ALB_10,labs_results_train,2
ALB_2,labs_results_test,3593
ALB_2,labs_results_train,2229
ALB_3,labs_results_test,3226
ALB_3,labs_results_train,4585


In [31]:
# Row count for every (feature type, feature) pair, largest first.
features_all %>%
  group_by(feature_type, features) %>%
  count(sort = TRUE)

feature_type,features,n
<chr>,<chr>,<int>
demo,age,41366
demo,delta_ESI,41366
demo,delta_H,41366
demo,delta_W,41366
demo,English,41366
demo,ESI_i,41366
demo,gender,41366
demo,Height_i,41366
demo,insurance,41366
demo,race.Asian,41366


In [42]:
# Preview the first rows of the combined feature table.
features_all %>% head()

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3
2,JC29f8ad3,131278291027,2019-10-05 23:48:00+00:00,demo,ESI_i,3
3,JC29f8b9c,131266787806,2019-05-05 01:07:00+00:00,demo,ESI_i,2
4,JC29f8beb,131264387263,2019-03-15 03:35:00+00:00,demo,ESI_i,3
5,JC29f8beb,131279241689,2019-11-27 15:29:00+00:00,demo,ESI_i,3
6,JC29f8bef,131280937356,2019-11-30 10:35:00+00:00,demo,ESI_i,3


In [43]:
# Size of the combined table and the distribution of its `values` column.
features_all %>% nrow()
features_all$values %>% summary()
# sum(is.na(features_all$first_label))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    1.00    1.00    3.33    2.00 1078.00 

In [44]:
# Add a `year` column holding the calendar year of `admit_time`.
features_all_year <- mutate(features_all, year = year(admit_time))

In [45]:
# Number of feature rows per admission year.
features_all_year %>%
  group_by(year) %>%
  count()

year,n
<dbl>,<int>
2015,2037462
2016,1669123
2017,1638834
2018,2185662
2019,2421534
2020,577379


In [46]:
# Write the combined long-format features, including `year`, to the feature
# directory without row names.
write.csv(
  features_all_year,
  file = file.path(featuredir, "2_9_coh4_features_all_long_year.csv"),
  row.names = FALSE
)

### TEST - OLD 
with 2_9_features_all_long.csv did not contain testbins with Conor's code, in SQL

In [106]:
# Read a previously written features file, drop its `year` column and compare
# it with the in-memory `features_all` (row counts, then exact and near
# equality).
yr <- featuredir %>%
  file.path("2_9_features_all_long_year.csv") %>%
  read.csv()
yr0 <- select(yr, -year)
nrow(yr0)
nrow(features_all)
identical(yr0, features_all)
all.equal(yr0, features_all)

In [108]:
# Sort both tables by every column so a comparison does not depend on row order.
yr0 <- arrange(
  yr0,
  anon_id, pat_enc_csn_id_coded, admit_time, feature_type, features, values
)
ft <- arrange(
  features_all,
  anon_id, pat_enc_csn_id_coded, admit_time, feature_type, features, values
)

In [109]:
# Compare the two sorted tables: exact equality first, then all.equal().
identical(x = yr0, y = ft)
all.equal(target = yr0, current = ft)

In [100]:
# Inner-join the file contents with `features_all` on all shared columns
# (no `by` is given, so dplyr picks the common column names itself).
joined <- yr %>% inner_join(features_all)

Joining, by = c("anon_id", "pat_enc_csn_id_coded", "admit_time", "feature_type", "features", "values")



In [101]:
# Column-wise summary of the joined table.
joined %>% summary()

   anon_id          pat_enc_csn_id_coded  admit_time        feature_type      
 Length:8258975     Min.   :1.311e+11    Length:8258975     Length:8258975    
 Class :character   1st Qu.:1.312e+11    Class :character   Class :character  
 Mode  :character   Median :1.312e+11    Mode  :character   Mode  :character  
                    Mean   :1.312e+11                                         
                    3rd Qu.:1.313e+11                                         
                    Max.   :1.313e+11                                         
   features             values              year     
 Length:8258975     Min.   :   0.000   Min.   :2015  
 Class :character   1st Qu.:   1.000   1st Qu.:2016  
 Mode  :character   Median :   1.000   Median :2017  
                    Mean   :   3.721   Mean   :2017  
                    3rd Qu.:   2.000   3rd Qu.:2019  
                    Max.   :1078.000   Max.   :2020  

In [104]:
# Column-wise summary of the table read back from file.
yr %>% summary()

   anon_id          pat_enc_csn_id_coded  admit_time        feature_type      
 Length:8258975     Min.   :1.311e+11    Length:8258975     Length:8258975    
 Class :character   1st Qu.:1.312e+11    Class :character   Class :character  
 Mode  :character   Median :1.312e+11    Mode  :character   Mode  :character  
                    Mean   :1.312e+11                                         
                    3rd Qu.:1.313e+11                                         
                    Max.   :1.313e+11                                         
   features             values              year     
 Length:8258975     Min.   :   0.000   Min.   :2015  
 Class :character   1st Qu.:   1.000   1st Qu.:2016  
 Mode  :character   Median :   1.000   Median :2017  
                    Mean   :   3.721   Mean   :2017  
                    3rd Qu.:   2.000   3rd Qu.:2019  
                    Max.   :1078.000   Max.   :2020  

In [105]:
# Column-wise summary of the in-memory combined feature table.
features_all %>% summary()

   anon_id          pat_enc_csn_id_coded  admit_time        feature_type      
 Length:8258975     Min.   :1.311e+11    Length:8258975     Length:8258975    
 Class :character   1st Qu.:1.312e+11    Class :character   Class :character  
 Mode  :character   Median :1.312e+11    Mode  :character   Mode  :character  
                    Mean   :1.312e+11                                         
                    3rd Qu.:1.313e+11                                         
                    Max.   :1.313e+11                                         
   features             values        
 Length:8258975     Min.   :   0.000  
 Class :character   1st Qu.:   1.000  
 Mode  :character   Median :   1.000  
                    Mean   :   3.721  
                    3rd Qu.:   2.000  
                    Max.   :1078.000  