### Description:
- Read in the decile-binned values of labs and vitals
- Read in the data `coh4_order_code_counts` from sql order counts, combine Microbiology Culture with Microbiology 
- Combine this featurized data for full complex data modeling
- Create a smaller random sample for testing 
    
**Inputs**: 
- `6_10_coh4_binned_labs_vitals_train`: from feature_values, binned into deciles, 2015 - 03/2020
- `6_10_coh4_binned_labs_vitals_test`: 2015 - 03/2020 + 04/2020 - 2021
- `coh4_order_code_counts` and `coh4_order_code_counts_2021`: (from SQL) queried counts for different orders
    
**Outputs**: 
- have demo, vitals and labs in binned counts, order counts: long format, with year extracted
- `6_11_coh4_all_features_all_long_year.csv` (written to the `6_validation` output folder): will be input to 3_models, sparse_matrix.py
    - 2015 - 2018 as trained data for value distributions to bin validation data of 2018
        - used for training and select model hyperparameters based on validation data
    - test data 2019 - 03/2020 left unused
    - after training and selecting hyperparameters, re-do value distribution on 2015 - 03/2020 (train + val) data
    - test data (04/2020 - 2021) are now binned based on this new train/val distribution
    - binned test data is used for the final prediction and evaluation of new model after THICK DESCRIPTION work


In [1]:
# Packages used by this notebook. tidyverse supplies the dplyr/stringr
# pipelines used below; lubridate supplies year() for extracting the admission year.
library(data.table)
library(tidyverse)
library(lubridate)
# Packages below are kept for reference but are not loaded.
# library(Matrix)
# library(slam)
# library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Raise the notebook display limits for tables (rows / columns shown).
options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mtranspose()[39m masks [34mdata.table[39m::transpose(

In [1]:
# install.packages(c("dbplyr", "RSQLite"))

Installing packages into ‘/home/jupyter/.R/library’
(as ‘lib’ is unspecified)

also installing the dependency ‘plogr’




In [2]:
# Folders (relative to this notebook) that the cells below read from and write to.
datadir <- "../../DataTD"                 # order counts queried for the original cohort
datadir6 <- "../../DataTD/validation"     # order counts queried for the validation cohort
valdir <- "../../OutputTD/6_validation"   # validation-stage inputs and this notebook's output
cohortdir <- "../../OutputTD/1_cohort"    # original cohort definition
featuredir <- "../../OutputTD/2_features" # original cohort feature files

In [13]:
# Load the three cohort tables: the original cohort, the new cohort,
# and the combined table (old + new, distinct csn).
cohort4_0 <- read.csv(file.path(cohortdir, "1_4_cohort.csv"))
cohort4_6 <- read.csv(file.path(valdir, "6_7_cohort4.csv"))
cohort4 <- read.csv(file.path(valdir, "6_7_cohort4_all.csv"))

# Row counts: original, new, combined.
nrow(cohort4_0)
nrow(cohort4_6)
nrow(cohort4)

In [15]:
# Encounter ids found in one cohort table but not the other (old vs new).
length(setdiff(cohort4_0[["pat_enc_csn_id_coded"]], cohort4_6[["pat_enc_csn_id_coded"]]))
length(setdiff(cohort4_6[["pat_enc_csn_id_coded"]], cohort4_0[["pat_enc_csn_id_coded"]]))

# Ids from either cohort that are missing from the combined table,
# followed by the number of distinct encounters in the combined table.
length(setdiff(cohort4_0[["pat_enc_csn_id_coded"]], cohort4[["pat_enc_csn_id_coded"]]))
length(setdiff(cohort4_6[["pat_enc_csn_id_coded"]], cohort4[["pat_enc_csn_id_coded"]]))
n_distinct(cohort4[["pat_enc_csn_id_coded"]])

In [3]:
# Order counts that were queried using the original cohort (1_4_cohort).
orders0 <- read.csv(
  file.path(datadir, "coh4_order_code_counts.csv")
)
nrow(orders0) # 6086852 in the recorded run

In [16]:
# Order counts that were queried using the validation cohort (cohort4_validation).
orders6 <- read.csv(
  file.path(datadir6, "coh4_order_code_counts_2021.csv")
)
nrow(orders6) # 2792909 in the recorded run

In [26]:
# The two order extracts are expected to share no encounter ids (csn).
# Distinct encounters in each extract:
n_distinct(orders6[["pat_enc_csn_id_coded"]])
n_distinct(orders0[["pat_enc_csn_id_coded"]])

# Encounter ids exclusive to each extract.
# NOTE(review): the original trailing note here ("10 patients did not have any
# order") appears to describe the cohort-vs-orders comparison, not this one.
length(setdiff(orders6[["pat_enc_csn_id_coded"]], orders0[["pat_enc_csn_id_coded"]]))
length(setdiff(orders0[["pat_enc_csn_id_coded"]], orders6[["pat_enc_csn_id_coded"]]))

# Peek at the first row of each extract to confirm matching columns.
head(orders6, n = 1)
head(orders0, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC641497,131284412456,2020-04-01 17:34:00+00:00,Diagnosis,J10.00,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,R19.7,1


In [18]:
# Rows per feature_type, most frequent first: original extract, then validation extract.
orders0 %>%
  group_by(feature_type) %>%
  count(sort = TRUE)
orders6 %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2834516
Meds,1868195
Lab,993484
Imaging,299944
Procedures,51696
Microbiology Culture,37374
Microbiology,1643


feature_type,n
<chr>,<int>
Diagnosis,1430950
Meds,771612
Lab,434245
Imaging,120143
Procedures,22881
Microbiology Culture,12617
Microbiology,461


In [19]:
# Relabel every feature_type containing "Microbiology" (e.g. "Microbiology Culture")
# as plain "Microbiology", then re-count rows per feature_type.

# Original cohort extract.
orders0_micro <- orders0 %>%
  mutate(
    feature_type = ifelse(
      str_detect(feature_type, "Microbiology"),
      "Microbiology",
      feature_type
    )
  )
orders0_micro %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

# Validation cohort extract.
orders6_micro <- orders6 %>%
  mutate(
    feature_type = ifelse(
      str_detect(feature_type, "Microbiology"),
      "Microbiology",
      feature_type
    )
  )
orders6_micro %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2834516
Meds,1868195
Lab,993484
Imaging,299944
Procedures,51696
Microbiology,39017


feature_type,n
<chr>,<int>
Diagnosis,1430950
Meds,771612
Lab,434245
Imaging,120143
Procedures,22881
Microbiology,13078


In [20]:
# Mean and standard deviation of the count values per feature_type,
# for the original and the validation extract respectively.
orders0_micro %>%
  group_by(feature_type) %>%
  summarise(
    avg = mean(values),
    sd = sd(values)
  )
orders6_micro %>%
  group_by(feature_type) %>%
  summarise(
    avg = mean(values),
    sd = sd(values)
  )

feature_type,avg,sd
<chr>,<dbl>,<dbl>
Diagnosis,1.0,0.0
Imaging,3.234977,5.090007
Lab,7.360603,16.858606
Meds,2.940662,6.372316
Microbiology,2.281236,0.782586
Procedures,2.122195,4.988535


feature_type,avg,sd
<chr>,<dbl>,<dbl>
Diagnosis,1.0,0.0
Imaging,3.432143,6.1836564
Lab,7.15404,17.0763647
Meds,3.112441,6.3900902
Microbiology,2.307081,0.7750193
Procedures,2.005725,4.0376411


In [21]:
# Distinct encounters with at least one order row in each extract.
n_distinct(orders0_micro[["pat_enc_csn_id_coded"]]) # 43970
n_distinct(orders6_micro[["pat_enc_csn_id_coded"]]) # 16484

In [22]:
# Ids in the original order extract that are not in the original cohort (0 recorded).
setdiff(orders0_micro[["pat_enc_csn_id_coded"]], cohort4_0[["pat_enc_csn_id_coded"]]) # 0
# Ids in the original cohort that have no order rows at all (10 recorded).
setdiff(cohort4_0[["pat_enc_csn_id_coded"]], orders0_micro[["pat_enc_csn_id_coded"]]) # 10
# So not every encounter in the original cohort has orders;
# per the author's note, every encounter in the new cohort does.

In [24]:
# First row of each relabelled order table (column layout check).
head(orders6_micro, n = 1)
head(orders0_micro, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC641497,131284412456,2020-04-01 17:34:00+00:00,Diagnosis,J10.00,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,R19.7,1


In [27]:
# Stack the original and validation order tables into one.
# Expected encounter count: the joined cohort has 60464 encounters and 10 of
# the original cohort had no orders, so 60464 - 10 = 60454.
orders_micro <- bind_rows(
  orders0_micro,
  orders6_micro
) # %>% distinct(pat_enc_csn_id_coded)
nrow(orders_micro) # 8917512
n_distinct(orders_micro[["pat_enc_csn_id_coded"]])

In [30]:
# setdiff(A, B) = elements that are in A but not in B
# Ids with orders that are missing from the combined cohort (0 recorded).
setdiff(orders_micro[["pat_enc_csn_id_coded"]], cohort4[["pat_enc_csn_id_coded"]]) # 0
# Ids in the combined cohort without any order rows (10 recorded).
setdiff(cohort4[["pat_enc_csn_id_coded"]], orders_micro[["pat_enc_csn_id_coded"]]) # 10
# Those 10 are the same 10 original-cohort encounters that had no orders.

### Get the demographic features from each cohort, then combine

#### Run the old cohort

In [31]:
# Original cohort: load the long-format feature values and inspect them.
values0 <- read.csv(
  file.path(featuredir, "2_7_coh4_feature_values.csv")
)
nrow(values0) # coh4 3085046
n_distinct(values0[["pat_enc_csn_id_coded"]]) # 43980
values0 %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
demo,747660
labs,1358669
vitals,978717


In [32]:
# Original cohort: keep only the demographic rows and the six shared columns.
demos0 <- values0 %>%
  filter(feature_type == "demo") %>%
  select(
    anon_id, pat_enc_csn_id_coded, admit_time,
    feature_type, features, values
  )
nrow(demos0) # coh4 747660
n_distinct(demos0[["pat_enc_csn_id_coded"]]) # 43980

#### Run the new cohort

In [33]:
# New cohort: load the long-format feature values and inspect them.
values6 <- read.csv(
  file.path(valdir, "6_9_coh4_feature_values.csv")
)
nrow(values6) # coh4 1175680
n_distinct(values6[["pat_enc_csn_id_coded"]]) # 16484
values6 %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
demo,280228
labs,506845
vitals,388607


In [35]:
# New cohort: keep only the demographic rows and the six shared columns.
demos6 <- values6 %>%
  filter(feature_type == "demo") %>%
  select(
    anon_id, pat_enc_csn_id_coded, admit_time,
    feature_type, features, values
  )
nrow(demos6) # coh4 280228 (old cohort 747660)
n_distinct(demos6[["pat_enc_csn_id_coded"]]) # 16484 (old cohort 43980)

In [39]:
# setdiff(A, B) = elements that are in A but not in B.
# Sanity check: the old and new demographic tables should not overlap.
length(setdiff(demos6[["pat_enc_csn_id_coded"]], demos0[["pat_enc_csn_id_coded"]]))
length(setdiff(demos0[["pat_enc_csn_id_coded"]], demos6[["pat_enc_csn_id_coded"]]))

# New-cohort demographics against the combined cohort, in both directions.
length(setdiff(demos6[["pat_enc_csn_id_coded"]], cohort4[["pat_enc_csn_id_coded"]]))
length(setdiff(cohort4[["pat_enc_csn_id_coded"]], demos6[["pat_enc_csn_id_coded"]]))

# Old-cohort demographics against the combined cohort, in both directions.
length(setdiff(demos0[["pat_enc_csn_id_coded"]], cohort4[["pat_enc_csn_id_coded"]]))
length(setdiff(cohort4[["pat_enc_csn_id_coded"]], demos0[["pat_enc_csn_id_coded"]]))

In [56]:
# Combine the demographic rows of the old and new cohorts.
# The table is built once; the de-duplicated row count is reported only as a
# check (it should equal the plain row count when there are no duplicate rows),
# and `demos` itself keeps every row from both inputs.
demos <- bind_rows(demos0, demos6)
nrow(distinct(demos)) # 1027888 -- rows after dropping exact duplicates
nrow(demos)           # 1027888 -- rows kept in `demos`

# Sanity check: the demographics should cover exactly the combined cohort.
length(unique(demos$pat_enc_csn_id_coded))   # 60464
length(unique(cohort4$pat_enc_csn_id_coded)) # 60464

# Encounter ids missing on either side (both lengths should be 0).
length(setdiff(cohort4$pat_enc_csn_id_coded, demos$pat_enc_csn_id_coded))
length(setdiff(demos$pat_enc_csn_id_coded, cohort4$pat_enc_csn_id_coded))

In [59]:
# Row counts of the combined demographics, by feature_type and by feature name.
demos %>%
  group_by(feature_type) %>%
  count()
demos %>%
  group_by(features) %>%
  count()

feature_type,n
<chr>,<int>
demo,1027888


features,n
<chr>,<int>
age,60464
delta_ESI,60464
delta_H,60464
delta_W,60464
English,60464
ESI_i,60464
gender,60464
Height_i,60464
insurance,60464
race.Asian,60464


### Check the featurized binned labs_vitals
- Keep them both (`..._train` and `..._test`) in the same dataframe
- Run `2_7_feature_values`, `2_8_binned_labs_vitals_train`, and `2_8_binned_labs_vitals_test` 
- Rerun the old cohort and new cohort separately, then combine the demos and order counts features
- The featurized labs and vitals are already combined in the previous notebook 6.10

In [44]:
# Train-binned labs and vitals produced for the original cohort (file 2_8).
trainbinned_labs_vitals0 <- read.csv(
  file.path(featuredir, "2_8_coh4_binned_labs_vitals_train.csv")
)
nrow(trainbinned_labs_vitals0) # coh4: old 1825210
n_distinct(trainbinned_labs_vitals0[["pat_enc_csn_id_coded"]]) # 43980
trainbinned_labs_vitals0 %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_train,1287050
vitals_train,538160


In [45]:
# Train-binned labs and vitals regenerated with the new cohort (file 6_10).
trainbinned_labs_vitals <- read.csv(
  file.path(valdir, "6_10_coh4_binned_labs_vitals_train.csv")
)
nrow(trainbinned_labs_vitals) # coh4 1826919
n_distinct(trainbinned_labs_vitals[["pat_enc_csn_id_coded"]]) # 43980
trainbinned_labs_vitals %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_train,1287256
vitals_train,539663


In [46]:
# Test-binned labs and vitals produced for the OLD cohort (file 2_8).
testbinned_labs_vitals0 <- read.csv(
  file.path(featuredir, "2_8_coh4_binned_labs_vitals_test.csv")
)
nrow(testbinned_labs_vitals0) # coh4 1826919
n_distinct(testbinned_labs_vitals0[["pat_enc_csn_id_coded"]]) # 43980
testbinned_labs_vitals0 %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_test,1287256
vitals_test,539663


In [47]:
# Test-binned labs and vitals regenerated with the new cohort (file 6_10).
testbinned_labs_vitals <- read.csv(
  file.path(valdir, "6_10_coh4_binned_labs_vitals_test.csv")
)
nrow(testbinned_labs_vitals) # coh4 2534674
n_distinct(testbinned_labs_vitals[["pat_enc_csn_id_coded"]]) # 60463
testbinned_labs_vitals %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_test,1778502
vitals_test,756172


In [48]:
# First row of the train-binned and test-binned tables (column layout check).
head(trainbinned_labs_vitals, n = 1)
head(testbinned_labs_vitals, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALB_3,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1000116,131295313275,2020-09-29 22:45:00+00:00,labs_results_test,ALB_7,1


### Combine all features

In [60]:
# First row of the demographics and combined order tables, to confirm they
# share the same columns before stacking.
head(demos, n = 1)
head(orders_micro, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,R19.7,1


In [61]:
# Stack every feature source into one long table for the complex data set:
# demographics, order counts, train-binned and test-binned labs/vitals.
# All columns of the inputs are kept as they are.
features_all <- bind_rows(
  demos,
  orders_micro,
  trainbinned_labs_vitals,
  testbinned_labs_vitals
)
nrow(features_all) # 14269242
n_distinct(features_all[["pat_enc_csn_id_coded"]]) # 60464
n_distinct(features_all[["features"]]) # 49278
features_all %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,4265466
Meds,2639807
labs_results_test,1778502
Lab,1427729
labs_results_train,1287256
demo,1027888
vitals_test,756172
vitals_train,539663
Imaging,420087
Procedures,74577


In [70]:
# First and last row of the combined feature table.
head(features_all, n = 1)
tail(features_all, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,demo,ESI_i,3


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
14269242,JCec489c,131226296895,2017-05-08 01:56:00+00:00,labs_results_test,eGFR_5,1


In [63]:
# Sanity check: feature names that occur under exactly two feature types.
n2 <- features_all %>%
  distinct(feature_type, features) %>%
  group_by(features) %>%
  count(sort = TRUE) %>%
  filter(n == 2)
head(n2)
tail(n2)

features,n
<chr>,<int>
ALB_0,2
ALB_1,2
ALB_10,2
ALB_2,2
ALB_3,2
ALB_4,2


features,n
<chr>,<int>
WBC_4,2
WBC_5,2
WBC_6,2
WBC_7,2
WBC_8,2
WBC_9,2


In [64]:
# Which feature types contain the feature names collected in n2.
features_all %>%
  filter(features %in% n2[["features"]]) %>%
  distinct(feature_type) # group_by(features, feature_type) %>% count()

feature_type
<chr>
Lab
Microbiology
Procedures
Imaging
Meds
labs_results_train
vitals_train
labs_results_test
vitals_test


In [71]:
# The 30 most frequent (feature_type, features) pairs.
features_all %>%
  group_by(feature_type, features) %>%
  count(sort = TRUE) %>%
  head(n = 30)

feature_type,features,n
<chr>,<chr>,<int>
demo,age,60464
demo,delta_ESI,60464
demo,delta_H,60464
demo,delta_W,60464
demo,English,60464
demo,ESI_i,60464
demo,gender,60464
demo,Height_i,60464
demo,insurance,60464
demo,race.Asian,60464


In [73]:
# Size of the combined table and the distribution of its feature values.
nrow(features_all) # 14269242
summary(features_all$values)
# Count missing feature values. The previous check read `first_label`, which is
# not a column of features_all, so `$` returned NULL and the sum was always 0.
sum(is.na(features_all$values))

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   0.000    1.000    1.000    3.393    2.000 1129.000 

In [75]:
# Add the admission year (taken from admit_time) and count rows per year.
features_all_year <- features_all %>%
  mutate(year = year(admit_time))
features_all_year %>%
  group_by(year) %>%
  count()

year,n
<dbl>,<int>
2015,2017957
2016,1658805
2017,1634688
2018,2182233
2019,2417945
2020,2276618
2021,2080996


In [79]:
# Encounters per admission year in the combined cohort.
cohort4 %>%
  mutate(year = year(admit_time)) %>%
  group_by(year) %>%
  count()

year,n
<dbl>,<int>
2015,8138
2016,7050
2017,6842
2018,9532
2019,10115
2020,9768
2021,9019


In [85]:
# Final check before writing: the feature table and the combined cohort should
# contain the same set of encounter ids.
nrow(features_all)
n_distinct(features_all[["pat_enc_csn_id_coded"]]) # 60464
length(setdiff(features_all[["pat_enc_csn_id_coded"]], cohort4[["pat_enc_csn_id_coded"]]))
length(setdiff(cohort4[["pat_enc_csn_id_coded"]], features_all[["pat_enc_csn_id_coded"]]))

In [84]:
# Earlier output location (original cohort only), kept for reference:
# write.csv(features_all_year, file = file.path(featuredir, "2_9_coh4_features_all_long_year.csv"), row.names=FALSE)
# Save the long-format features with the year column to the validation output folder.
write.csv(
  features_all_year,
  file = file.path(valdir, "6_11_coh4_all_features_all_long_year.csv"),
  row.names = FALSE
)

In [105]:
# Column-by-column summary of the combined feature table.
# NOTE(review): the recorded output below shows 8258975 rows, not the 14269242
# reported above -- presumably from a different run of this cell; confirm.
summary(object = features_all)

   anon_id          pat_enc_csn_id_coded  admit_time        feature_type      
 Length:8258975     Min.   :1.311e+11    Length:8258975     Length:8258975    
 Class :character   1st Qu.:1.312e+11    Class :character   Class :character  
 Mode  :character   Median :1.312e+11    Mode  :character   Mode  :character  
                    Mean   :1.312e+11                                         
                    3rd Qu.:1.313e+11                                         
                    Max.   :1.313e+11                                         
   features             values        
 Length:8258975     Min.   :   0.000  
 Class :character   1st Qu.:   1.000  
 Mode  :character   Median :   1.000  
                    Mean   :   3.721  
                    3rd Qu.:   2.000  
                    Max.   :1078.000  