### Description:
- Read in the decile-binned values of labs and vitals `2_8_binned_labs_vitals`
- Read in the data `order_code_counts` from sql order counts, combine Microbiology Culture with Microbiology 
- Combine this featurized data for full complex data modeling
- Create a smaller random sample for testing 

Inputs: 
- `2_8_binned_labs_vitals`: from feature_values --> binned into deciles, which were formed using the training set (2015 - 2017)
- `order_code_counts`: counts for the different order types, queried from SQL

Outputs:
- `2_9_feature_counts`: both inputs combined (binned labs/vitals features stacked with the order-code counts)

In [1]:
library(data.table)
library(tidyverse)
library(lubridate)
# library(Matrix)
# library(slam)
# library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Notebook display settings: raise the repr limits for printed tables
# to at most 200 rows and 30 columns.
options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mtranspose()[39m masks [34mdata.table[39m::transpose(

In [1]:
# install.packages(c("dbplyr", "RSQLite"))

Installing packages into ‘/home/jupyter/.R/library’
(as ‘lib’ is unspecified)

also installing the dependency ‘plogr’




In [2]:
# Relative locations used by the rest of the notebook.
# datadir:    raw inputs (e.g. the SQL order-count extract)
# cohortdir:  cohort outputs from step 1
# featuredir: feature outputs from step 2 (read from and written to)
datadir <- "../../DataTD"
cohortdir <- "../../OutputTD/1_cohort"
featuredir <- "../../OutputTD/2_features"

In [3]:
# Load the final cohort table and show how many rows it has.
cohort <- read.csv(
  file = file.path(cohortdir, '1_5_cohort_final.csv')
)
nrow(cohort)

In [19]:
# Load the per-encounter order counts extracted from SQL and show the row count.
orders <- read.csv(
  file = file.path(datadir, 'order_code_counts.csv')
)
nrow(orders)

In [5]:
# Preview the first six rows of the order counts.
head(orders, n = 6L)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I95.9,1
2,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,F41.1,1
3,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I12.9,1
4,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,I99.8,1
5,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Lab,"MAGNESIUM, SERUM/PLASMA",1
6,JCe78a06,131062667066,2015-01-02 01:01:00+00:00,Diagnosis,T45.515A,1


In [16]:
# Row count per feature_type, largest first.
count(group_by(orders, feature_type), sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2680945
Meds,1747295
Lab,932873
Imaging,282646
Procedures,48863
Microbiology Culture,36447
Microbiology,1609


In [20]:
# Relabel every feature_type containing "Microbiology" (this folds
# "Microbiology Culture" into "Microbiology"); other types are kept as is.
orders_micro <- mutate(
  orders,
  feature_type = ifelse(
    str_detect(feature_type, "Microbiology"),
    "Microbiology",
    feature_type
  )
)
# Re-check the per-type row counts after the relabelling.
count(group_by(orders_micro, feature_type), sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,2680945
Meds,1747295
Lab,932873
Imaging,282646
Procedures,48863
Microbiology,38056


In [21]:
# Number of distinct encounter ids present in the order counts.
orders_micro[["pat_enc_csn_id_coded"]] %>%
  unique() %>%
  length()

### Check the featurized binned labs_vitals and combine all 

In [22]:
# Load the decile-binned labs/vitals features, then report the number of rows
# and the number of distinct encounter ids.
binned_labs_vitals <- read.csv(
  file = file.path(featuredir, '2_8_binned_labs_vitals.csv')
)
nrow(binned_labs_vitals)
binned_labs_vitals[["pat_enc_csn_id_coded"]] %>%
  unique() %>%
  length()

In [23]:
# Preview the first six rows of the binned labs/vitals features.
head(binned_labs_vitals, n = 6L)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALB_3,1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALK_7,1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALT_0,1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AST_1,1
5,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AnionGap_9,1
6,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,BUN_8,1


In [24]:
# Row count per feature_type in the binned labs/vitals data.
count(group_by(binned_labs_vitals, feature_type))

feature_type,n
<chr>,<int>
labs_results_train,1304104
vitals_train,520971


In [25]:
# Encounter ids that appear in the binned labs/vitals but not in the orders ...
setdiff(
  binned_labs_vitals[["pat_enc_csn_id_coded"]],
  orders_micro[["pat_enc_csn_id_coded"]]
)
# ... and encounter ids that appear in the orders but not in the binned data.
setdiff(
  orders_micro[["pat_enc_csn_id_coded"]],
  binned_labs_vitals[["pat_enc_csn_id_coded"]]
)

In [29]:
# Stack the binned labs/vitals rows on top of the order-count rows.
featurized_counts <- list(binned_labs_vitals, orders_micro) %>%
  bind_rows()
# Sanity checks on the combined table: total rows, distinct encounter ids,
# and rows per feature_type.
nrow(featurized_counts)
featurized_counts[["pat_enc_csn_id_coded"]] %>%
  unique() %>%
  length()
count(group_by(featurized_counts, feature_type))

feature_type,n
<chr>,<int>
Diagnosis,2680945
Imaging,282646
Lab,932873
labs_results_train,1304104
Meds,1747295
Microbiology,38056
Procedures,48863
vitals_train,520971


In [27]:
# Preview the first six rows of the combined feature table.
head(featurized_counts, n = 6L)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALB_3,1
2,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALK_7,1
3,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,ALT_0,1
4,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AST_1,1
5,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,AnionGap_9,1
6,JC29f8ad2,131274729058,2019-08-31 12:52:00+00:00,labs_results_train,BUN_8,1


In [30]:
# Save the combined feature table to the feature output directory,
# without a row-name column.
write.csv(
  featurized_counts,
  file = file.path(featuredir, "2_9_feature_counts.csv"),
  row.names = FALSE
)