### Description:
**REMOVE 1 patient visit = 131289492397 from the final cohort4**
- Read in the decile-binned values of labs and vitals
- Read in the data `coh4_order_code_counts` from sql order counts, combine Microbiology Culture with Microbiology 
- Combine this featurized data for full complex data modeling
- Create a smaller random sample for testing 
    
**Inputs**: 
- `6_10_coh4_binned_labs_vitals_train`: from feature_values, binned into deciles, 2015 - 03/2020
- `6_10_coh4_binned_labs_vitals_test`: 2015 - 03/2020 + 04/2020 - 2021
- `coh4_order_code_counts` and `coh4_order_code_counts_2021`: (from SQL) queried counts for different orders
    
**Outputs**: 
- have demo, vitals and labs in binned counts, order counts: long format, with year extracted
- `6_11_coh4_features_all_long_year`: will be input to 3_models, sparse_matrix.py
    - 2015 - 2018 as trained data for value distributions to bin validation data of 2018
        - used for training and select model hyperparameters based on validation data
    - test data 2019 - 03/2020 left unused
    - after training and selecting hyperparameters, re-do value distribution on 2015 - 03/2020 (train + val) data
    - test data (04/2020 - 2021) are now binned based on this new train/val distribution
    - binned test data is used for the final prediction and evaluation of new model after THICK DESCRIPTION work


In [77]:
library(data.table)
library(tidyverse)
library(lubridate)
# library(Matrix)
# library(slam)
# library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
# Raise the row/column limits used when data frames are rendered in the notebook.
options(
  repr.matrix.max.rows = 200,
  repr.matrix.max.cols = 30
)

In [78]:
# install.packages(c("dbplyr", "RSQLite"))

In [79]:
# datadir: folder the queried order counts are read from.
# outdir: folder holding the derived files this notebook reads and writes.
datadir <- "../../DataTD/shc2021"
outdir <- "../../OutputTD/shc2021"

In [80]:
# Load the cohort-4 table, then keep only the identifiers, the admit time and
# the label columns.
cohort4 <- read.csv(
  file.path(outdir, "11_coh4_noOR_noDKA_edtime_notecount.csv")
)
nrow(cohort4)

cohort <- cohort4 %>%
  select(
    anon_id,
    pat_enc_csn_id_coded,
    admit_time,
    first_label,
    death_3hr_recent_label,
    death_6hr_recent_label,
    death_9hr_recent_label,
    death_12hr_recent_label,
    death_24hr_recent_label
  )
nrow(cohort)

In [81]:
# Order-code counts; the query behind this file used the noOR cohort only.
orders <- read.csv(
  file.path(datadir, "coh4_noOR_order_code_counts_2021.csv")
)
nrow(orders) # 8333201

In [82]:
# Peek at the first order row to see the column layout.
head(orders, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1170548,131062572931,2015-01-02 03:48:00+00:00,Diagnosis,E10.10,1


In [83]:
# Number of order rows per feature_type, largest first.
orders %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,4169163
Meds,2388260
Lab,1290428
Imaging,371008
Procedures,66677
Microbiology Culture,45617
Microbiology,2048


In [84]:
# Relabel every feature_type containing "Microbiology" (this merges
# "Microbiology Culture" into "Microbiology"); other types are left as they are.
orders_micro <- orders %>%
  mutate(
    feature_type = ifelse(
      test = str_detect(feature_type, "Microbiology"),
      yes = "Microbiology",
      no = feature_type
    )
  )

# Re-count per feature_type after the merge.
orders_micro %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,4169163
Meds,2388260
Lab,1290428
Imaging,371008
Procedures,66677
Microbiology,47665


In [85]:
# Mean and standard deviation of the order counts within each feature_type.
orders_micro %>%
  group_by(feature_type) %>%
  summarise(
    avg = mean(values),
    sd = sd(values)
  )

feature_type,avg,sd
<chr>,<dbl>,<dbl>
Diagnosis,1.0,0.0
Imaging,3.34775,5.5440964
Lab,7.429079,17.2114416
Meds,3.042515,6.4621015
Microbiology,2.286646,0.7761681
Procedures,2.103064,4.8880191


In [86]:
# Number of distinct visits that have at least one order row.
n_distinct(orders_micro$pat_enc_csn_id_coded) # 52525
# Visits with orders that are not in the cohort
# (ok because the orders come from cohort noOR).
length(
  setdiff(orders_micro$pat_enc_csn_id_coded, cohort$pat_enc_csn_id_coded)
) # 686
# Cohort visits with no order rows at all.
length(
  setdiff(cohort$pat_enc_csn_id_coded, orders_micro$pat_enc_csn_id_coded)
) # 7
# not everyone has orders
# for the new cohort, everyone has orders

In [87]:
# Compare the column layout of the order counts and the cohort table.
head(orders_micro, n = 1)
head(cohort, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1170548,131062572931,2015-01-02 03:48:00+00:00,Diagnosis,E10.10,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00,0,0,0,0,0,0


In [88]:
# Drop order rows whose visit is not part of the cohort.
orders_micro <- filter(
  orders_micro,
  pat_enc_csn_id_coded %in% cohort$pat_enc_csn_id_coded
)
n_distinct(orders_micro$pat_enc_csn_id_coded) # 51839 (7 missing)

### Get the demographic features from each cohort, then combine

#### Run the old cohort

In [89]:
# Load the long feature-value table and check its size, the number of
# distinct visits, and the row count per feature_type.
values <- read.csv(
  file.path(outdir, "12_coh4_noOR_noDKA_feature_values.csv")
)
nrow(values) # coh4 3867101
n_distinct(values$pat_enc_csn_id_coded) # 51846
values %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
demo,881382
labs,1610383
time_count,207384
vitals,1167952


In [90]:
# Demographic features: the rows of `values` whose feature_type is "demo",
# reduced to the six columns shared by all feature tables.
demos <- values %>%
  filter(feature_type == "demo") %>%
  select(
    anon_id,
    pat_enc_csn_id_coded,
    admit_time,
    feature_type,
    features,
    values
  )
nrow(demos) # coh4 881382
n_distinct(demos$pat_enc_csn_id_coded) # 51846

In [91]:
# Row counts of the demographic table, per feature_type and per feature name.
demos %>%
  group_by(feature_type) %>%
  count()
demos %>%
  group_by(features) %>%
  count()

feature_type,n
<chr>,<int>
demo,881382


features,n
<chr>,<int>
age,51846
delta_ESI,51846
delta_H,51846
delta_W,51846
English,51846
ESI_i,51846
gender,51846
Height_i,51846
insurance,51846
race.Asian,51846


### Check the featurized binned labs_vitals
- Keep them both (`..._train` and `..._test`) in the same dataframe
- Run `2_7_feature_values`, `2_8_binned_labs_vitals_train`, and `2_8_binned_labs_vitals_test` 
- Rerun the old cohort and new cohort separately, then combine the demos and order counts features
- The featurized labs and vitals are already combined in the previous notebook 6.10

In [125]:
# Labs and vitals binned with the train distribution.
# Earlier version of the input, kept for reference:
# trainbinned_labs_vitals <- read.csv(file.path(outdir, '13_coh4_binned_labs_vitals_train.csv'))
trainbinned_labs_vitals <- read.csv(
  file.path(outdir, "13.2_coh4_binned_labs_vitals_train.csv")
)

nrow(trainbinned_labs_vitals) # coh4: old 2185308
n_distinct(trainbinned_labs_vitals$pat_enc_csn_id_coded) # 51845
trainbinned_labs_vitals %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_train,1535102
vitals_train,643338


In [126]:
# Cohort visits that have no row in the train-binned labs/vitals.
# Original note: this patient apparently has no vitals or labs within 24 hours
# (the difftime step in featurization did not pick this one up); it is removed.
setdiff(
  cohort$pat_enc_csn_id_coded,
  trainbinned_labs_vitals$pat_enc_csn_id_coded
)

In [127]:
# Labs and vitals binned for the test setting (new cohort).
# Earlier version of the input, kept for reference:
# testbinned_labs_vitals <- read.csv(file.path(outdir, '13_coh4_binned_labs_vitals_test.csv'))
testbinned_labs_vitals <- read.csv(
  file.path(outdir, "13.2_coh4_binned_labs_vitals_test.csv")
)

nrow(testbinned_labs_vitals) # coh4 2184675
n_distinct(testbinned_labs_vitals$pat_enc_csn_id_coded) # 51845
testbinned_labs_vitals %>%
  group_by(feature_type) %>%
  count()

feature_type,n
<chr>,<int>
labs_results_test,1535266
vitals_test,650027


In [96]:
# First row of each binned table, to compare layout and feature_type labels.
head(trainbinned_labs_vitals, n = 1)
head(testbinned_labs_vitals, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00+00:00,labs_results_train,ALB_2,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00+00:00,labs_results_test,ALB_1,1


### Combine all features

In [97]:
# First row of every table about to be stacked, to confirm they share columns.
head(demos, n = 1)
head(orders_micro, n = 1)
head(trainbinned_labs_vitals, n = 1)
head(testbinned_labs_vitals, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,JC1000116,131066472308,2015-01-28 00:46:00,demo,ESI_i,3


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1170548,131062572931,2015-01-02 03:48:00+00:00,Diagnosis,E10.10,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00+00:00,labs_results_train,ALB_2,1


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00+00:00,labs_results_test,ALB_1,1


In [128]:
# Stack demographics, order counts and both binned labs/vitals tables into one
# long table for the complex data set, drop visit 131289492397, and parse
# admit_time (text, with or without a "+00:00" suffix) into a date-time.
features_all <- bind_rows(
  demos,
  orders_micro,
  trainbinned_labs_vitals,
  testbinned_labs_vitals
) %>%
  filter(pat_enc_csn_id_coded != 131289492397) %>%
  mutate(admit_time = ymd_hms(admit_time))
nrow(features_all) # 14269242
n_distinct(features_all$pat_enc_csn_id_coded) # 51845
n_distinct(features_all$features) # 49278
features_all %>%
  group_by(feature_type) %>%
  count(sort = TRUE)

feature_type,n
<chr>,<int>
Diagnosis,4117094
Meds,2354880
labs_results_test,1535266
labs_results_train,1535102
Lab,1274123
demo,881365
vitals_test,650027
vitals_train,643338
Imaging,367675
Procedures,65928


In [99]:
# First and last row of the stacked table.
head(features_all, n = 1)
tail(features_all, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<dttm>,<chr>,<chr>,<dbl>
1,JC1000116,131066472308,2015-01-28 00:46:00,demo,ESI_i,3


Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values
Unnamed: 0_level_1,<chr>,<dbl>,<dttm>,<chr>,<chr>,<dbl>
13471900,JC999925,131274000000.0,2019-07-29 01:55:00,labs_results_test,eGFR_2,1


In [100]:
# Spot check: distinct parsed admit_time values for one visit.
features_all %>%
  filter(pat_enc_csn_id_coded == 131062572931) %>%
  distinct(admit_time)

admit_time
<dttm>
2015-01-02 03:48:00


In [101]:
# Sanity check: feature names that occur under exactly two feature types.
n2 <- features_all %>%
  distinct(feature_type, features) %>%
  group_by(features) %>%
  count(sort = TRUE) %>%
  filter(n == 2)
head(n2)
tail(n2)

features,n
<chr>,<int>
ALB_0,2
ALB_1,2
ALB_10,2
ALB_2,2
ALB_3,2
ALB_4,2


features,n
<chr>,<int>
WBC_4,2
WBC_5,2
WBC_6,2
WBC_7,2
WBC_8,2
WBC_9,2


In [102]:
# Feature types that contain any of the names collected in n2.
features_all %>%
  filter(features %in% n2$features) %>%
  distinct(feature_type) # group_by(features, feature_type) %>% count()

feature_type
<chr>
Lab
Procedures
Microbiology
Imaging
Meds
labs_results_train
vitals_train
labs_results_test
vitals_test


In [103]:
# The 30 most frequent (feature_type, features) pairs.
features_all %>%
  group_by(feature_type, features) %>%
  count(sort = TRUE) %>%
  head(30)

feature_type,features,n
<chr>,<chr>,<int>
demo,age,51845
demo,delta_ESI,51845
demo,delta_H,51845
demo,delta_W,51845
demo,English,51845
demo,ESI_i,51845
demo,gender,51845
demo,Height_i,51845
demo,insurance,51845
demo,race.Asian,51845


In [129]:
# Column names, row count, value distribution and number of missing values.
colnames(features_all)
nrow(features_all) # 13584533
summary(features_all$values)
sum(is.na(features_all$values))

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   0.000    1.000    1.000    3.245    2.000 1129.000 

In [130]:
# Add the admit year as its own column, then count rows per year.
features_all_year <- features_all %>%
  mutate(year = year(admit_time)) # , month = month(admit_time))
features_all_year %>%
  group_by(year) %>%
  count()
# features_all_year %>% group_by(month) %>% count()

year,n
<dbl>,<int>
2015,1809832
2016,1504263
2017,1472043
2018,1987060
2019,2208136
2020,2342905
2021,2147661


In [107]:
# Number of cohort visits per admit year.
cohort %>%
  mutate(year = year(admit_time)) %>%
  group_by(year) %>%
  count()

year,n
<dbl>,<int>
2015,7147
2016,6053
2017,5804
2018,8131
2019,8550
2020,8497
2021,7664


In [121]:
# Visit-level overlap between the feature table and the cohort.
nrow(features_all_year)
n_distinct(features_all_year$pat_enc_csn_id_coded) # 60464
length(
  setdiff(features_all_year$pat_enc_csn_id_coded, cohort$pat_enc_csn_id_coded)
)
# `cohort` has not had the 1 patient visit removed yet
length(
  setdiff(cohort$pat_enc_csn_id_coded, features_all_year$pat_enc_csn_id_coded)
)

In [109]:
# First three cohort rows.
head(cohort, n = 3)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_3hr_recent_label,death_6hr_recent_label,death_9hr_recent_label,death_12hr_recent_label,death_24hr_recent_label
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00,0,0,0,0,0,0
2,JC1000116,131295313275,2020-09-29 22:45:00,0,0,0,0,0,0
3,JC1000296,131100574537,2015-07-03 04:51:00,0,0,0,0,0,0


In [110]:
# First row of the feature table with the year column added.
head(features_all_year, n = 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,feature_type,features,values,year
Unnamed: 0_level_1,<chr>,<dbl>,<dttm>,<chr>,<chr>,<dbl>,<dbl>
1,JC1000116,131066472308,2015-01-28 00:46:00,demo,ESI_i,3,2015


In [111]:
# Final cohort: every cohort visit except 131289492397.
cohort_final <- filter(cohort, pat_enc_csn_id_coded != 131289492397)
nrow(cohort_final)
summary(cohort_final)

   anon_id          pat_enc_csn_id_coded  admit_time         first_label    
 Length:51845       Min.   :1.311e+11    Length:51845       Min.   :0.0000  
 Class :character   1st Qu.:1.312e+11    Class :character   1st Qu.:0.0000  
 Mode  :character   Median :1.313e+11    Mode  :character   Median :0.0000  
                    Mean   :1.312e+11                       Mean   :0.1023  
                    3rd Qu.:1.313e+11                       3rd Qu.:0.0000  
                    Max.   :1.313e+11                       Max.   :1.0000  
 death_3hr_recent_label death_6hr_recent_label death_9hr_recent_label
 Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
 1st Qu.:0.0000         1st Qu.:0.0000         1st Qu.:0.0000        
 Median :0.0000         Median :0.0000         Median :0.0000        
 Mean   :0.1066         Mean   :0.1082         Mean   :0.1085        
 3rd Qu.:0.0000         3rd Qu.:0.0000         3rd Qu.:0.0000        
 Max.   :1.0000         Max.   :1.0000   

In [131]:
# Visits in the feature table that are not in the final cohort, and the reverse.
# The trailing numbers and note are the annotations recorded in the notebook.
length(
  setdiff(
    features_all_year$pat_enc_csn_id_coded,
    cohort_final$pat_enc_csn_id_coded
  )
) # 686 # ok because this is from cohort noOR
length(
  setdiff(
    cohort_final$pat_enc_csn_id_coded,
    features_all_year$pat_enc_csn_id_coded
  )
) # 7

# FINAL COHORT no OR no DKA, remove 1

In [123]:
# Save the final cohort (noOR, noDKA, with the 1 single csn removed).
write.csv(
  cohort_final,
  file = file.path(outdir, "14_cohort_final.csv"),
  row.names = FALSE
)

In [132]:
# Save the long feature table (with year) for the modeling step.
#
# admit_time is a POSIXct column. write.csv() turns classed columns into text
# with as.character(), and since R 4.3.0 that conversion omits the time part
# for values at exactly midnight. Formatting the column explicitly writes every
# row as "YYYY-MM-DD HH:MM:SS" regardless of the R version.
#
# `quote` lists the columns that are character/factor in features_all_year,
# which is the set write.csv() quotes by default for the unmodified table, so
# the formatted admit_time is not quoted differently from before.
#
# Earlier output name, kept for reference:
# write.csv(features_all_year, file = file.path(outdir, "14_coh_all_features_all_long_year.csv"), row.names=FALSE)
write.csv(
  mutate(
    features_all_year,
    admit_time = format(admit_time, "%Y-%m-%d %H:%M:%S")
  ),
  file = file.path(outdir, "14_2_coh_all_features_all_long_year.csv"),
  row.names = FALSE,
  quote = which(vapply(
    features_all_year,
    function(col) is.character(col) || is.factor(col),
    logical(1)
  ))
)

In [114]:
# Column-wise summary of the table that was written out.
summary(object = features_all_year)

   anon_id          pat_enc_csn_id_coded   admit_time                 
 Length:13471900    Min.   :1.311e+11    Min.   :2015-01-01 06:30:00  
 Class :character   1st Qu.:1.312e+11    1st Qu.:2017-01-14 04:45:00  
 Mode  :character   Median :1.313e+11    Median :2018-12-25 21:38:00  
                    Mean   :1.312e+11    Mean   :2018-09-28 18:57:48  
                    3rd Qu.:1.313e+11    3rd Qu.:2020-07-08 03:24:00  
                    Max.   :1.313e+11    Max.   :2021-09-30 07:00:00  
 feature_type         features             values              year     
 Length:13471900    Length:13471900    Min.   :   0.000   Min.   :2015  
 Class :character   Class :character   1st Qu.:   1.000   1st Qu.:2017  
 Mode  :character   Mode  :character   Median :   1.000   Median :2018  
                                       Mean   :   3.245   Mean   :2018  
                                       3rd Qu.:   2.000   3rd Qu.:2020  
                                       Max.   :1129.000   Max.   