## Descriptions:
Process 2 variables from demograhics table
- Process language --> English: yes/no
- Process insurance (medicare/caid/cal/mcal/mcare/na) --> yes/no

**Inputs**: 
- `1_2_cohort`: updated 1_1_cohort with inc/exc criteria, which also has demographics information

**Outputs**: 
- `2_1_coh2_demo`

### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




### Demographic variables
- Age (only >=18): done in in/ex criteria
- insurance = Medical/care/caid and n/a insurance --> medis =1 otherwise 0
- language --> Engl = 1 for English, otherwise 0
- leave height and weight to be processed with flowsheet age (>=18 only)

In [2]:
# datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"

cohort <- read.csv(file.path(cohortdir, "1_2_cohort.csv"))
nrow(cohort) #45794
summary(cohort) 

   anon_id          pat_enc_csn_id_coded     label         admit_time       
 Length:45794       Min.   :1.311e+11    Min.   :0.0000   Length:45794      
 Class :character   1st Qu.:1.312e+11    1st Qu.:0.0000   Class :character  
 Mode  :character   Median :1.312e+11    Median :0.0000   Mode  :character  
                    Mean   :1.312e+11    Mean   :0.1391                     
                    3rd Qu.:1.313e+11    3rd Qu.:0.0000                     
                    Max.   :1.313e+11    Max.   :1.0000                     
                                                                            
 inpatient_data_id_coded      ESI        hosp_admsn_time      ed_time_hr     
 Min.   :13616753        Min.   :1.000   Length:45794       Min.   :  0.000  
 1st Qu.:19857182        1st Qu.:2.000   Class :character   1st Qu.:  2.217  
 Median :27890836        Median :3.000   Mode  :character   Median :  3.467  
 Mean   :28153424        Mean   :2.654                      Mean   :  4.

In [3]:
head(cohort, n=1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<int>
1,JC29f8ad2,131274729058,0,2019-08-31 12:52:00,40679773,3,2019-08-31 09:59:00,2.883333,Female,White,English,165.1,83.46,UNITED HEALTHCARE,2020-03-31,52


In [4]:
cohort <- cohort %>% select(-hosp_admsn_time, -ed_time_hr) %>% # hosp_admsn_time = ymd_hms(hosp_admsn_time)
                mutate(admit_time = ymd_hms(admit_time), recent_date = ymd(recent_date), 
                       gender = factor(gender), race = factor(race), language = factor(language), insurance = factor(insurance)) 
                
nrow(cohort)
summary(cohort)

   anon_id          pat_enc_csn_id_coded     label       
 Length:45794       Min.   :1.311e+11    Min.   :0.0000  
 Class :character   1st Qu.:1.312e+11    1st Qu.:0.0000  
 Mode  :character   Median :1.312e+11    Median :0.0000  
                    Mean   :1.312e+11    Mean   :0.1391  
                    3rd Qu.:1.313e+11    3rd Qu.:0.0000  
                    Max.   :1.313e+11    Max.   :1.0000  
                                                         
   admit_time                  inpatient_data_id_coded      ESI       
 Min.   :2015-01-01 08:24:00   Min.   :13616753        Min.   :1.000  
 1st Qu.:2016-05-21 09:49:30   1st Qu.:19857182        1st Qu.:2.000  
 Median :2017-12-31 23:32:00   Median :27890836        Median :3.000  
 Mean   :2017-10-13 02:11:20   Mean   :28153424        Mean   :2.654  
 3rd Qu.:2019-02-19 07:36:45   3rd Qu.:34378572        3rd Qu.:3.000  
 Max.   :2020-03-31 07:00:00   Max.   :48026130        Max.   :5.000  
                                       

In [5]:
# checking duplicates
nrow(cohort %>% select(anon_id) %>% distinct()) # 31511
nrow(cohort %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct()) # 45794
cohort[duplicated(cohort[, c('anon_id','pat_enc_csn_id_coded')]),]

# cohort %>% filter(anon_id == "JCd49287") # pat_enc_csn_id_coded = 131195706986, 
# cohort <- cohort %>% filter (!(anon_id == "JCd49287" & race == "Unknown"))

anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
<chr>,<dbl>,<int>,<dttm>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<fct>,<date>,<int>


In [6]:
summary(cohort %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age))
cohort %>% gather(var, value) %>% distinct() %>% count(var) %>% arrange(n)
cohort %>% summarise_each(funs(n_distinct))

      ESI           gender                    race       recent_height_cm
 Min.   :1.000   Female:21898   Asian           : 6870   Min.   : 12.95  
 1st Qu.:2.000   Male  :23896   Black           : 3260   1st Qu.:160.02  
 Median :3.000                  Native American :  195   Median :167.64  
 Mean   :2.654                  Other           :10511   Mean   :167.97  
 3rd Qu.:3.000                  Pacific Islander:  930   3rd Qu.:175.26  
 Max.   :5.000                  Unknown         :  424   Max.   :210.82  
 NA's   :1861                   White           :23604   NA's   :1414    
 recent_weight_kg      age       
 Min.   :  0.1    Min.   :18.00  
 1st Qu.: 60.9    1st Qu.:44.00  
 Median : 73.1    Median :60.00  
 Mean   : 76.6    Mean   :58.28  
 3rd Qu.: 88.0    3rd Qu.:73.00  
 Max.   :320.7    Max.   :90.00  
 NA's   :310                     

“attributes are not identical across measure variables;
they will be dropped”


var,n
<chr>,<int>
gender,2
label,2
ESI,6
race,7
language,55
age,73
insurance,134
recent_date,457
recent_height_cm,789
recent_weight_kg,3445


“`summarise_each_()` was deprecated in dplyr 0.7.0.
Please use `across()` instead.
“`funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
31511,45794,2,45260,45794,6,2,7,55,789,3445,134,457,73


### Insurance:

In [7]:
options(repr.matrix.max.rows=135, repr.matrix.max.cols=20)
cohort %>%                 # filter(!is.na(col)) %>% filter out all rows with NAs in col 
                group_by(insurance) %>% count() %>% arrange(desc(n))

insurance,n
<fct>,<int>
MEDICARE,16949
BLUE CROSS,3387
HPSM,3130
,2724
MEDI-CAL,2051
BLUE SHIELD,1891
SANTA CLARA FAMILY HP,1745
UNITED HEALTHCARE,1575
AETNA,1232
HPSM - MCARE ADV,1036


In [8]:
# check for medicare/ medical/ no insurance
# cohort %>% filter(str_detect(insurance, "MEDI") | insurance == "") %>% 
#                 group_by(insurance) %>% count() %>% arrange(-n)

# mcal = medical, mcare = medicare
medis <- c("MEDI", "MCAL", "MCARE")
cohort %>% filter(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "") %>% 
                group_by(insurance) %>% count() %>% arrange(-n)

insurance,n
<fct>,<int>
MEDICARE,16949
,2724
MEDI-CAL,2051
HPSM - MCARE ADV,1036
MCAL HMO - CENTRAL CALIFORNIA ALLIANCE FOR HEALTH,692
ALAMEDA ALLIANCE MCAL MGD CARE,658
BLUE CROSS MEDI-CAL,419
UHC MEDICARE AARP COMPLETE,383
SCFHP - MCARE ADV,363
SNF HOSPICE MEDICARE (HOSP ONLY),323


In [9]:
# turn insurance into medis which has "MEDI" under insurance
# insurance = 0 if patient has medis or no insurance, 1 for other insurance
medis <- c("MEDI", "MCAL", "MCARE")

nrow(cohort %>% filter(insurance == "")) # 2737
nrow(cohort %>% filter(str_detect(insurance, "MEDI"))) # 20766
nrow(cohort %>% filter(str_detect(insurance, paste(medis, collapse = "|")))) # 23987

cohort_demo <- cohort %>%
                    mutate(insurance = ifelse(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "", 0, 1)) %>%
                    group_by(anon_id, pat_enc_csn_id_coded) %>%
                    mutate(insurance = sum(insurance)) %>% ungroup() %>% # just to make sure if anyone else has more than 1
                    mutate(insurance = ifelse(insurance>0, 1, insurance)) 

cohort_demo %>% count(insurance)

insurance,n
<dbl>,<int>
0,26595
1,19199


### Language:

In [10]:
cohort_demo %>% group_by(language) %>% count() %>% arrange(desc(n))

language,n
<fct>,<int>
English,38611
Spanish,3672
Mandarin,737
Vietnamese,511
Cantonese,266
Russian,263
Farsi,261
Tongan,212
Tagalog,179
Korean,152


In [11]:
cohort_demo <- cohort_demo %>%
                    mutate(English = ifelse(language == "English", 1, 0)) %>% 
                    select(-c(language))

cohort_demo %>% count(English)
nrow(cohort_demo)
length(unique(cohort_demo$anon_id))
summary(cohort_demo %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age, insurance, English))

English,n
<dbl>,<int>
0,7183
1,38611


      ESI           gender                    race       recent_height_cm
 Min.   :1.000   Female:21898   Asian           : 6870   Min.   : 12.95  
 1st Qu.:2.000   Male  :23896   Black           : 3260   1st Qu.:160.02  
 Median :3.000                  Native American :  195   Median :167.64  
 Mean   :2.654                  Other           :10511   Mean   :167.97  
 3rd Qu.:3.000                  Pacific Islander:  930   3rd Qu.:175.26  
 Max.   :5.000                  Unknown         :  424   Max.   :210.82  
 NA's   :1861                   White           :23604   NA's   :1414    
 recent_weight_kg      age          insurance         English      
 Min.   :  0.1    Min.   :18.00   Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 60.9    1st Qu.:44.00   1st Qu.:0.0000   1st Qu.:1.0000  
 Median : 73.1    Median :60.00   Median :0.0000   Median :1.0000  
 Mean   : 76.6    Mean   :58.28   Mean   :0.4192   Mean   :0.8431  
 3rd Qu.: 88.0    3rd Qu.:73.00   3rd Qu.:1.0000   3rd Qu.:1.0000  


In [12]:
head(cohort_demo, n=1)
colnames(cohort_demo)

anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,gender,race,recent_height_cm,recent_weight_kg,insurance,recent_date,age,English
<chr>,<dbl>,<int>,<dttm>,<int>,<int>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<date>,<int>,<dbl>
JC29f8ad2,131274729058,0,2019-08-31 12:52:00,40679773,3,Female,White,165.1,83.46,1,2020-03-31,52,1


In [13]:
# save file cohort_demo
write.csv(cohort_demo, file = file.path(featuredir, "2_1_coh2_demo.csv"), row.names=FALSE) 