## Descriptions:
Process 2 variables from demograhics table, using cohort2
- Process language --> English: yes/no
- Process insurance (medicare/caid/cal/mcal/mcare/na) --> yes/no

**Changes:**
- Use shc_core_2021
- Remove any `recent_date` related contents

**Inputs**: 
- `6_2_cohort2`: updated `6_1_cohort1` with inc/exc criteria, which also has demographics information

**Outputs**: 
- `3_coh2_demo`

### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




### Demographic variables
- Age (only >=18): done in in/ex criteria
- insurance = Medical/care/caid and n/a insurance --> medis =1 otherwise 0
- language --> Engl = 1 for English, otherwise 0
- leave height and weight to be processed with flowsheet age (>=18 only)

In [3]:
# datadir = "../../DataTD"
datadir = "../../DataTD/shc2021"
outdir = "../../OutputTD/shc2021"

cohort <- read.csv(file.path(outdir, "2_cohort2.csv"))
nrow(cohort) # 17128
summary(cohort) 

   anon_id          pat_enc_csn_id_coded admit_time_jittered     label      
 Length:63050       Min.   :1.311e+11    Length:63050        Min.   :0.000  
 Class :character   1st Qu.:1.312e+11    Class :character    1st Qu.:0.000  
 Mode  :character   Median :1.313e+11    Mode  :character    Median :0.000  
                    Mean   :1.312e+11                        Mean   :0.139  
                    3rd Qu.:1.313e+11                        3rd Qu.:0.000  
                    Max.   :1.313e+11                        Max.   :1.000  
                                                                            
  admit_time           adm_year      adm_month      inpatient_data_id_coded
 Length:63050       Min.   :2015   Min.   : 1.000   Min.   :18839023       
 Class :character   1st Qu.:2016   1st Qu.: 3.000   1st Qu.:27991297       
 Mode  :character   Median :2018   Median : 6.000   Median :38223187       
                    Mean   :2018   Mean   : 6.347   Mean   :41283830       
    

In [4]:
head(cohort, n=1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,age
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<int>
1,JC1000116,131066472308,2015-01-28 00:46:00+00:00,0,2015-01-28 00:46:00,2015,1,19328596,3,2015-01-27 04:24:00,20.36667,Female,Other,Spanish,154,73.05,HPSM,38


In [5]:
cohort <- cohort %>% select(-hosp_admsn_time, -ed_time_hr) %>% # hosp_admsn_time = ymd_hms(hosp_admsn_time)
                mutate(admit_time = ymd_hms(admit_time), # recent_date = ymd(recent_date), 
                       gender = factor(gender), race = factor(race), language = factor(language), insurance = factor(insurance)) 
                
nrow(cohort)
summary(cohort)

   anon_id          pat_enc_csn_id_coded admit_time_jittered     label      
 Length:63050       Min.   :1.311e+11    Length:63050        Min.   :0.000  
 Class :character   1st Qu.:1.312e+11    Class :character    1st Qu.:0.000  
 Mode  :character   Median :1.313e+11    Mode  :character    Median :0.000  
                    Mean   :1.312e+11                        Mean   :0.139  
                    3rd Qu.:1.313e+11                        3rd Qu.:0.000  
                    Max.   :1.313e+11                        Max.   :1.000  
                                                                            
   admit_time                     adm_year      adm_month     
 Min.   :2015-01-01 04:52:00   Min.   :2015   Min.   : 1.000  
 1st Qu.:2016-12-27 04:51:15   1st Qu.:2016   1st Qu.: 3.000  
 Median :2018-11-15 00:08:30   Median :2018   Median : 6.000  
 Mean   :2018-09-02 16:29:46   Mean   :2018   Mean   : 6.347  
 3rd Qu.:2020-05-27 07:31:15   3rd Qu.:2020   3rd Qu.: 9.000  
 Max. 

In [6]:
# checking duplicates
nrow(cohort %>% select(anon_id) %>% distinct()) # 31511
nrow(cohort %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct()) # 45794
cohort[duplicated(cohort[, c('anon_id','pat_enc_csn_id_coded')]),]

# cohort %>% filter(anon_id == "JCd49287") # pat_enc_csn_id_coded = 131195706986, 
# cohort <- cohort %>% filter (!(anon_id == "JCd49287" & race == "Unknown"))

anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,gender,race,language,recent_height_cm,recent_weight_kg,insurance,age
<chr>,<dbl>,<chr>,<int>,<dttm>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<fct>,<int>


In [7]:
summary(cohort %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age))
cohort %>% gather(var, value) %>% distinct() %>% count(var) %>% arrange(n)
cohort %>% summarise_each(funs(n_distinct))

      ESI            gender                    race       recent_height_cm
 Min.   :1.000   Female :29971   Asian           : 9713   Min.   : 12.95  
 1st Qu.:2.000   Male   :33072   Black           : 4409   1st Qu.:160.02  
 Median :3.000   Unknown:    7   Native American :  262   Median :167.64  
 Mean   :2.625                   Other           :15029   Mean   :168.01  
 3rd Qu.:3.000                   Pacific Islander: 1301   3rd Qu.:175.30  
 Max.   :5.000                   Unknown         :  440   Max.   :213.36  
 NA's   :1967                    White           :31896   NA's   :1658    
 recent_weight_kg      age        
 Min.   :  0.10   Min.   : 18.00  
 1st Qu.: 60.90   1st Qu.: 45.00  
 Median : 73.30   Median : 61.00  
 Mean   : 76.73   Mean   : 58.82  
 3rd Qu.: 88.30   3rd Qu.: 73.00  
 Max.   :504.85   Max.   :118.00  
 NA's   :284                      

“attributes are not identical across measure variables;
they will be dropped”


var,n
<chr>,<int>
label,2
gender,3
ESI,6
adm_year,7
race,7
adm_month,12
language,56
age,93
insurance,117
recent_height_cm,876


“`summarise_each_()` was deprecated in dplyr 0.7.0.
Please use `across()` instead.
“`funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,gender,race,language,recent_height_cm,recent_weight_kg,insurance,age
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
42037,63050,62315,2,62315,7,12,63050,6,3,7,56,876,3721,117,93


### Insurance:

In [8]:
options(repr.matrix.max.rows=135, repr.matrix.max.cols=20)
cohort %>%                 # filter(!is.na(col)) %>% filter out all rows with NAs in col 
                group_by(insurance) %>% count() %>% arrange(desc(n))

insurance,n
<fct>,<int>
MEDICARE,23706
BLUE CROSS,4463
HPSM,4243
,3672
MEDI-CAL,2581
BLUE SHIELD,2578
SANTA CLARA FAMILY HP,2443
UNITED HEALTHCARE,2119
AETNA,1667
HPSM - MCARE ADV,1465


In [9]:
# check for medicare/ medical/ no insurance
# cohort %>% filter(str_detect(insurance, "MEDI") | insurance == "") %>% 
#                 group_by(insurance) %>% count() %>% arrange(-n)

# mcal = medical, mcare = medicare
medis <- c("MEDI", "MCAL", "MCARE")
cohort %>% filter(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "") %>% 
                group_by(insurance) %>% count() %>% arrange(-n)

insurance,n
<fct>,<int>
MEDICARE,23706
,3672
MEDI-CAL,2581
HPSM - MCARE ADV,1465
MCAL HMO - CENTRAL CALIFORNIA ALLIANCE FOR HEALTH,1022
ALAMEDA ALLIANCE MCAL MGD CARE,1011
UHC MEDICARE AARP COMPLETE,664
BLUE CROSS MEDI-CAL,634
SCFHP - MCARE ADV,553
SNF HOSPICE MEDICARE (HOSP ONLY),383


In [10]:
# turn insurance into medis which has "MEDI" under insurance
# insurance = 0 if patient has medis or no insurance, 1 for other insurance
medis <- c("MEDI", "MCAL", "MCARE")

nrow(cohort %>% filter(insurance == "")) # 2737
nrow(cohort %>% filter(str_detect(insurance, "MEDI"))) # 20766
nrow(cohort %>% filter(str_detect(insurance, paste(medis, collapse = "|")))) # 23987

cohort_demo <- cohort %>%
                    mutate(insurance = ifelse(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "", 0, 1)) %>%
                    group_by(anon_id, pat_enc_csn_id_coded) %>%
                    mutate(insurance = sum(insurance)) %>% ungroup() %>% # just to make sure if anyone else has more than 1
                    mutate(insurance = ifelse(insurance>0, 1, insurance)) 

cohort_demo %>% count(insurance)

insurance,n
<dbl>,<int>
0,37063
1,25987


### Language:

In [11]:
cohort_demo %>% group_by(language) %>% count() %>% arrange(desc(n))

language,n
<fct>,<int>
English,52631
Spanish,5412
Mandarin,990
Vietnamese,766
Cantonese,393
Russian,363
Farsi,339
Tongan,323
Tagalog,256
Korean,203


In [12]:
cohort_demo <- cohort_demo %>%
                    mutate(English = ifelse(language == "English", 1, 0)) %>% 
                    select(-c(language))

cohort_demo %>% count(English)
nrow(cohort_demo)
length(unique(cohort_demo$anon_id))
summary(cohort_demo %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age, insurance, English))

English,n
<dbl>,<int>
0,10419
1,52631


      ESI            gender                    race       recent_height_cm
 Min.   :1.000   Female :29971   Asian           : 9713   Min.   : 12.95  
 1st Qu.:2.000   Male   :33072   Black           : 4409   1st Qu.:160.02  
 Median :3.000   Unknown:    7   Native American :  262   Median :167.64  
 Mean   :2.625                   Other           :15029   Mean   :168.01  
 3rd Qu.:3.000                   Pacific Islander: 1301   3rd Qu.:175.30  
 Max.   :5.000                   Unknown         :  440   Max.   :213.36  
 NA's   :1967                    White           :31896   NA's   :1658    
 recent_weight_kg      age           insurance         English      
 Min.   :  0.10   Min.   : 18.00   Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 60.90   1st Qu.: 45.00   1st Qu.:0.0000   1st Qu.:1.0000  
 Median : 73.30   Median : 61.00   Median :0.0000   Median :1.0000  
 Mean   : 76.73   Mean   : 58.82   Mean   :0.4122   Mean   :0.8348  
 3rd Qu.: 88.30   3rd Qu.: 73.00   3rd Qu.:1.0000   3rd

In [13]:
head(cohort_demo, n=1)
colnames(cohort_demo)

anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,gender,race,recent_height_cm,recent_weight_kg,insurance,age,English
<chr>,<dbl>,<chr>,<int>,<dttm>,<int>,<int>,<int>,<int>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
JC1000116,131066472308,2015-01-28 00:46:00+00:00,0,2015-01-28 00:46:00,2015,1,19328596,3,Female,Other,154,73.05,1,38,0


In [14]:
# save file cohort_demo
write.csv(cohort_demo, file = file.path(outdir, "3_coh2_demo.csv"), row.names=FALSE) 