## Descriptions:
process variables from demograhics table
- Process language --> English: yes/no
- Process insurance (medicare/caid/cal/mcal/mcare/na) --> yes/no

**Inputs**: 
- *cohort1_criteria*: updated cohort0 with inc/exc criteria
- *demographics*

**Outputs**: 

In [1]:
library(caret) # import this before glmnet to avoid rlang version problem
library(xgboost)
library(data.table)
library(tidyverse)
library(lubridate)
library(Matrix)
# library(slam)
library(glmnet)
library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
library(mice)
options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

Loading required package: lattice

Loading required package: ggplot2

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1
[32m✔[39m [34mpurrr  [39m 0.3.4     

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mlift()[39m      mas

### Demographic variables
- Age (only >=18): done in in/ex criteria
- insurance = Medical/care/caid and n/a insurance --> medis =1 otherwise 0
- language --> Engl = 1 for English, otherwise 0
- leave height and weight to be processed with flowsheet age (>=18 only)

In [4]:
datadir = "../../DataTD"

cohort_demo <- read.csv(file.path(datadir, "cohort1_criteria.csv"))
nrow(cohort_demo)
summary(cohort_demo)

   anon_id          pat_enc_csn_id_coded     label        admit_time       
 Length:45986       Min.   :1.311e+11    Min.   :0.000   Length:45986      
 Class :character   1st Qu.:1.312e+11    1st Qu.:0.000   Class :character  
 Mode  :character   Median :1.312e+11    Median :0.000   Mode  :character  
                    Mean   :1.312e+11    Mean   :0.138                     
                    3rd Qu.:1.313e+11    3rd Qu.:0.000                     
                    Max.   :1.313e+11    Max.   :1.000                     
                                                                           
 inpatient_data_id_coded      ESI        hosp_admsn_time      ed_time_hr     
 Min.   :13616753        Min.   :1.000   Length:45986       Min.   :  0.000  
 1st Qu.:19863571        1st Qu.:2.000   Class :character   1st Qu.:  2.217  
 Median :27889292        Median :3.000   Mode  :character   Median :  3.483  
 Mean   :28152729        Mean   :2.655                      Mean   :  5.015  
 3

In [3]:
head(cohort_demo, n=1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<int>
1,JCcb7792,131236214276,1,2017-06-23 22:27:00+00:00,25347698,2,2017-06-23 20:59:00+00:00,1.466667,Female,White,English,,,CIGNA,2020-03-03,62


In [4]:
cohort_demo <- cohort_demo %>% 
                mutate(admit_time = ymd_hms(admit_time), hosp_admsn_time = ymd_hms(hosp_admsn_time), recent_date = ymd(recent_date), 
                       gender = factor(gender), race = factor(race), language = factor(language), insurance = factor(insurance))
nrow(cohort_demo)
summary(cohort_demo)

   anon_id          pat_enc_csn_id_coded     label      
 Length:45986       Min.   :1.311e+11    Min.   :0.000  
 Class :character   1st Qu.:1.312e+11    1st Qu.:0.000  
 Mode  :character   Median :1.312e+11    Median :0.000  
                    Mean   :1.312e+11    Mean   :0.138  
                    3rd Qu.:1.313e+11    3rd Qu.:0.000  
                    Max.   :1.313e+11    Max.   :1.000  
                                                        
   admit_time                  inpatient_data_id_coded      ESI       
 Min.   :2015-01-01 08:24:00   Min.   :13616753        Min.   :1.000  
 1st Qu.:2016-05-22 19:15:45   1st Qu.:19863571        1st Qu.:2.000  
 Median :2017-12-31 03:18:00   Median :27889292        Median :3.000  
 Mean   :2017-10-13 04:34:25   Mean   :28152729        Mean   :2.655  
 3rd Qu.:2019-02-19 03:23:45   3rd Qu.:34369529        3rd Qu.:3.000  
 Max.   :2020-03-31 07:00:00   Max.   :48026130        Max.   :5.000  
                                               

In [5]:
# checking duplicates
nrow(cohort_demo %>% select(anon_id) %>% distinct()) # 30073
nrow(cohort_demo %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct()) # 43524
cohort_demo[duplicated(cohort_demo[, c('anon_id','pat_enc_csn_id_coded')]),]

# cohort_demo %>% filter(anon_id == "JCd49287") # pat_enc_csn_id_coded = 131195706986, 
# cohort_demo <- cohort_demo %>% filter (!(anon_id == "JCd49287" & race == "Unknown"))

anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
<chr>,<dbl>,<int>,<dttm>,<int>,<dbl>,<dttm>,<dbl>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<fct>,<date>,<int>


In [6]:
summary(cohort_demo %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age))
cohort_demo %>% gather(var, value) %>% distinct() %>% count(var) %>% arrange(n)
cohort_demo %>% summarise_each(funs(n_distinct))

      ESI           gender                    race       recent_height_cm
 Min.   :1.000   Female:21997   Asian           : 6906   Min.   : 12.95  
 1st Qu.:2.000   Male  :23989   Black           : 3279   1st Qu.:160.02  
 Median :3.000                  Native American :  194   Median :167.64  
 Mean   :2.655                  Other           :10554   Mean   :167.97  
 3rd Qu.:3.000                  Pacific Islander:  943   3rd Qu.:175.26  
 Max.   :5.000                  Unknown         :  426   Max.   :210.82  
 NA's   :1861                   White           :23684   NA's   :1420    
 recent_weight_kg      age       
 Min.   :  0.10   Min.   :18.00  
 1st Qu.: 60.90   1st Qu.:44.00  
 Median : 73.10   Median :60.00  
 Mean   : 76.61   Mean   :58.28  
 3rd Qu.: 88.00   3rd Qu.:73.00  
 Max.   :320.69   Max.   :90.00  
 NA's   :311                     

“attributes are not identical across measure variables;
they will be dropped”


var,n
<chr>,<int>
gender,2
label,2
ESI,6
race,7
language,55
age,73
insurance,134
recent_date,457
recent_height_cm,790
ed_time_hr,2076


“`summarise_each_()` was deprecated in dplyr 0.7.0.
Please use `across()` instead.
“`funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,recent_date,age
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
31584,45986,2,45451,45986,6,45473,2076,2,7,55,790,3448,134,457,73


### Insurance:

In [12]:
options(repr.matrix.max.rows=135, repr.matrix.max.cols=20)
cohort_demo %>%                 # filter(!is.na(col)) %>% filter out all rows with NAs in col 
                group_by(insurance) %>% count() %>% arrange(desc(n))

insurance,n
<fct>,<int>
MEDICARE,17032
BLUE CROSS,3404
HPSM,3141
,2737
MEDI-CAL,2067
BLUE SHIELD,1900
SANTA CLARA FAMILY HP,1752
UNITED HEALTHCARE,1581
AETNA,1230
HPSM - MCARE ADV,1046


In [14]:
# medi--- and no insurance
cohort_demo %>% filter(str_detect(insurance, "MEDI") | insurance == "") %>% 
                group_by(insurance) %>% count() %>% arrange(-n)

# mcal = medical, mcare = medicare
medis <- c("MEDI", "MCAL", "MCARE")
cohort_demo %>% filter(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "") %>% 
                group_by(insurance) %>% count() %>% arrange(-n)

insurance,n
<fct>,<int>
MEDICARE,17032
,2737
MEDI-CAL,2067
BLUE CROSS MEDI-CAL,419
UHC MEDICARE AARP COMPLETE,388
SNF HOSPICE MEDICARE (HOSP ONLY),324
MEDICARE SENIOR ADVANTAGE - OTHER,193
MEDI-CAL MANAGED CARE,184
MEDICAID,80
MEDI-CAL CCS,31


insurance,n
<fct>,<int>
MEDICARE,17032
,2737
MEDI-CAL,2067
HPSM - MCARE ADV,1046
MCAL HMO - CENTRAL CALIFORNIA ALLIANCE FOR HEALTH,690
ALAMEDA ALLIANCE MCAL MGD CARE,656
BLUE CROSS MEDI-CAL,419
UHC MEDICARE AARP COMPLETE,388
SCFHP - MCARE ADV,368
SNF HOSPICE MEDICARE (HOSP ONLY),324


In [27]:
# turn insurance into medis which has "MEDI" under insurance
# 0 for medis and no insurance, 1 for other insurance
medis <- c("MEDI", "MCAL", "MCARE")

nrow(cohort_demo %>% filter(insurance == "")) # 2737
nrow(cohort_demo %>% filter(str_detect(insurance, "MEDI"))) # 20766
nrow(cohort_demo %>% filter(str_detect(insurance, paste(medis, collapse = "|")))) # 23987

cohort_demo <- cohort_demo %>%
                    mutate(insurance = ifelse(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "", 0, 1)) %>%
                    group_by(anon_id, pat_enc_csn_id_coded) %>%
                    mutate(insurance = sum(insurance)) %>% ungroup() %>% # just to make sure if anyone else has more than 1
                    mutate(insurance = ifelse(insurance>0, 1, insurance)) 

cohort_demo %>% count(insurance)

insurance,n
<dbl>,<int>
0,26724
1,19262


### Language:

In [28]:
cohort_demo %>% group_by(language) %>% count() %>% arrange(desc(n))

language,n
<fct>,<int>
English,38767
Spanish,3688
Mandarin,748
Vietnamese,511
Cantonese,265
Russian,264
Farsi,262
Tongan,219
Tagalog,178
Korean,152


In [36]:
cohort_demo <- cohort_demo %>%
                    mutate(English = ifelse(language == "English", 1, 0)) %>% 
                    select(-c(language))

cohort_demo %>% count(English)
nrow(cohort_demo)
length(unique(cohort_demo$anon_id))
summary(cohort_demo %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age, insurance, English))

English,n
<dbl>,<int>
0,7219
1,38767


      ESI           gender                    race       recent_height_cm
 Min.   :1.000   Female:21997   Asian           : 6906   Min.   : 12.95  
 1st Qu.:2.000   Male  :23989   Black           : 3279   1st Qu.:160.02  
 Median :3.000                  Native American :  194   Median :167.64  
 Mean   :2.655                  Other           :10554   Mean   :167.97  
 3rd Qu.:3.000                  Pacific Islander:  943   3rd Qu.:175.26  
 Max.   :5.000                  Unknown         :  426   Max.   :210.82  
 NA's   :1861                   White           :23684   NA's   :1420    
 recent_weight_kg      age          insurance         English     
 Min.   :  0.10   Min.   :18.00   Min.   :0.0000   Min.   :0.000  
 1st Qu.: 60.90   1st Qu.:44.00   1st Qu.:0.0000   1st Qu.:1.000  
 Median : 73.10   Median :60.00   Median :0.0000   Median :1.000  
 Mean   : 76.61   Mean   :58.28   Mean   :0.4189   Mean   :0.843  
 3rd Qu.: 88.00   3rd Qu.:73.00   3rd Qu.:1.0000   3rd Qu.:1.000  
 Max. 

In [30]:
head(cohort_demo, n=1)
colnames(cohort_demo)

anon_id,pat_enc_csn_id_coded,label,admit_time,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,recent_height_cm,recent_weight_kg,insurance,recent_date,age,pub_insurance,English
<chr>,<dbl>,<int>,<dttm>,<int>,<int>,<dttm>,<dbl>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<date>,<int>,<dbl>,<dbl>
JCe78a06,131062667066,0,2015-01-02 01:01:00,13616753,,2015-01-01 17:10:00,7.85,Female,White,152.4,55.6,1,2020-01-10,83,0,1


In [31]:
# cohort_demo includes cohort, just more variables
write.csv(cohort_demo, file = file.path(datadir, "cohort_demo.csv"), row.names=FALSE) 