## Descriptions:
Process 2 variables from demograhics table
- Process language --> English: yes/no
- Process insurance (medicare/caid/cal/mcal/mcare/na) --> yes/no

**Inputs**: 
- `6_2_cohort`: updated 6_1_cohort_validation with inc/exc criteria, which also has demographics information

**Outputs**: 
- `6_4_coh2_demo`

### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)
# library(mice)
# library(VIM) # for missing data plot

# library(data.table)
# library(Matrix)
# library(caret) # import this before glmnet to avoid rlang version problem
# library(glmnet)
# library(bit64)

# library(slam)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




### Demographic variables
- Age (only >=18): done in in/ex criteria
- insurance = Medical/care/caid and n/a insurance --> medis =1 otherwise 0
- language --> Engl = 1 for English, otherwise 0
- leave height and weight to be processed with flowsheet age (>=18 only)

In [3]:
# datadir = "../../DataTD"
valdir = "../../OutputTD/6_validation"
# featuredir = "../../OutputTD/2_features"

cohort <- read.csv(file.path(valdir, "6_2_cohort.csv"))
nrow(cohort) #45794
summary(cohort) 

   anon_id          pat_enc_csn_id_coded admit_time_jittered     label       
 Length:17131       Min.   :1.313e+11    Length:17131        Min.   :0.0000  
 Class :character   1st Qu.:1.313e+11    Class :character    1st Qu.:0.0000  
 Mode  :character   Median :1.313e+11    Mode  :character    Median :0.0000  
                    Mean   :1.313e+11                        Mean   :0.1388  
                    3rd Qu.:1.313e+11                        3rd Qu.:0.0000  
                    Max.   :1.313e+11                        Max.   :1.0000  
                                                                             
  admit_time           adm_year      adm_month      inpatient_data_id_coded
 Length:17131       Min.   :2020   Min.   : 1.000   Min.   :51823319       
 Class :character   1st Qu.:2020   1st Qu.: 4.000   1st Qu.:57106984       
 Mode  :character   Median :2021   Median : 7.000   Median :62110309       
                    Mean   :2021   Mean   : 6.612   Mean   :62392235    

In [4]:
head(cohort, n=1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,hosp_admsn_time,ed_time_hr,gender,race,language,recent_height_cm,recent_weight_kg,insurance,age
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<int>
1,JC1000116,131295313275,2020-09-29 22:45:00+00:00,0,2020-09-29 22:45:00,2020,9,57868578,3,2020-09-29 16:02:00,6.716667,Female,Other,Spanish,154,73.05,HPSM,44


In [5]:
cohort <- cohort %>% select(-hosp_admsn_time, -ed_time_hr) %>% # hosp_admsn_time = ymd_hms(hosp_admsn_time)
                mutate(admit_time = ymd_hms(admit_time), # recent_date = ymd(recent_date), 
                       gender = factor(gender), race = factor(race), language = factor(language), insurance = factor(insurance)) 
                
nrow(cohort)
summary(cohort)

   anon_id          pat_enc_csn_id_coded admit_time_jittered     label       
 Length:17131       Min.   :1.313e+11    Length:17131        Min.   :0.0000  
 Class :character   1st Qu.:1.313e+11    Class :character    1st Qu.:0.0000  
 Mode  :character   Median :1.313e+11    Mode  :character    Median :0.0000  
                    Mean   :1.313e+11                        Mean   :0.1388  
                    3rd Qu.:1.313e+11                        3rd Qu.:0.0000  
                    Max.   :1.313e+11                        Max.   :1.0000  
                                                                             
   admit_time                     adm_year      adm_month     
 Min.   :2020-04-01 00:45:00   Min.   :2020   Min.   : 1.000  
 1st Qu.:2020-09-04 19:37:00   1st Qu.:2020   1st Qu.: 4.000  
 Median :2021-01-22 21:46:00   Median :2021   Median : 7.000  
 Mean   :2021-01-17 13:46:14   Mean   :2021   Mean   : 6.612  
 3rd Qu.:2021-06-07 08:53:30   3rd Qu.:2021   3rd Qu.: 9.000 

In [6]:
# checking duplicates
nrow(cohort %>% select(anon_id) %>% distinct()) # 31511
nrow(cohort %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct()) # 45794
cohort[duplicated(cohort[, c('anon_id','pat_enc_csn_id_coded')]),]

# cohort %>% filter(anon_id == "JCd49287") # pat_enc_csn_id_coded = 131195706986, 
# cohort <- cohort %>% filter (!(anon_id == "JCd49287" & race == "Unknown"))

anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,gender,race,language,recent_height_cm,recent_weight_kg,insurance,age
<chr>,<dbl>,<chr>,<int>,<dttm>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<fct>,<int>


In [7]:
summary(cohort %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age))
cohort %>% gather(var, value) %>% distinct() %>% count(var) %>% arrange(n)
cohort %>% summarise_each(funs(n_distinct))

      ESI          gender                   race      recent_height_cm
 Min.   :1.00   Female:8058   Asian           :2788   Min.   :113.0   
 1st Qu.:2.00   Male  :9073   Black           :1109   1st Qu.:160.0   
 Median :3.00                 Native American :  64   Median :167.6   
 Mean   :2.55                 Other           :4522   Mean   :168.0   
 3rd Qu.:3.00                 Pacific Islander: 366   3rd Qu.:175.3   
 Max.   :5.00                 Unknown         :  87   Max.   :213.4   
 NA's   :98                   White           :8195   NA's   :387     
 recent_weight_kg      age        
 Min.   : 26.76   Min.   : 18.00  
 1st Qu.: 61.23   1st Qu.: 46.00  
 Median : 74.00   Median : 62.00  
 Mean   : 77.21   Mean   : 59.72  
 3rd Qu.: 88.50   3rd Qu.: 74.00  
 Max.   :342.46   Max.   :107.00  
 NA's   :30                       

“attributes are not identical across measure variables;
they will be dropped”


var,n
<chr>,<int>
adm_year,2
gender,2
label,2
ESI,6
race,7
adm_month,12
language,47
age,87
insurance,91
recent_height_cm,573


“`summarise_each_()` was deprecated in dplyr 0.7.0.
Please use `across()` instead.
“`funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,gender,race,language,recent_height_cm,recent_weight_kg,insurance,age
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
12993,17131,16867,2,16867,2,12,17131,6,2,7,47,573,2483,91,87


### Insurance:

In [8]:
options(repr.matrix.max.rows=135, repr.matrix.max.cols=20)
cohort %>%                 # filter(!is.na(col)) %>% filter out all rows with NAs in col 
                group_by(insurance) %>% count() %>% arrange(desc(n))

insurance,n
<fct>,<int>
MEDICARE,6059
HPSM,1280
BLUE CROSS,1239
,986
MEDI-CAL,737
BLUE SHIELD,690
SANTA CLARA FAMILY HP,666
UNITED HEALTHCARE,585
AETNA,493
CIGNA,402


In [9]:
# check for medicare/ medical/ no insurance
# cohort %>% filter(str_detect(insurance, "MEDI") | insurance == "") %>% 
#                 group_by(insurance) %>% count() %>% arrange(-n)

# mcal = medical, mcare = medicare
medis <- c("MEDI", "MCAL", "MCARE")
cohort %>% filter(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "") %>% 
                group_by(insurance) %>% count() %>% arrange(-n)

insurance,n
<fct>,<int>
MEDICARE,6059
,986
MEDI-CAL,737
HPSM - MCARE ADV,362
ALAMEDA ALLIANCE MCAL MGD CARE,354
MCAL HMO - CENTRAL CALIFORNIA ALLIANCE FOR HEALTH,340
UHC MEDICARE AARP COMPLETE,186
BLUE CROSS MEDI-CAL,178
SCFHP - MCARE ADV,146
MEDICARE SENIOR ADVANTAGE - OTHER,90


In [10]:
# turn insurance into medis which has "MEDI" under insurance
# insurance = 0 if patient has medis or no insurance, 1 for other insurance
medis <- c("MEDI", "MCAL", "MCARE")

nrow(cohort %>% filter(insurance == "")) # 2737
nrow(cohort %>% filter(str_detect(insurance, "MEDI"))) # 20766
nrow(cohort %>% filter(str_detect(insurance, paste(medis, collapse = "|")))) # 23987

cohort_demo <- cohort %>%
                    mutate(insurance = ifelse(str_detect(insurance, paste(medis, collapse = "|")) | insurance == "", 0, 1)) %>%
                    group_by(anon_id, pat_enc_csn_id_coded) %>%
                    mutate(insurance = sum(insurance)) %>% ungroup() %>% # just to make sure if anyone else has more than 1
                    mutate(insurance = ifelse(insurance>0, 1, insurance)) 

cohort_demo %>% count(insurance)

insurance,n
<dbl>,<int>
0,9812
1,7319


### Language:

In [11]:
cohort_demo %>% group_by(language) %>% count() %>% arrange(desc(n))

language,n
<fct>,<int>
English,13964
Spanish,1712
Mandarin,264
Vietnamese,250
Cantonese,120
Russian,98
Tongan,89
Farsi,85
Tagalog,83
Dari,63


In [12]:
cohort_demo <- cohort_demo %>%
                    mutate(English = ifelse(language == "English", 1, 0)) %>% 
                    select(-c(language))

cohort_demo %>% count(English)
nrow(cohort_demo)
length(unique(cohort_demo$anon_id))
summary(cohort_demo %>% select(ESI, gender, race, recent_height_cm, recent_weight_kg, age, insurance, English))

English,n
<dbl>,<int>
0,3167
1,13964


      ESI          gender                   race      recent_height_cm
 Min.   :1.00   Female:8058   Asian           :2788   Min.   :113.0   
 1st Qu.:2.00   Male  :9073   Black           :1109   1st Qu.:160.0   
 Median :3.00                 Native American :  64   Median :167.6   
 Mean   :2.55                 Other           :4522   Mean   :168.0   
 3rd Qu.:3.00                 Pacific Islander: 366   3rd Qu.:175.3   
 Max.   :5.00                 Unknown         :  87   Max.   :213.4   
 NA's   :98                   White           :8195   NA's   :387     
 recent_weight_kg      age           insurance         English      
 Min.   : 26.76   Min.   : 18.00   Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 61.23   1st Qu.: 46.00   1st Qu.:0.0000   1st Qu.:1.0000  
 Median : 74.00   Median : 62.00   Median :0.0000   Median :1.0000  
 Mean   : 77.21   Mean   : 59.72   Mean   :0.4272   Mean   :0.8151  
 3rd Qu.: 88.50   3rd Qu.: 74.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :342.46   

In [13]:
head(cohort_demo, n=1)
colnames(cohort_demo)

anon_id,pat_enc_csn_id_coded,admit_time_jittered,label,admit_time,adm_year,adm_month,inpatient_data_id_coded,ESI,gender,race,recent_height_cm,recent_weight_kg,insurance,age,English
<chr>,<dbl>,<chr>,<int>,<dttm>,<int>,<int>,<int>,<int>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
JC1000116,131295313275,2020-09-29 22:45:00+00:00,0,2020-09-29 22:45:00,2020,9,57868578,3,Female,Other,154,73.05,1,44,0


In [16]:
# save file cohort_demo
write.csv(cohort_demo, file = file.path(valdir, "6_3_coh2_demo.csv"), row.names=FALSE) 