## Descriptions:
- Query other tables on shc
- Some bigger queries cannot be done in this notebook. use separate SQL file in python instead

**Inputs**:  
- `1_4_cohort_diff_full_features`: contains cohort with most difference pdiff >= 0.3
    - 24hrpreadmit: 324 total
    - all: 318 total
    
**Outputs**: 


### Importing R libraries

In [22]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)

# options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

### Set up and run queries

In [2]:
# CREDENTIALS depending on LOCATIONS:
# credential <- paste0("/home/", "minh084", "/.config/gcloud/application_default_credentials.json")

# local computer
# credential <- "C:/Users/User/AppData/Roaming/gcloud/application_default_credentials.json"

# Nero onprem
# credential <- "/home/minh084/.config/gcloud/application_default_credentials.json"

# Nero gcp notebook
credential <- "/home/jupyter/.config/gcloud/application_default_credentials.json"

project_id <- "som-nero-phi-jonc101"

Sys.setenv(GOOGLE_APPLICATION_CREDENTIALS = credential)
Sys.setenv(GCLOUD_PROJECT = project_id)
gargle::credentials_app_default()

NULL

In [3]:
library(DBI)
con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = "shc_core" #, billing = project_id
)
con 
dbListTables(con)

<BigQueryConnection>
  Dataset: som-nero-phi-jonc101.shc_core
  Billing: som-nero-phi-jonc101

In [4]:
# directories
datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"
# modeldir4 = "../../OutputTD/3_models/1_4_cohort"
modeldir4preadmit = "../../OutputTD/3_models/1_4_cohort_24hrpreadmit"

options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

In [5]:
cohort <- read.csv(file.path(modeldir4preadmit, "1_4_cohort_diff_full_features.csv"))

nrow(cohort) # 69334
nrow(cohort %>% select(anon_id) %>% distinct()) # 304
nrow(cohort %>% select(pat_enc_csn_id_coded) %>% distinct()) # 324

# count anon_id and csn
cohort %>% group_by(abs_diff0_24) %>% 
            summarise(nrows = n(), count_csn = n_distinct(pat_enc_csn_id_coded), count_mrn = n_distinct(anon_id)) %>% 
            arrange(-abs_diff0_24) %>% mutate(cum_csn = cumsum(count_csn))

abs_diff0_24,nrows,count_csn,count_mrn,cum_csn
<dbl>,<int>,<int>,<int>,<int>
0.7,631,4,4,4
0.6,2218,10,10,14
0.5,6379,23,21,37
0.4,17906,83,82,120
0.3,42200,204,203,324


In [6]:
head(cohort, 1)
colnames(cohort)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,abs_diff0_24,diff0_True,diff24_True,feature_type,features,values,time,hr_before_admit
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>
1,JCcd3b9d,131267453587,2019-04-06 23:15:00,1,1,1,0.5606636,0.2180124,0.3,-0.44,-0.78,Procedures,LIMITED ULTRASOUND- CARDIAC TRANSTHORACIC ECHO,1,,


### ADT table

In [20]:
q = "
SELECT c.anon_id, c.pat_enc_csn_id_coded, c.admit_time, c.first_label, c.death_24hr_recent_label,
        c.death_24hr_max_label, c.pred_first, c.pred_death_24hr_recent,
    adt.event_type_c, adt.event_type, adt.pat_class_c, adt.pat_class, adt.base_pat_class_c, adt.base_pat_class,
    adt.pat_service_c, adt.pat_service, adt.pat_lvl_of_care_c, adt.pat_lv_of_care, adt.accommodation_c, adt.accomodation, 
    adt.in_event_type_c, adt.in_event_type, adt.out_event_type_c, adt.out_event_type, adt.from_base_class_c, adt.from_base_class,
    adt.to_base_class_c, adt.to_base_class, adt.seq_num_in_enc, adt.seq_num_in_bed_min
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.adt` as adt
ON 
    (c.anon_id = adt.anon_id and c.pat_enc_csn_id_coded = adt.pat_enc_csn_id_coded)
"
# submit the query:
update.packages('Rcpp')
df <- dbGetQuery(con, q)
dim(df)
colnames(df)

“NAs produced by integer overflow”


In [21]:
write.csv(df, file.path(datadir, "4_1_diff_adt.csv"), row.names=FALSE)

In [23]:
q = "
SELECT c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [24]:
write.csv(x1, file.path(datadir, "4_1_diff_demographic.csv"), row.names=FALSE)

In [41]:
# this doesn't seem right, redo with a separate SQL file as it is too large to be in R notebook
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.line, x.dx_name, x.chronic, x.principal, x.hospital_pl, x.ed, x.present_on_adm
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE (x.primary = 'Y')
"
# submit the query:
update.packages('Rcpp')
x2 <- dbGetQuery(con, q)
dim(x2)
colnames(x2)

Auto-refreshing stale OAuth token.

“NAs produced by integer overflow”


In [42]:
write.csv(x2, file.path(datadir, "4_1_diff_dx_primary.csv"), row.names=FALSE)

### Read the queried file back

In [10]:
adt <- read.csv(file.path(datadir, "4_1_diff_adt.csv"))
dim(adt) # should have 1385721
colnames(adt)

In [12]:
head(adt, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,event_type_c,event_type,pat_class_c,pat_class,base_pat_class_c,base_pat_class,pat_service_c,pat_service,pat_lvl_of_care_c,pat_lv_of_care,accommodation_c,accomodation,in_event_type_c,in_event_type,out_event_type_c,out_event_type,from_base_class_c,from_base_class,to_base_class_c,to_base_class,seq_num_in_enc,seq_num_in_bed_min
Unnamed: 0_level_1,<chr>,<lgl>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>
1,JC2a1bd68,,2019-12-24 23:20:00,1,1,1,0.5241561,0.2500841,1,Admission,112,Emergency Services,3,Emergency,100,Emergency,,,10001,Ward,1,Admission,,,0,,3,Emergency,1,1


In [None]:
cohort %>% distinct(pat_enc_csn_id_coded, first_label, death_24hr_recent_label) %>% count(first_label, death_24hr_recent_label, sort=T)
adt %>% count(event_type, event_type_c, sort=TRUE)
adt %>% count(pat_class, pat_class_c, sort=TRUE)
adt %>% count(base_pat_class, base_pat_class_c, sort=TRUE)
adt %>% count(pat_service, pat_service_c, sort=TRUE)
adt %>% count(pat_lv_of_care, pat_lvl_of_care_c, sort=TRUE)
adt %>% count(accomodation, accommodation_c, sort=TRUE)
adt %>% count(in_event_type, in_event_type_c, sort=TRUE)
adt %>% count(out_event_type, out_event_type_c, sort=TRUE)
adt %>% count(from_base_class, from_base_class_c, sort=TRUE)
adt %>% count(to_base_class, to_base_class_c, sort=TRUE)
adt %>% count(seq_num_in_enc, sort=TRUE)
adt %>% count(seq_num_in_bed_min, sort=TRUE)

first_label,death_24hr_recent_label,n
<int>,<int>,<int>
1,1,119
1,0,98
0,0,90
0,1,17


event_type,event_type_c,n
<chr>,<int>,<int>
Census,6,419221
Transfer In,3,305337
Transfer Out,4,305337
Patient Update,5,218436
Admission,1,69334
Discharge,2,68056


pat_class,pat_class_c,n
<chr>,<int>,<int>
Inpatient,126,1182999
Emergency Services,112,200344
Observation,128,1727
OP Surgery/Procedure,122,651


base_pat_class,base_pat_class_c,n
<chr>,<int>,<int>
,,1246039
Emergency,3.0,69334
Inpatient,1.0,69334
Outpatient,2.0,1014


pat_service,pat_service_c,n
<chr>,<int>,<int>
Critical Care,151.0,283991
Medicine,39.0,191213
Emergency,100.0,141033
Emergency Medicine,187.0,120906
General Medicine (University),153.0,87235
Trauma,72.0,86612
General Surgery,59.0,81189
Cardiology,155.0,61527
General Medicine (PAMF),154.0,30827
Neurosurgery,62.0,30067


pat_lv_of_care,pat_lvl_of_care_c,n
<chr>,<int>,<int>
Acute Care (Assessment or intervention q4-8),5.0,561858
Critical Care,8.0,404635
IICU/Intermediate Care (Assessment or intervention q2-4),9.0,218827
,,200282
Newborn Nursery - VC Only,68.0,119


accomodation,accommodation_c,n
<chr>,<int>,<int>
Ward,10001,563973
Private,1,556328
Semi-Private,2,265420


in_event_type,in_event_type_c,n
<chr>,<int>,<int>
Census,6.0,419221
,,373393
Transfer In,3.0,305085
Patient Update,5.0,148340
Admission,1.0,139682


out_event_type,out_event_type_c,n
<chr>,<int>,<int>
Census,6.0,419221
,,374671
Transfer Out,4.0,305085
Patient Update,5.0,148340
Discharge,2.0,138404


from_base_class,from_base_class_c,n
<chr>,<int>,<int>
Inpatient,1,1114462
Emergency,3,200344
,0,69334
,4,930
Outpatient,2,651


to_base_class,to_base_class_c,n
<chr>,<int>,<int>
Inpatient,1,1115775
Emergency,3,200092
,0,68056
,4,930
Outpatient,2,868


seq_num_in_enc,n
<int>,<int>
1,69334
2,69334
3,69334
4,69334
5,69334
6,68365
7,67843
8,67487
9,66696
10,64255


seq_num_in_bed_min,n
<int>,<int>
1,1369116
2,16292
3,313


In [23]:
demo <- read.csv(file.path(datadir, "4_1_diff_demographic.csv"))
dim(demo) # should have 69334
colnames(demo)

In [24]:
head(demo,1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,INTRPTR_NEEDED_YN,CHARLSON_SCORE,N_HOSPITALIZATIONS,DAYS_IN_HOSPITAL
Unnamed: 0_level_1,<chr>,<lgl>,<lgl>,<int>,<int>,<int>
1,JCd5f913,,False,11,14,96


In [40]:
dx <- read.csv(file.path(datadir, "4_1_diff_dx_all.csv"))
dim(dx) # should have 67800
colnames(dx)

In [39]:
head(dx_pri)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,line,dx_name,chronic,principal,hospital_pl,ed,present_on_adm
Unnamed: 0_level_1,<chr>,<lgl>,<int>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<lgl>
1,JCd1d133,,2,Myxedema coma (CMS-HCC),N,,,N,
2,JCe97743,,1,Acute kidney injury (nontraumatic) (CMS-HCC),N,,,N,
3,JCe4a606,,21,"Acute respiratory failure, unspecified whether with hypoxia or hypercapnia (CMS-HCC)",N,,,N,
4,JCd6b858,,2,"Atrial tachycardia, paroxysmal (CMS-HCC)",N,,,N,
5,JCdb215e,,1,Diabetic ketoacidosis without coma associated with diabetes mellitus due to underlying condition (CMS-HCC),N,,,Y,
6,JCe4cc9a,,1,"Fx humeral neck, left, closed, initial encounter",N,,,Y,


In [38]:
coh_dxpri <- cohort %>% filter(abs_diff0_24 >= 0.3) %>% left_join(dx_pri)
nrow(coh_dxpri)
length(unique(coh_dxpri$pat_enc_csn_id_coded))
length(unique(dx_pri$pat_enc_csn_id_coded))
length(setdiff(cohort$pat_enc_csn_id_coded, dx_pri$pat_enc_csn_id_coded))


Joining, by = c("anon_id", "pat_enc_csn_id_coded")

