## Descriptions:
- Query other tables on shc
- Some bigger queries cannot be done in this notebook. use separate SQL file in python instead
- Check the characteristics of the transfer cohort, compared to the full cohort

**Inputs**:  
- `1_4_cohort_diff_full_features`: contains cohort with most difference pdiff >= 0.3
    - 24hrpreadmit: 324 total
    - all: 318 total
    
**Outputs**: 


### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)

# options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

"package 'bigrquery' was built under R version 4.0.5"
-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.2     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.4     [32mv[39m [34mdplyr  [39m 1.0.2
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.0

-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union




### Set up and run queries

In [2]:
# CREDENTIALS depending on LOCATIONS:
# credential <- paste0("/home/", "minh084", "/.config/gcloud/application_default_credentials.json")

# local computer
credential <- "C:/Users/User/AppData/Roaming/gcloud/application_default_credentials.json"

# Nero onprem
# credential <- "/home/minh084/.config/gcloud/application_default_credentials.json"

# Nero gcp notebook
# credential <- "/home/jupyter/.config/gcloud/application_default_credentials.json"

project_id <- "som-nero-phi-jonc101"

Sys.setenv(GOOGLE_APPLICATION_CREDENTIALS = credential)
Sys.setenv(GCLOUD_PROJECT = project_id)
gargle::credentials_app_default()

NULL

In [3]:
library(DBI)
con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = "shc_core" #, billing = project_id
)
con 
dbListTables(con)

<BigQueryConnection>
  Dataset: som-nero-phi-jonc101.shc_core
  Billing: som-nero-phi-jonc101

In [4]:
# directories
datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"
modeldir4 = "../../OutputTD/3_models/1_4_cohort"
modeldir4preadmit = "../../OutputTD/3_models/1_4_cohort_24hrpreadmit"

options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

In [5]:
cohort <- read.csv(file.path(cohortdir, "1_4_cohort.csv")) %>% filter(first_label != death_24hr_max_label)

nrow(cohort) # 1120
nrow(cohort %>% select(anon_id) %>% distinct()) # 1085
nrow(cohort %>% select(pat_enc_csn_id_coded) %>% distinct()) # 1123

In [6]:
head(cohort, 1)
colnames(cohort)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>
1,JCdf010a,131282861801,45762025,2020-03-04 03:38:00+00:00,1,1,0,1,0,1,1,0,0,1,0,1,0


### ADT table

In [25]:
# full cohort
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [26]:
x1 %>% count(INTRPTR_NEEDED_YN) %>% mutate(perc = 100* n/nrow(x1))

INTRPTR_NEEDED_YN,n,perc
<lgl>,<int>,<dbl>
False,37817,85.98681219
True,6121,13.91768986
,42,0.09549795


In [27]:
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
WHERE
    c.first_label != c.death_24hr_recent_label
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [28]:
x1 %>% count(INTRPTR_NEEDED_YN) %>% mutate(perc = 100* n/nrow(x1))

INTRPTR_NEEDED_YN,n,perc
<lgl>,<int>,<dbl>
False,2166,86.67466987
True,332,13.28531413
,1,0.04001601


In [39]:
# full cohort
q = "
SELECT x.dx_name, count(x.dx_name) AS n
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

GROUP by x.dx_name
"

# submit the query:
update.packages('Rcpp')
x2 <- dbGetQuery(con, q)
dim(x2)
colnames(x2)

In [40]:
x2 %>% mutate(perc = round(100*n/nrow(x2), 2)) %>% arrange(-n) %>% head(20)

dx_name,n,perc
<chr>,<int>,<dbl>
Other long term (current) drug therapy,12410,26.27
"Hyperlipidemia, unspecified",11553,24.45
Essential (primary) hypertension,10717,22.68
Personal history of nicotine dependence,9969,21.1
Shortness of breath,8009,16.95
Long term (current) use of aspirin,7188,15.21
"Acute kidney failure, unspecified",6823,14.44
Gastro-esophageal reflux disease without esophagitis,6733,14.25
"Major depressive disorder, single episode, unspecified",6174,13.07
Long term (current) use of anticoagulants,5641,11.94


In [41]:
# transfer cohort
q = "
SELECT x.dx_name, count(x.dx_name) AS n
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE
    c.first_label != c.death_24hr_recent_label

GROUP by x.dx_name
"

# submit the query:
update.packages('Rcpp')
x2 <- dbGetQuery(con, q)
dim(x2)
colnames(x2)

In [42]:
x2 %>% mutate(perc = round(100*n/nrow(x2), 2)) %>% arrange(-n) %>% head(20)

dx_name,n,perc
<chr>,<int>,<dbl>
"Hyperlipidemia, unspecified",795,6.8
Other long term (current) drug therapy,723,6.18
Essential (primary) hypertension,690,5.9
Personal history of nicotine dependence,598,5.12
Acidosis,549,4.7
Long term (current) use of aspirin,528,4.52
Shortness of breath,522,4.46
Other nonspecific abnormal finding of lung field,512,4.38
"Acute kidney failure, unspecified",501,4.29
Atherosclerotic heart disease of native coronary artery without angina pectoris,431,3.69


In [43]:
write.csv(x2, file.path(datadir, "4_1_tx_alldx.csv"), row.names=FALSE)

In [45]:
# full cohort
q = "
SELECT x.dx_name, count(x.dx_name) AS n
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE (x.ed = 'Y')

GROUP by x.dx_name
"

# submit the query:
update.packages('Rcpp')
x3 <- dbGetQuery(con, q)
dim(x3)
colnames(x3)

In [46]:
x3 %>% mutate(perc = round(100*n/nrow(x3), 2)) %>% arrange(-n) %>% head(20)

dx_name,n,perc
<chr>,<int>,<dbl>
Hyponatremia,697,8.29
Suicidal ideation,640,7.61
Pneumonia due to organism,611,7.27
"Sepsis, due to unspecified organism",579,6.88
Acute kidney injury (nontraumatic) (CMS-HCC),565,6.72
Shortness of breath,549,6.53
Hypoxia,473,5.62
Acute chest pain,438,5.21
"Altered mental status, unspecified altered mental status type",416,4.95
Elevated troponin,390,4.64


In [None]:
# transfers cohort testing
q = "
SELECT distinct c.pat_enc_csn_id_coded, x.dx_name, x.primary, x.ed
FROM `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)
WHERE (c.first_label != c.death_24hr_recent_label and x.ed = 'Y')
"

# submit the query:
update.packages('Rcpp')
x3 <- dbGetQuery(con, q)
dim(x3)
colnames(x3)

In [None]:
# transfer cohort
q = "
SELECT x.dx_name, count(x.dx_name) AS n
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE (c.first_label != c.death_24hr_recent_label and x.ed = 'Y')

GROUP by x.dx_name
"

# submit the query:
update.packages('Rcpp')
x3 <- dbGetQuery(con, q)
dim(x3)
colnames(x3)

In [6]:
update.packages('Rcpp')


In [48]:
x3 %>% mutate(perc = round(100*n/nrow(x3), 2)) %>% arrange(-n) %>% head(20)

dx_name,n,perc
<chr>,<int>,<dbl>
"Sepsis, due to unspecified organism",55,4.5
Severe sepsis (CMS-HCC),50,4.09
Hyperkalemia,48,3.93
Hyponatremia,48,3.93
Pneumonia due to organism,47,3.85
Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),43,3.52
Septic shock (CMS-HCC),42,3.44
Hypoxia,39,3.19
Subdural hematoma (CMS-HCC),38,3.11
Acute kidney injury (nontraumatic) (CMS-HCC),38,3.11


In [49]:
# full cohort
q = "
SELECT x.dx_name, count(x.dx_name) AS n
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE (x.ed = 'Y' and primary='Y')

GROUP by x.dx_name
"

# submit the query:
update.packages('Rcpp')
x4 <- dbGetQuery(con, q)
dim(x4)
colnames(x4)

In [50]:
x4 %>% mutate(perc = round(100*n/nrow(x4), 2)) %>% arrange(-n) %>% head(20)

dx_name,n,perc
<chr>,<int>,<dbl>
Suicidal ideation,407,8.79
Pneumonia due to organism,302,6.52
"Sepsis, due to unspecified organism",277,5.98
Hyponatremia,274,5.92
Non-ST elevation myocardial infarction (NSTEMI) (CMS-HCC),227,4.9
SBO (small bowel obstruction) (CMS-HCC),222,4.79
Acute chest pain,211,4.56
Small bowel obstruction (CMS-HCC),205,4.43
Neutropenic fever (CMS-HCC),194,4.19
Acute GI bleeding,190,4.1


In [51]:
# transfer cohort
q = "
SELECT x.dx_name, count(x.dx_name) AS n
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE (c.first_label != c.death_24hr_recent_label and x.ed = 'Y' and primary='Y')

GROUP by x.dx_name
"

# submit the query:
update.packages('Rcpp')
x4 <- dbGetQuery(con, q)
dim(x4)
colnames(x4)

In [52]:
x4 %>% mutate(perc = round(100*n/nrow(x4), 2)) %>% arrange(-n) %>% head(20)

dx_name,n,perc
<chr>,<int>,<dbl>
Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),37,6.99
Septic shock (CMS-HCC),27,5.1
"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",27,5.1
"Sepsis, due to unspecified organism",26,4.91
Hyperkalemia,25,4.73
Severe sepsis (CMS-HCC),25,4.73
Subdural hematoma (CMS-HCC),24,4.54
Pneumonia due to organism,24,4.54
Hyponatremia,20,3.78
Acute GI bleeding,19,3.59
