## Descriptions:
- Query other tables on shc
- Some bigger queries cannot be done in this notebook. use separate SQL file in python instead
- Check the characteristics of the transfer cohort, compared to the full cohort

**Inputs**:  1120 total transfers
    
**Outputs**: 


### Importing R libraries

In [3]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)

# options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

### Set up and run queries

In [2]:
# CREDENTIALS depending on LOCATIONS:
# credential <- paste0("/home/", "minh084", "/.config/gcloud/application_default_credentials.json")

# local computer
# credential <- "C:/Users/User/AppData/Roaming/gcloud/application_default_credentials.json"

# Nero onprem
# credential <- "/home/minh084/.config/gcloud/application_default_credentials.json"

# Nero gcp notebook
credential <- "/home/jupyter/.config/gcloud/application_default_credentials.json"

project_id <- "som-nero-phi-jonc101"

Sys.setenv(GOOGLE_APPLICATION_CREDENTIALS = credential)
Sys.setenv(GCLOUD_PROJECT = project_id)
gargle::credentials_app_default()

NULL

In [3]:
library(DBI)
con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = "shc_core" #, billing = project_id
)
con 
dbListTables(con)

<BigQueryConnection>
  Dataset: som-nero-phi-jonc101.shc_core
  Billing: som-nero-phi-jonc101

In [4]:
# directories
datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"
modeldir4 = "../../OutputTD/3_models/1_4_cohort"
# modeldir4preadmit = "../../OutputTD/3_models/1_4_cohort_24hrpreadmit"

options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

In [5]:
# cohort transfers
cohort <- read.csv(file.path(cohortdir, "1_4_cohort.csv")) %>% filter(first_label != death_24hr_max_label)

nrow(cohort) # 1120
nrow(cohort %>% select(anon_id) %>% distinct()) # 1085
nrow(cohort %>% select(pat_enc_csn_id_coded) %>% distinct()) # 1123

In [6]:
head(cohort, 1)
colnames(cohort)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>
1,JCdf010a,131282861801,45762025,2020-03-04 03:38:00+00:00,1,1,0,1,0,1,1,0,0,1,0,1,0


### ADT table

In [25]:
# full cohort
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [26]:
x1 %>% count(INTRPTR_NEEDED_YN) %>% mutate(perc = 100* n/nrow(x1))

INTRPTR_NEEDED_YN,n,perc
<lgl>,<int>,<dbl>
False,37817,85.98681219
True,6121,13.91768986
,42,0.09549795


In [26]:
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
WHERE
    c.first_label != c.death_24hr_recent_label
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [28]:
x1 %>% count(INTRPTR_NEEDED_YN) %>% mutate(perc = 100* n/nrow(x1))

INTRPTR_NEEDED_YN,n,perc
<lgl>,<int>,<dbl>
False,2166,86.67466987
True,332,13.28531413
,1,0.04001601


In [6]:
# all dx from 1_4_cohort, SQL query
dx <- read.csv(file.path(datadir, "4_1_dx_all.csv"))
nrow(dx)
length(unique(dx$pat_enc_csn_id_coded)) # full cohort is 43980

In [7]:
head(dx)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,JCd61259,131279859716,35,Presence of prosthetic heart valve,,,,,,Exempt from POA reporting
2,JCd679d9,131064901435,34,Postsurgical percutaneous transluminal coronary angioplasty status,,,,,,Exempt from POA reporting
3,JCd58338,131243342034,30,Failed kidney transplant,N,N,,,N,
4,JCd39c23,131240769940,40,Other disorders of bilirubin metabolism,,,,,,No
5,JCde0d0a,131190623142,36,Other artificial openings of urinary tract status,,,,,,Exempt from POA reporting
6,JCdda759,131219479387,30,Long term (current) use of oral hypoglycemic drugs,,,,,,Exempt from POA reporting


In [8]:
dx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/43932, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Other long term (current) drug therapy,12409,28.25
2,"Hyperlipidemia, unspecified",11532,26.25
3,Essential (primary) hypertension,10691,24.34
4,Personal history of nicotine dependence,9969,22.69
5,Long term (current) use of aspirin,7188,16.36
6,Gastro-esophageal reflux disease without esophagitis,6725,15.31
7,"Acute kidney failure, unspecified",6538,14.88
8,"Major depressive disorder, single episode, unspecified",5719,13.02
9,Long term (current) use of anticoagulants,5456,12.42
10,Atherosclerotic heart disease of native coronary artery without angina pectoris,5429,12.36


In [9]:
# dx present in the ED and identified as the primary problems
dx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/43932, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Suicidal ideation,407,0.93
2,Pneumonia due to organism,302,0.69
3,"Sepsis, due to unspecified organism",277,0.63
4,Hyponatremia,274,0.62
5,Non-ST elevation myocardial infarction (NSTEMI) (CMS-HCC),227,0.52
6,SBO (small bowel obstruction) (CMS-HCC),222,0.51
7,Acute chest pain,211,0.48
8,Small bowel obstruction (CMS-HCC),205,0.47
9,Neutropenic fever (CMS-HCC),194,0.44
10,Acute GI bleeding,190,0.43


In [14]:
1085/1120

In [12]:
# cohort transfers with diagnoses
dx_tx <- left_join(cohort, dx)
nrow(dx_tx)
length(unique(dx_tx$pat_enc_csn_id_coded)) #1120
length(unique(dx_tx$anon_id)) # 1085
dx_tx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/1120, 2)) %>% arrange(-n) %>% head(20)

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,"Hyperlipidemia, unspecified",348,31.07
2,Other long term (current) drug therapy,335,29.91
3,Personal history of nicotine dependence,306,27.32
4,Acidosis,291,25.98
5,"Acute kidney failure, unspecified",288,25.71
6,Essential (primary) hypertension,284,25.36
7,Severe sepsis with septic shock,253,22.59
8,Long term (current) use of aspirin,249,22.23
9,"Sepsis, unspecified organism",233,20.8
10,Atherosclerotic heart disease of native coronary artery without angina pectoris,217,19.38


In [11]:
dx_tx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/1120, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,"Sepsis, due to unspecified organism",22,1.96
2,Pneumonia due to organism,16,1.43
3,Hypoxia,12,1.07
4,Non-ST elevation myocardial infarction (NSTEMI) (CMS-HCC),12,1.07
5,"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",12,1.07
6,Severe sepsis (CMS-HCC),11,0.98
7,Acute chest pain,7,0.62
8,Acute pyelonephritis,7,0.62
9,Hyponatremia,7,0.62
10,Acute coronary syndrome (CMS-HCC),6,0.54
