## Descriptions:
- Query other tables on shc to see if we can get any worthy information not included in prediction
- Some bigger queries cannot be done in this notebook. use separate SQL file in python instead
- Check the characteristics of the `transfer` cohort, compared to the full cohort

**Inputs**:
- `1_4_cohort`
- `1_4_cohort_all_current_dx`

**Outputs**: 
- from in-notebook sql: `4_1_tx_demographic`

**Note**:  
- transfers time_0 != time_24: 2499
- max!=first: 1120 total transfers; 347 from the test set

### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)

# options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

"package 'bigrquery' was built under R version 4.0.5"
-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.5     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.4     [32mv[39m [34mdplyr  [39m 1.0.2
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.0

"package 'ggplot2' was built under R version 4.0.5"
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union




### Set up and run queries

In [52]:
# CREDENTIALS depending on LOCATIONS:
# credential <- paste0("/home/", "minh084", "/.config/gcloud/application_default_credentials.json")

# local computer
# credential <- "C:/Users/User/AppData/Roaming/gcloud/application_default_credentials.json"

# Nero onprem
# credential <- "/home/minh084/.config/gcloud/application_default_credentials.json"

# Nero gcp notebook
credential <- "/home/jupyter/.config/gcloud/application_default_credentials.json"

project_id <- "som-nero-phi-jonc101"

Sys.setenv(GOOGLE_APPLICATION_CREDENTIALS = credential)
Sys.setenv(GCLOUD_PROJECT = project_id)
gargle::credentials_app_default()

NULL

In [53]:
library(DBI)
con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = "shc_core" #, billing = project_id
)
con 
dbListTables(con)

<BigQueryConnection>
  Dataset: som-nero-phi-jonc101.shc_core
  Billing: som-nero-phi-jonc101

In [2]:
# directories
datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"
modeldir4 = "../../OutputTD/3_models/1_4_cohort"
# modeldir4preadmit = "../../OutputTD/3_models/1_4_cohort_24hrpreadmit"
tabledir = "../../OutputTD/4_tables"

options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

In [3]:
cohort4 <- read.csv(file.path(cohortdir, "1_4_cohort.csv")) 
nrow(cohort4)

In [4]:
# full cohort transfers
cohort <- read.csv(file.path(cohortdir, "1_4_cohort.csv")) %>% filter(first_label != death_24hr_recent_label)

nrow(cohort) # 1120max vs 2499
nrow(cohort %>% select(anon_id) %>% distinct()) # 1085max vs  2324 
nrow(cohort %>% select(pat_enc_csn_id_coded) %>% distinct()) # 1123max vs 2499
nrow(cohort %>% filter(year(ymd_hms(admit_time)) > 2018) %>% select(pat_enc_csn_id_coded) %>% distinct()) # 347max  vs 805

In [5]:
# test cohort
nrow(cohort %>% filter(year(ymd_hms(admit_time)) > 2018))

# cohort and test cohort go from 1 -> 0
nrow(cohort %>% filter(first_label == 1))
nrow(cohort %>% filter(first_label == 1, year(ymd_hms(admit_time)) > 2018))

# cohort and test cohort go from 0 --> 1
nrow(cohort %>% filter(first_label == 0))
nrow(cohort %>% filter(first_label == 0, year(ymd_hms(admit_time)) > 2018))

In [6]:
100 * 1587/2499
100 * 526/805

100 * 912/2499
100 * 279/805

65.34 / 34.66

In [12]:
### cohort that transfer from 0 --> 1
tx01 <- cohort %>% filter(first_label == 1, death_24hr_recent_label ==0)
nrow(tx01) # 1587 

In [7]:
head(cohort, 1)
colnames(cohort)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,inpatient_data_id_coded,admit_time,label_max24,label_24hr_recent,admit_label,has_admit_label,died_within_24hrs,death_24hr_max_label,death_24hr_recent_label,first_label,first_label_minutes_since_admit,acute_to_critical_label_recent,critical_to_acute_label_recent,acute_to_critical_label_max,critical_to_acute_label_max
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>
1,JCdf010a,131282861801,45762025,2020-03-04 03:38:00+00:00,1,1,0,1,0,1,1,0,0,1,0,1,0


### ADT and Demographic tables

In [18]:
# previously queried ADT table
adt <- read.csv(file.path(datadir, 'adt_class_serv_loc.csv'))
nrow(adt) # 2711384

In [17]:
head(adt)

Unnamed: 0_level_0,pat_class_c,pat_class,base_pat_class,pat_service,pat_lv_of_care
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>
1,120,LPCH PATIENT,,Emergency Medicine,
2,166,Bedded Outpatient (corrections only),,Emergency,
3,166,Bedded Outpatient (corrections only),Outpatient,Emergency,Acute Care (Assessment or intervention q4-8)
4,166,Bedded Outpatient (corrections only),,Emergency,Acute Care (Assessment or intervention q4-8)
5,132,To Be Admitted,Inpatient,Emergency,
6,166,Bedded Outpatient (corrections only),,Emergency,Acute Care (Assessment or intervention q4-8)


In [58]:
q = "
SELECT c.anon_id, c.pat_enc_csn_id_coded, c.admit_time, c.first_label, c.death_24hr_recent_label, c.death_24hr_max_label,
    adt.event_type_c, adt.event_type, adt.pat_class_c, adt.pat_class, adt.base_pat_class_c, adt.base_pat_class,
    adt.pat_service_c, adt.pat_service, adt.pat_lvl_of_care_c, adt.pat_lv_of_care, adt.accommodation_c, adt.accomodation, 
    adt.in_event_type_c, adt.in_event_type, adt.out_event_type_c, adt.out_event_type, adt.from_base_class_c, adt.from_base_class,
    adt.to_base_class_c, adt.to_base_class, adt.seq_num_in_enc, adt.seq_num_in_bed_min
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.adt` as adt
ON 
    (c.anon_id = adt.anon_id and c.pat_enc_csn_id_coded = adt.pat_enc_csn_id_coded)
WHERE
    c.first_label != c.death_24hr_recent_label
"
# submit the query:
update.packages('Rcpp')
df <- dbGetQuery(con, q)
dim(df)
colnames(df)

“NAs produced by integer overflow”


In [54]:
# full cohort
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
"
# WHERE
#     c.first_label != c.death_24hr_recent_label

# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [55]:
x1 %>% count(INTRPTR_NEEDED_YN) %>% mutate(perc = 100* n/nrow(x1))

INTRPTR_NEEDED_YN,n,perc
<lgl>,<int>,<dbl>
False,37817,85.98681219
True,6121,13.91768986
,42,0.09549795


In [56]:
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
WHERE
    c.first_label != c.death_24hr_recent_label
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [57]:
x1 %>% count(INTRPTR_NEEDED_YN) %>% mutate(perc = 100* n/nrow(x1))

INTRPTR_NEEDED_YN,n,perc
<lgl>,<int>,<dbl>
False,2166,86.67466987
True,332,13.28531413
,1,0.04001601


### FULL COHORT
current visit diagnosis

In [76]:
# all dx from 1_4_cohort, SQL query
dx <- read.csv(file.path(datadir, "1_4_cohort_all_current_dx.csv"))
nrow(dx) # 1239918
ndx=length(unique(dx$pat_enc_csn_id_coded)) # full cohort is 43980
ndx

In [31]:
head(dx)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,JCd61259,131279859716,35,Presence of prosthetic heart valve,,,,,,Exempt from POA reporting
2,JCd679d9,131064901435,34,Postsurgical percutaneous transluminal coronary angioplasty status,,,,,,Exempt from POA reporting
3,JCd58338,131243342034,30,Failed kidney transplant,N,N,,,N,
4,JCd39c23,131240769940,40,Other disorders of bilirubin metabolism,,,,,,No
5,JCde0d0a,131190623142,36,Other artificial openings of urinary tract status,,,,,,Exempt from POA reporting
6,JCdda759,131219479387,30,Long term (current) use of oral hypoglycemic drugs,,,,,,Exempt from POA reporting


In [77]:
dx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/ndx, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Other long term (current) drug therapy,12409,28.25
2,"Hyperlipidemia, unspecified",11532,26.25
3,Essential (primary) hypertension,10691,24.34
4,Personal history of nicotine dependence,9969,22.69
5,Long term (current) use of aspirin,7188,16.36
6,Gastro-esophageal reflux disease without esophagitis,6725,15.31
7,"Acute kidney failure, unspecified",6538,14.88
8,"Major depressive disorder, single episode, unspecified",5719,13.02
9,Long term (current) use of anticoagulants,5456,12.42
10,Atherosclerotic heart disease of native coronary artery without angina pectoris,5429,12.36


In [78]:
# dx present in the ED and identified as the primary problems
dx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/ndx, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Suicidal ideation,407,0.93
2,Pneumonia due to organism,302,0.69
3,"Sepsis, due to unspecified organism",277,0.63
4,Hyponatremia,274,0.62
5,Non-ST elevation myocardial infarction (NSTEMI) (CMS-HCC),227,0.52
6,SBO (small bowel obstruction) (CMS-HCC),222,0.51
7,Acute chest pain,211,0.48
8,Small bowel obstruction (CMS-HCC),205,0.47
9,Neutropenic fever (CMS-HCC),194,0.44
10,Acute GI bleeding,190,0.43


In [47]:
# percentage of unique patients among all transfers
2324/2499

In [112]:
# cohort transfers with diagnoses
dx_tx <- left_join(cohort, dx)
nrow(dx_tx)
colnames(dx_tx)

ntx = length(unique(dx_tx$pat_enc_csn_id_coded)) #2499
ntx
length(unique(dx_tx$anon_id)) # 2324

ntxn = nrow(dx_tx %>% distinct(pat_enc_csn_id_coded, dx_name))
ntxn

# 11692
dx_tx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name, sort=TRUE) %>% 
    mutate(perc = round(100*n/11692, 2)) %>% head(20)

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,"Hyperlipidemia, unspecified",795,6.8
2,Other long term (current) drug therapy,723,6.18
3,Essential (primary) hypertension,687,5.88
4,Personal history of nicotine dependence,598,5.11
5,Long term (current) use of aspirin,528,4.52
6,"Acute kidney failure, unspecified",492,4.21
7,Acidosis,472,4.04
8,Atherosclerotic heart disease of native coronary artery without angina pectoris,428,3.66
9,Gastro-esophageal reflux disease without esophagitis,382,3.27
10,Long term (current) use of anticoagulants,368,3.15


In [111]:
#529
dx_tx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% 
            count(dx_name, sort=TRUE) %>% 
            mutate(perc = round(100*n/529, 2)) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),37,6.99
2,Septic shock (CMS-HCC),27,5.1
3,"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",27,5.1
4,"Sepsis, due to unspecified organism",26,4.91
5,Hyperkalemia,25,4.73
6,Severe sepsis (CMS-HCC),25,4.73
7,Pneumonia due to organism,24,4.54
8,Subdural hematoma (CMS-HCC),24,4.54
9,Hyponatremia,20,3.78
10,Acute GI bleeding,19,3.59


In [109]:
# from 1 to 0 (ICUs to non-ICUs), 7979
dx_tx %>% filter(first_label==1, death_24hr_recent_label==0) %>%
            distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name, sort=TRUE) %>% #head(20)
            mutate(perc = round(100*n/7979, 2)) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,"Hyperlipidemia, unspecified",510,6.39
2,Essential (primary) hypertension,469,5.88
3,Other long term (current) drug therapy,442,5.54
4,Personal history of nicotine dependence,345,4.32
5,Long term (current) use of aspirin,322,4.04
6,"Acute kidney failure, unspecified",249,3.12
7,Atherosclerotic heart disease of native coronary artery without angina pectoris,242,3.03
8,Gastro-esophageal reflux disease without esophagitis,226,2.83
9,Long term (current) use of insulin,221,2.77
10,Acidosis,216,2.71


In [108]:
# from 0 to 1, 7457
dx_tx %>% filter(first_label==0, death_24hr_recent_label==1) %>%
            distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name, sort=TRUE) %>% #head(20)
            mutate(perc = round(100*n/7457, 2)) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,"Hyperlipidemia, unspecified",285,3.82
2,Other long term (current) drug therapy,281,3.77
3,Acidosis,256,3.43
4,Personal history of nicotine dependence,253,3.39
5,"Acute kidney failure, unspecified",243,3.26
6,Severe sepsis with septic shock,220,2.95
7,Essential (primary) hypertension,218,2.92
8,Long term (current) use of aspirin,206,2.76
9,"Sepsis, unspecified organism",203,2.72
10,Atherosclerotic heart disease of native coronary artery without angina pectoris,186,2.49


In [115]:
# from 1 to 0, primary ED, 354
dx_tx %>% filter(first_label==1, death_24hr_recent_label==0, ed=="Y", primary=="Y") %>%
            distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name, sort=TRUE) %>% #head(20)
            mutate(perc = round(100*n/354, 2)) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),36,10.17
2,Septic shock (CMS-HCC),24,6.78
3,Subdural hematoma (CMS-HCC),23,6.5
4,Hyperkalemia,22,6.21
5,"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",20,5.65
6,Acute GI bleeding,17,4.8
7,Hyponatremia,16,4.52
8,SAH (subarachnoid hemorrhage) (CMS-HCC),15,4.24
9,Severe sepsis (CMS-HCC),15,4.24
10,"Closed fracture of multiple ribs of right side, initial encounter",14,3.95


In [116]:
# from 0 to 1 (non-ICUs to ICUs), primary ED, 255
dx_tx %>% filter(first_label==0, death_24hr_recent_label==1, ed=="Y", primary=="Y") %>%
            distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name, sort=TRUE) %>% #head(20)
            mutate(perc = round(100*n/255, 2)) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,"Sepsis, due to unspecified organism",16,6.27
2,Pneumonia due to organism,14,5.49
3,Non-ST elevation myocardial infarction (NSTEMI) (CMS-HCC),11,4.31
4,Hypoxia,10,3.92
5,Severe sepsis (CMS-HCC),10,3.92
6,Acute pyelonephritis,7,2.75
7,"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",7,2.75
8,Acute chest pain,6,2.35
9,Acute coronary syndrome (CMS-HCC),5,1.96
10,Acute on chronic systolic congestive heart failure (CMS-HCC),5,1.96
