## Descriptions:
- Query other tables on shc
- Some bigger queries cannot be done in this notebook. use separate SQL file in python instead

**Inputs**:  
- `1_4_cohort_diff_full_features`: contains cohort with most difference pdiff >= 0.3
  
**Outputs**: 


### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)

# options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

"package 'bigrquery' was built under R version 4.0.5"
-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.2     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.4     [32mv[39m [34mdplyr  [39m 1.0.2
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.0

-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union




### Set up and run queries

In [14]:
# CREDENTIALS depending on LOCATIONS:
# credential <- paste0("/home/", "minh084", "/.config/gcloud/application_default_credentials.json")

# local computer
# credential <- "C:/Users/User/AppData/Roaming/gcloud/application_default_credentials.json"

# Nero onprem
# credential <- "/home/minh084/.config/gcloud/application_default_credentials.json"

# Nero gcp notebook
credential <- "/home/jupyter/.config/gcloud/application_default_credentials.json"

project_id <- "som-nero-phi-jonc101"

Sys.setenv(GOOGLE_APPLICATION_CREDENTIALS = credential)
Sys.setenv(GCLOUD_PROJECT = project_id)
gargle::credentials_app_default()

NULL

In [15]:
library(DBI)
con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = "shc_core" #, billing = project_id
)
con 
dbListTables(con)

<BigQueryConnection>
  Dataset: som-nero-phi-jonc101.shc_core
  Billing: som-nero-phi-jonc101

In [2]:
# directories
datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"
modeldir4 = "../../OutputTD/3_models/1_4_cohort"
# modeldir4preadmit = "../../OutputTD/3_models/1_4_cohort_24hrpreadmit"

options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

In [3]:
cohort <- read.csv(file.path(modeldir4, "1_4_cohort_diff_full_features.csv")) %>% distinct()

nrow(cohort) # 71589 vs 69334
nrow(cohort %>% distinct())
nrow(cohort %>% select(anon_id) %>% distinct()) # 304
nrow(cohort %>% select(pat_enc_csn_id_coded) %>% distinct()) # 324

# count anon_id and csn, count unique in each group, but possible to have same ids in different groups
cohort %>%  distinct() %>% group_by(abs_diff0_24) %>% 
            summarise(nrows = n(), count_csn = n_distinct(pat_enc_csn_id_coded), count_mrn = n_distinct(anon_id)) %>% 
            arrange(-abs_diff0_24) %>% mutate(cum_csn = cumsum(count_csn), cum_mrn = cumsum(count_mrn))

`summarise()` ungrouping output (override with `.groups` argument)



abs_diff0_24,nrows,count_csn,count_mrn,cum_csn,cum_mrn
<dbl>,<int>,<int>,<int>,<int>,<int>
0.7,560,3,3,3,3
0.6,1042,7,7,10,10
0.5,6559,27,24,37,34
0.4,19690,80,77,117,111
0.3,43735,206,204,323,315


In [4]:
head(cohort, 1)
colnames(cohort)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,abs_diff0_24,diff0_True,diff24_True,feature_type,features,values,time,hr_before_admit
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>
1,JCcd3b9d,131267453587,2019-04-06 23:15:00,1,1,1,0.5339708,0.2470261,0.3,-0.47,-0.75,Procedures,LIMITED ULTRASOUND- CARDIAC TRANSTHORACIC ECHO,1,,


### Checking diagnosis to compare with total cohort

In [5]:
# all dx from 1_4_cohort, SQL query
dx <- read.csv(file.path(datadir, "4_1_dx_all.csv"))
nrow(dx)
length(unique(dx$pat_enc_csn_id_coded)) # full cohort is 43980

In [6]:
dx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/43932, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Other long term (current) drug therapy,12409,28.25
2,"Hyperlipidemia, unspecified",11532,26.25
3,Essential (primary) hypertension,10691,24.34
4,Personal history of nicotine dependence,9969,22.69
5,Long term (current) use of aspirin,7188,16.36
6,Gastro-esophageal reflux disease without esophagitis,6725,15.31
7,"Acute kidney failure, unspecified",6538,14.88
8,"Major depressive disorder, single episode, unspecified",5719,13.02
9,Long term (current) use of anticoagulants,5456,12.42
10,Atherosclerotic heart disease of native coronary artery without angina pectoris,5429,12.36


In [7]:
# dx present in the ED and identified as the primary problems
dx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/43932, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Suicidal ideation,407,0.93
2,Pneumonia due to organism,302,0.69
3,"Sepsis, due to unspecified organism",277,0.63
4,Hyponatremia,274,0.62
5,Non-ST elevation myocardial infarction (NSTEMI) (CMS-HCC),227,0.52
6,SBO (small bowel obstruction) (CMS-HCC),222,0.51
7,Acute chest pain,211,0.48
8,Small bowel obstruction (CMS-HCC),205,0.47
9,Neutropenic fever (CMS-HCC),194,0.44
10,Acute GI bleeding,190,0.43


In [8]:
105/252

In [9]:
# cohort transfers with diagnoses
dx_diff4 <- left_join(diff4, dx)
nrow(dx_diff4)
length(unique(dx_diff4$pat_enc_csn_id_coded))
length(unique(dx_diff4$anon_id))
dx_diff4 %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/1120, 2)) %>% arrange(-n) %>% head(20)

ERROR: Error in left_join(diff4, dx): object 'diff4' not found


### ADT table

In [20]:
q = "
SELECT c.anon_id, c.pat_enc_csn_id_coded, c.admit_time, c.first_label, c.death_24hr_recent_label,
        c.death_24hr_max_label, c.pred_first, c.pred_death_24hr_recent,
    adt.event_type_c, adt.event_type, adt.pat_class_c, adt.pat_class, adt.base_pat_class_c, adt.base_pat_class,
    adt.pat_service_c, adt.pat_service, adt.pat_lvl_of_care_c, adt.pat_lv_of_care, adt.accommodation_c, adt.accomodation, 
    adt.in_event_type_c, adt.in_event_type, adt.out_event_type_c, adt.out_event_type, adt.from_base_class_c, adt.from_base_class,
    adt.to_base_class_c, adt.to_base_class, adt.seq_num_in_enc, adt.seq_num_in_bed_min
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.adt` as adt
ON 
    (c.anon_id = adt.anon_id and c.pat_enc_csn_id_coded = adt.pat_enc_csn_id_coded)
"
# submit the query:
update.packages('Rcpp')
df <- dbGetQuery(con, q)
dim(df)
colnames(df)

“NAs produced by integer overflow”


In [21]:
write.csv(df, file.path(datadir, "4_1_diff_adt.csv"), row.names=FALSE)

In [23]:
q = "
SELECT c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [24]:
write.csv(x1, file.path(datadir, "4_2_diff_demographic.csv"), row.names=FALSE)

In [11]:
# this doesn't seem right, redo with a separate SQL file as it is too large to be in R notebook
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.line, x.dx_name, x.primary, x.chronic, x.principal, x.hospital_pl, x.ed, x.present_on_adm
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)
"
# submit the query:
update.packages('Rcpp')
x2 <- dbGetQuery(con, q)
dim(x2)
colnames(x2)

“NAs produced by integer overflow”


In [12]:
head(x2)

anon_id,pat_enc_csn_id_coded,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
JCe5f7b6,,1,Hyponatremia,Y,N,,,Y,
JCdc571d,,6,Cough,N,N,,,N,
JCe35ee6,,11,"Dementia without behavioral disturbance, unspecified dementia type (CMS-HCC)",N,Y,,,N,
JCd1f333,,3,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),N,N,,,N,
JCe09f7a,,1,"Vomiting, unspecified",,,,,,
JCe439dd,,7,Bipolar 1 disorder (CMS-HCC),N,N,,,N,


In [9]:
# this doesn't seem right, redo with a separate SQL file as it is too large to be in R notebook
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.line, x.dx_name, x.primary, x.chronic, x.principal, x.hospital_pl, x.ed, x.present_on_adm
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE (x.primary = 'Y')
"
# submit the query:
update.packages('Rcpp')
x2 <- dbGetQuery(con, q)
dim(x2)
colnames(x2)

“NAs produced by integer overflow”


In [10]:
head(x2)

anon_id,pat_enc_csn_id_coded,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
JCe5f7b6,,1,Hyponatremia,Y,N,,,Y,
JCda6b3f,,1,"MVC (motor vehicle collision), initial encounter",Y,N,,,Y,
JCd1ef38,,1,"Atherosclerosis of native coronary artery of native heart, angina presence unspecified",Y,N,,,N,
JCeb2df6,,3,"Malignant neoplasm of ovary, unspecified laterality (CMS-HCC)",Y,N,,,N,
JCd5f913,,1,Acute respiratory failure with hypoxia and hypercarbia (CMS-HCC),Y,N,,,Y,
JCd69bf6,,2,ALS (amyotrophic lateral sclerosis) (CMS-HCC),Y,N,,,N,


In [42]:
write.csv(x2, file.path(datadir, "4_2_diff_dx_primary.csv"), row.names=FALSE)

### Read the queried file back

In [10]:
adt <- read.csv(file.path(datadir, "4_2_diff_adt.csv"))
dim(adt) # should have 1385721
colnames(adt)

In [11]:
head(adt, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,event_type_c,event_type,pat_class_c,pat_class,base_pat_class_c,base_pat_class,pat_service_c,pat_service,pat_lvl_of_care_c,pat_lv_of_care,accommodation_c,accomodation,in_event_type_c,in_event_type,out_event_type_c,out_event_type,from_base_class_c,from_base_class,to_base_class_c,to_base_class,seq_num_in_enc,seq_num_in_bed_min
Unnamed: 0_level_1,<chr>,<lgl>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>
1,JC2a1bd68,,2019-12-24 23:20:00,1,1,1,0.5241561,0.2500841,1,Admission,112,Emergency Services,3,Emergency,100,Emergency,,,10001,Ward,1,Admission,,,0,,3,Emergency,1,1


In [12]:
cohort %>% distinct(pat_enc_csn_id_coded, first_label, death_24hr_recent_label) %>% count(first_label, death_24hr_recent_label, sort=T)
adt %>% count(event_type, event_type_c, sort=TRUE)
adt %>% count(pat_class, pat_class_c, sort=TRUE)
adt %>% count(base_pat_class, base_pat_class_c, sort=TRUE)
adt %>% count(pat_service, pat_service_c, sort=TRUE)
adt %>% count(pat_lv_of_care, pat_lvl_of_care_c, sort=TRUE)
adt %>% count(accomodation, accommodation_c, sort=TRUE)
adt %>% count(in_event_type, in_event_type_c, sort=TRUE)
adt %>% count(out_event_type, out_event_type_c, sort=TRUE)
adt %>% count(from_base_class, from_base_class_c, sort=TRUE)
adt %>% count(to_base_class, to_base_class_c, sort=TRUE)
adt %>% count(seq_num_in_enc, sort=TRUE)
adt %>% count(seq_num_in_bed_min, sort=TRUE)

first_label,death_24hr_recent_label,n
<int>,<int>,<int>
1,1,113
0,0,102
1,0,90
0,1,18


event_type,event_type_c,n
<chr>,<int>,<int>
Census,6,419221
Transfer In,3,305337
Transfer Out,4,305337
Patient Update,5,218436
Admission,1,69334
Discharge,2,68056


pat_class,pat_class_c,n
<chr>,<int>,<int>
Inpatient,126,1182999
Emergency Services,112,200344
Observation,128,1727
OP Surgery/Procedure,122,651


base_pat_class,base_pat_class_c,n
<chr>,<int>,<int>
,,1246039
Emergency,3.0,69334
Inpatient,1.0,69334
Outpatient,2.0,1014


pat_service,pat_service_c,n
<chr>,<int>,<int>
Critical Care,151.0,283991
Medicine,39.0,191213
Emergency,100.0,141033
Emergency Medicine,187.0,120906
General Medicine (University),153.0,87235
Trauma,72.0,86612
General Surgery,59.0,81189
Cardiology,155.0,61527
General Medicine (PAMF),154.0,30827
Neurosurgery,62.0,30067


pat_lv_of_care,pat_lvl_of_care_c,n
<chr>,<int>,<int>
Acute Care (Assessment or intervention q4-8),5.0,561858
Critical Care,8.0,404635
IICU/Intermediate Care (Assessment or intervention q2-4),9.0,218827
,,200282
Newborn Nursery - VC Only,68.0,119


accomodation,accommodation_c,n
<chr>,<int>,<int>
Ward,10001,563973
Private,1,556328
Semi-Private,2,265420


in_event_type,in_event_type_c,n
<chr>,<int>,<int>
Census,6.0,419221
,,373393
Transfer In,3.0,305085
Patient Update,5.0,148340
Admission,1.0,139682


out_event_type,out_event_type_c,n
<chr>,<int>,<int>
Census,6.0,419221
,,374671
Transfer Out,4.0,305085
Patient Update,5.0,148340
Discharge,2.0,138404


from_base_class,from_base_class_c,n
<chr>,<int>,<int>
Inpatient,1,1114462
Emergency,3,200344
,0,69334
,4,930
Outpatient,2,651


to_base_class,to_base_class_c,n
<chr>,<int>,<int>
Inpatient,1,1115775
Emergency,3,200092
,0,68056
,4,930
Outpatient,2,868


seq_num_in_enc,n
<int>,<int>
1,69334
2,69334
3,69334
4,69334
5,69334
6,68365
7,67843
8,67487
9,66696
10,64255


seq_num_in_bed_min,n
<int>,<int>
1,1369116
2,16292
3,313


In [14]:
demo <- read.csv(file.path(datadir, "4_2_diff_demographic.csv"))
dim(demo) # should have 69334
colnames(demo)

In [15]:
head(demo,1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,INTRPTR_NEEDED_YN,CHARLSON_SCORE,N_HOSPITALIZATIONS,DAYS_IN_HOSPITAL
Unnamed: 0_level_1,<chr>,<lgl>,<lgl>,<int>,<int>,<int>
1,JCd5f913,,False,11,14,96


In [26]:
head(coh_dx, 1)
colnames(coh_dx)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,abs_diff0_24,diff0_True,diff24_True,feature_type,features,values,time,hr_before_admit,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,JCda4f94,131274348480,2019-08-07 20:38:00,0,0,0,0.8106468,0.4368035,0.4,0.81,0.44,Diagnosis,E11.22,1,,,1,Type 1 diabetes mellitus with ketoacidosis without coma,,,,,,Yes


In [17]:
# only 120 of obs above 0.4

coh_dx <- cohort %>% distinct() %>% filter(abs_diff0_24 >= 0.4) %>% inner_join(dx) # %>% distinct()
nrow(coh_dx) #2152159 same as inner_join for threshold at 0.3 --> distinct 2148938, duplicates comes from cohort
length(unique(coh_dx$pat_enc_csn_id_coded))
length(unique(dx$pat_enc_csn_id_coded))
length(setdiff(coh_dx$pat_enc_csn_id_coded, dx$pat_enc_csn_id_coded))
length(setdiff(dx$pat_enc_csn_id_coded, coh_dx$pat_enc_csn_id_coded))

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [18]:
summary(coh_dx$line)

coh_dx %>% count(primary, sort=TRUE)
coh_dx %>% count(chronic, sort=TRUE)
coh_dx %>% count(hospital_pl,  sort=TRUE)
coh_dx %>% count(principal, sort=TRUE)
coh_dx %>% count(ed, sort=TRUE)
coh_dx %>% count(present_on_adm, sort=TRUE)
coh_dx %>% count(dx_name, sort=TRUE) %>% head(20)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   1.00    4.00    9.00   12.36   19.00   53.00    5718 

primary,n
<chr>,<int>
,709610
N,112884
Y,27426


chronic,n
<chr>,<int>
,704997
N,140310
Y,4613


hospital_pl,n
<chr>,<int>
,844202
Y,5493
N,225


principal,n
<chr>,<int>
,844202
N,4675
Y,1043


ed,n
<chr>,<int>
,709610
N,90503
Y,49807


present_on_adm,n
<chr>,<int>
Yes,457212
Exempt from POA reporting,181645
,174002
No,34797
Unknown,2264


Unnamed: 0_level_0,dx_name,n
Unnamed: 0_level_1,<chr>,<int>
1,Other long term (current) drug therapy,13427
2,"Hyperlipidemia, unspecified",13211
3,Long term (current) use of insulin,11874
4,"Major depressive disorder, single episode, unspecified",8850
5,Atherosclerotic heart disease of native coronary artery without angina pectoris,8822
6,Type 1 diabetes mellitus with ketoacidosis without coma,8158
7,Long term (current) use of aspirin,7740
8,Essential (primary) hypertension,7581
9,Dehydration,6834
10,Old myocardial infarction,6502


In [19]:
coh_dx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/120, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Other long term (current) drug therapy,50,41.67
2,"Hyperlipidemia, unspecified",49,40.83
3,Long term (current) use of insulin,42,35.0
4,Essential (primary) hypertension,40,33.33
5,Long term (current) use of aspirin,32,26.67
6,"Major depressive disorder, single episode, unspecified",27,22.5
7,Atherosclerotic heart disease of native coronary artery without angina pectoris,26,21.67
8,Dehydration,25,20.83
9,Personal history of nicotine dependence,25,20.83
10,Gastro-esophageal reflux disease without esophagitis,20,16.67


In [20]:
coh_dx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/120, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),7,5.83
2,Diabetic ketoacidosis without coma associated with other specified diabetes mellitus (CMS-HCC),6,5.0
3,Diabetic ketoacidosis without coma associated with diabetes mellitus due to underlying condition (CMS-HCC),5,4.17
4,Diabetic ketoacidosis without coma associated with type 2 diabetes mellitus (CMS-HCC),5,4.17
5,"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",3,2.5
6,Diabetic ketoacidosis with coma associated with type 1 diabetes mellitus (CMS-HCC),2,1.67
7,Hyperglycemic hyperosmolar nonketotic coma (CMS-HCC),2,1.67
8,Hypertensive emergency,2,1.67
9,Intracranial hemorrhage (CMS-HCC),2,1.67
10,Subarachnoid hemorrhage (CMS-HCC),2,1.67


In [22]:
coh_dx %>% filter(ed == "Y", primary=="Y") %>% count(dx_name, sort=TRUE) %>% head(20)

Unnamed: 0_level_0,dx_name,n
Unnamed: 0_level_1,<chr>,<int>
1,Diabetic ketoacidosis without coma associated with diabetes mellitus due to underlying condition (CMS-HCC),1566
2,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),1165
3,Diabetic ketoacidosis without coma associated with other specified diabetes mellitus (CMS-HCC),1129
4,Hypertensive emergency,916
5,"Chest pain, unspecified type",717
6,Diabetic ketoacidosis without coma associated with type 2 diabetes mellitus (CMS-HCC),713
7,Flash pulmonary edema (CMS-HCC),674
8,Acute respiratory failure with hypoxia and hypercarbia (CMS-HCC),666
9,"Hypertension, unspecified type",602
10,Subarachnoid hemorrhage (CMS-HCC),478


### MERGE MRN

In [35]:
reids <- read.csv(file.path(datadir, "top_reIDs.csv"))

diff4 <- cohort %>% filter(abs_diff0_24 >=0.4) %>% distinct(anon_id)
nrow(diff4)
sum(diff4$anon_id %in% reids$anon_id)

reids <- left_join(reids, cohort) %>% filter(abs_diff0_24 >=0.4) %>% 
            distinct(mrn, anon_id, pat_enc_csn_id_coded, admit_time, jitter,
                     first_label, death_24hr_recent_label, pred_first, pred_death_24hr_recent, abs_diff0_24) %>%
            mutate(transfer = ifelse(first_label != death_24hr_recent_label, 1, 0)) %>%
            arrange(-abs_diff0_24)

reids %>% summarise(count_mrn = n_distinct(anon_id))
nrow(reids) #117

Joining, by = "anon_id"



count_mrn
<int>
105


In [36]:
write.csv(reids, file.path(datadir, "top105_reIDs_sorted.csv"), row.names=FALSE)

In [41]:
traj <- read.csv(file.path(datadir, "02_combined_ip_emerg_traj.csv")) %>% select(-X)
nrow(traj)

In [47]:
head(traj,1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,first_ip_lv_of_care,first_ip_event_type,first_ip_pat_service,last_emerg_lv_of_care,last_emerg_event_type,last_emerg_pat_service,trajectory,trajectory_length
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
1,JCe78a06,131062667066,Intermediate Care - With Cardiac Monitor,Transfer In,General Medicine (PAMF),,Transfer Out,Emergency,Intermediate Care - With Cardiac Monitor,1


In [37]:
head(reids)

Unnamed: 0_level_0,mrn,anon_id,jitter,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,pred_first,pred_death_24hr_recent,abs_diff0_24,transfer
Unnamed: 0_level_1,<int>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
1,13638739,JCd91ab7,9 days 00:00:00.000000000,131271804263,2019-07-11 21:43:00,1,0,0.8406619,0.16503982,0.7,1
2,60827953,JCe6286a,18 days 00:00:00.000000000,131276655148,2019-10-19 02:44:00,1,0,0.7492048,0.08195348,0.7,1
3,43204973,JCd1a421,-3 days +00:00:00.000000000,131265379414,2019-03-21 13:11:00,1,1,0.796905,0.06790455,0.7,0
4,76629740,JC29fe5a7,-17 days +00:00:00.000000000,131273666433,2019-07-17 11:48:00,1,1,0.7369563,0.10290046,0.6,0
5,12964771,JCd3262e,-19 days +00:00:00.000000000,131275001383,2019-08-15 16:54:00,1,0,0.8597781,0.27967762,0.6,1
6,76557537,JC2a31a2b,-19 days +00:00:00.000000000,131273021618,2019-07-01 07:21:00,1,0,0.8841977,0.29646083,0.6,1


In [45]:
comb_diff <- left_join(reids, traj)
nrow(comb_diff)

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [49]:
head(comb_diff, n=10) # 0.6 and 0.7

Unnamed: 0_level_0,mrn,anon_id,jitter,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,pred_first,pred_death_24hr_recent,abs_diff0_24,transfer,first_ip_lv_of_care,first_ip_event_type,first_ip_pat_service,last_emerg_lv_of_care,last_emerg_event_type,last_emerg_pat_service,trajectory,trajectory_length
Unnamed: 0_level_1,<int>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
1,13638739,JCd91ab7,9 days 00:00:00.000000000,131271804263,2019-07-11 21:43:00,1,0,0.8406619,0.16503982,0.7,1,Critical Care,Patient Update,Emergency Critical Care,,Transfer In,Emergency,Critical Care -> IICU/Intermediate Care (Assessment or intervention q2-4),2
2,60827953,JCe6286a,18 days 00:00:00.000000000,131276655148,2019-10-19 02:44:00,1,0,0.7492048,0.08195348,0.7,1,Critical Care,Patient Update,Emergency Critical Care,,Transfer In,Emergency Medicine,Critical Care -> IICU/Intermediate Care (Assessment or intervention q2-4),2
3,43204973,JCd1a421,-3 days +00:00:00.000000000,131265379414,2019-03-21 13:11:00,1,1,0.796905,0.06790455,0.7,0,Critical Care,Patient Update,Emergency,,Admission,Emergency,Critical Care,1
4,76629740,JC29fe5a7,-17 days +00:00:00.000000000,131273666433,2019-07-17 11:48:00,1,1,0.7369563,0.10290046,0.6,0,Critical Care,Patient Update,Emergency Medicine,,Patient Update,Emergency Medicine,Critical Care,1
5,12964771,JCd3262e,-19 days +00:00:00.000000000,131275001383,2019-08-15 16:54:00,1,0,0.8597781,0.27967762,0.6,1,Critical Care,Patient Update,Emergency Medicine,,Patient Update,Emergency Medicine,Critical Care -> Acute Care (Assessment or intervention q4-8),2
6,76557537,JC2a31a2b,-19 days +00:00:00.000000000,131273021618,2019-07-01 07:21:00,1,0,0.8841977,0.29646083,0.6,1,Critical Care,Patient Update,Neurocritical Care,,Census,Emergency,Critical Care -> Acute Care (Assessment or intervention q4-8),2
7,76912245,JC2a126ab,-16 days +00:00:00.000000000,131275893768,2019-09-04 05:58:00,0,0,0.8152507,0.25902901,0.6,0,Acute Care (Assessment or intervention q4-8),Patient Update,Medicine,,Transfer In,Emergency Medicine,Acute Care (Assessment or intervention q4-8),1
8,70249800,JCe0e0be,31 days 00:00:00.000000000,131283367768,2020-03-15 15:18:00,1,0,0.7303019,0.17409155,0.6,1,Critical Care,Patient Update,Emergency,,Admission,Emergency,Critical Care -> Acute Care (Assessment or intervention q4-8),2
9,60269636,JCe09f7a,-22 days +00:00:00.000000000,131283156969,2020-01-19 11:56:00,1,0,0.8493499,0.26897129,0.6,1,Critical Care,Patient Update,Critical Care,,Transfer In,Emergency Medicine,Critical Care -> IICU/Intermediate Care (Assessment or intervention q2-4),2
10,27196021,JCceb298,-10 days +00:00:00.000000000,131285560953,2020-03-06 04:21:00,1,0,0.7893417,0.21697571,0.6,1,Critical Care,Patient Update,Neurocritical Care,,Transfer In,Emergency Medicine,Critical Care -> Acute Care (Assessment or intervention q4-8),2


In [52]:
comb_diff %>% filter(abs_diff0_24 == 0.5) %>% arrange(mrn, admit_time)

mrn,anon_id,jitter,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,pred_first,pred_death_24hr_recent,abs_diff0_24,transfer,first_ip_lv_of_care,first_ip_event_type,first_ip_pat_service,last_emerg_lv_of_care,last_emerg_event_type,last_emerg_pat_service,trajectory,trajectory_length
<int>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
2622280,JCd2b078,22 days 00:00:00.000000000,131281836013,2020-02-05 06:40:00,1,0,0.6533526,0.18836503,0.5,1,Critical Care,Patient Update,ICU Trauma/GenSurg,,Patient Update,Emergency Medicine,Critical Care -> Acute Care (Assessment or intervention q4-8),2
3404373,JC2a1b3e1,-25 days +00:00:00.000000000,131266865289,2019-03-30 17:11:00,0,0,0.8686259,0.32946409,0.5,0,IICU/Intermediate Care (Assessment or intervention q2-4),Patient Update,Cardiology,,Patient Update,Emergency Medicine,IICU/Intermediate Care (Assessment or intervention q2-4) -> Acute Care (Assessment or intervention q4-8),2
3404373,JC2a1b3e1,-25 days +00:00:00.000000000,131271401201,2019-06-03 17:42:00,0,0,0.8650639,0.34595124,0.5,0,IICU/Intermediate Care (Assessment or intervention q2-4),Patient Update,Cardiology,,Admission,Emergency,IICU/Intermediate Care (Assessment or intervention q2-4),1
5197512,JCdc5d49,-15 days +00:00:00.000000000,131262480872,2019-01-05 17:04:00,1,1,0.6639243,0.20044141,0.5,0,Critical Care,Patient Update,Critical Care,,Admission,Emergency,Critical Care,1
6386387,JCe358e6,-25 days +00:00:00.000000000,131282031674,2019-12-23 02:39:00,1,0,0.7744861,0.27566563,0.5,1,Critical Care,Patient Update,Critical Care,,Patient Update,Emergency Medicine,Critical Care -> Acute Care (Assessment or intervention q4-8),2
8332611,JCcf7ee2,18 days 00:00:00.000000000,131261541119,2019-01-15 06:59:00,1,0,0.8834838,0.39151948,0.5,1,Critical Care,Patient Update,ICU Trauma/GenSurg,,Transfer In,Emergency,Critical Care -> Acute Care (Assessment or intervention q4-8),2
8560666,JCe1e504,-31 days +00:00:00.000000000,131268059258,2019-04-18 11:45:00,0,0,0.7209007,0.21101473,0.5,0,IICU/Intermediate Care (Assessment or intervention q2-4),Patient Update,General Medicine (PAMF),,Admission,Emergency,IICU/Intermediate Care (Assessment or intervention q2-4),1
13554613,JCdb215e,-28 days +00:00:00.000000000,131285117411,2020-02-12 17:35:00,1,0,0.7373989,0.23933698,0.5,1,Critical Care,Patient Update,Critical Care,,Patient Update,Emergency Medicine,Critical Care -> IICU/Intermediate Care (Assessment or intervention q2-4) -> Acute Care (Assessment or intervention q4-8),3
13638739,JCd91ab7,9 days 00:00:00.000000000,131276678530,2019-10-10 03:52:00,1,0,0.7376401,0.21183136,0.5,1,Critical Care,Patient Update,Critical Care,,Patient Update,Emergency Medicine,Critical Care -> Acute Care (Assessment or intervention q4-8),2
13638739,JCd91ab7,9 days 00:00:00.000000000,131281472394,2020-01-16 18:22:00,1,0,0.7327157,0.24238043,0.5,1,Critical Care,Patient Update,Critical Care,,Transfer In,Emergency Medicine,Critical Care -> Acute Care (Assessment or intervention q4-8),2


In [55]:
comb_diff %>% filter(abs_diff0_24 == 0.4) %>% arrange(mrn, admit_time) %>% select(mrn, anon_id, admit_time, jitter)

mrn,anon_id,admit_time,jitter
<int>,<chr>,<chr>,<chr>
320721,JCd79e21,2019-01-07 05:12:00,-24 days +00:00:00.000000000
426379,JCccaf40,2019-10-31 04:28:00,-9 days +00:00:00.000000000
1233022,JCd71da6,2019-02-08 06:55:00,-23 days +00:00:00.000000000
1550631,JCd54f0a,2019-11-01 23:35:00,4 days 00:00:00.000000000
2010247,JCe45f13,2019-08-23 21:04:00,31 days 00:00:00.000000000
2591014,JCe4cc9a,2020-03-06 03:42:00,22 days 00:00:00.000000000
3404373,JC2a1b3e1,2019-03-08 15:21:00,-25 days +00:00:00.000000000
3404373,JC2a1b3e1,2019-06-10 15:37:00,-25 days +00:00:00.000000000
3726924,JCda4f94,2019-08-07 20:38:00,-12 days +00:00:00.000000000
3795143,JCe01b09,2019-09-20 06:04:00,-17 days +00:00:00.000000000


In [57]:
mrns <- c(320721, 426379,5197512, 5197520, 8304909, 9996489, 5197512, 6386387, 8332611, 8560666)
anons <- c('JCd79e21', 'JCccaf40', 'JCdc5d49', 'JCd30547', 'JCe2435b', 'JCcf91c6', 'JCdc5d49', 'JCe358e6', 'JCcf7ee2', 'JCe1e504')
length(mrns)
length(anons)