## Descriptions:
- Query other tables on shc
- Some bigger queries cannot be done in this notebook. use separate SQL file in python instead

**Inputs**:  
- `1_4_cohort_diff_full_features`: contains cohort with most difference pdiff >= 0.3
  
**Outputs**: 


### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)

# options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

"package 'bigrquery' was built under R version 4.0.5"
-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.2     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.4     [32mv[39m [34mdplyr  [39m 1.0.2
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.0

-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union




### Set up and run queries

In [14]:
# CREDENTIALS depending on LOCATIONS:
# credential <- paste0("/home/", "minh084", "/.config/gcloud/application_default_credentials.json")

# local computer
# credential <- "C:/Users/User/AppData/Roaming/gcloud/application_default_credentials.json"

# Nero onprem
# credential <- "/home/minh084/.config/gcloud/application_default_credentials.json"

# Nero gcp notebook
credential <- "/home/jupyter/.config/gcloud/application_default_credentials.json"

project_id <- "som-nero-phi-jonc101"

Sys.setenv(GOOGLE_APPLICATION_CREDENTIALS = credential)
Sys.setenv(GCLOUD_PROJECT = project_id)
gargle::credentials_app_default()

NULL

In [15]:
library(DBI)
con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = "shc_core" #, billing = project_id
)
con 
dbListTables(con)

<BigQueryConnection>
  Dataset: som-nero-phi-jonc101.shc_core
  Billing: som-nero-phi-jonc101

In [2]:
# directories
datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"
modeldir4 = "../../OutputTD/3_models/1_4_cohort"
# modeldir4preadmit = "../../OutputTD/3_models/1_4_cohort_24hrpreadmit"

options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

In [5]:
cohort <- read.csv(file.path(modeldir4, "1_4_cohort_diff_full_features.csv")) %>% distinct()

nrow(cohort) # 71589 vs 69334
nrow(cohort %>% distinct())
nrow(cohort %>% select(anon_id) %>% distinct()) # 304
nrow(cohort %>% select(pat_enc_csn_id_coded) %>% distinct()) # 324

# count anon_id and csn, count unique in each group, but possible to have same ids in different groups
cohort %>%  distinct() %>% group_by(abs_diff0_24) %>% 
            summarise(nrows = n(), count_csn = n_distinct(pat_enc_csn_id_coded), count_mrn = n_distinct(anon_id)) %>% 
            arrange(-abs_diff0_24) %>% mutate(cum_csn = cumsum(count_csn), cum_mrn = cumsum(count_mrn))

`summarise()` ungrouping output (override with `.groups` argument)



abs_diff0_24,nrows,count_csn,count_mrn,cum_csn,cum_mrn
<dbl>,<int>,<int>,<int>,<int>,<int>
0.7,560,3,3,3,3
0.6,1042,7,7,10,10
0.5,6559,27,24,37,34
0.4,19690,80,77,117,111
0.3,43735,206,204,323,315


In [7]:
head(cohort, 1)
colnames(cohort)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,abs_diff0_24,diff0_True,diff24_True,feature_type,features,values,time,hr_before_admit
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>
1,JCcd3b9d,131267453587,2019-04-06 23:15:00,1,1,1,0.5339708,0.2470261,0.3,-0.47,-0.75,Procedures,LIMITED ULTRASOUND- CARDIAC TRANSTHORACIC ECHO,1,,


In [8]:
# check with previous set sent to Morteza
reid <- read.csv(file.path(datadir, "topIDre.csv"))
nrow(reid)
head(reid)

Unnamed: 0_level_0,X,MRN,ANON_ID,JITTER,anon_id
Unnamed: 0_level_1,<int>,<int>,<chr>,<chr>,<chr>
1,0,30344139,JCe9b234,-24 days +00:00:00.000000000,JCe9b234
2,1,76189257,JC2a12b37,27 days 00:00:00.000000000,JC2a12b37
3,2,35588730,JCe03bac,-12 days +00:00:00.000000000,JCe03bac
4,3,19039593,JCe6f82d,11 days 00:00:00.000000000,JCe6f82d
5,4,19998962,JCe3b5e6,11 days 00:00:00.000000000,JCe3b5e6
6,5,34007724,JCd5f913,-11 days +00:00:00.000000000,JCd5f913


In [13]:
# filter those with diff >= 0.4
diff4 <- cohort %>% filter(abs_diff0_24 >=0.4) %>% distinct(anon_id)
nrow(diff4)
sum(diff4$anon_id %in% reid$anon_id) # missing 5

diff4 %>% filter(!anon_id %in% reid$anon_id) %>% select(anon_id)
setdiff(diff4$anon_id, reid$anon_id)
# setdiff(reid$anon_id, diff4$anon_id)

anon_id
<chr>
JC2a1185a
JC2a04c24
JCd25b42
JCcf91c6
JCe4a797


### Checking diagnosis to compare with total cohort

In [14]:
# all dx from 1_4_cohort, SQL query
dx <- read.csv(file.path(datadir, "4_1_dx_all.csv"))
nrow(dx)
length(unique(dx$pat_enc_csn_id_coded)) # full cohort is 43980

In [15]:
dx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/43932, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Other long term (current) drug therapy,12409,28.25
2,"Hyperlipidemia, unspecified",11532,26.25
3,Essential (primary) hypertension,10691,24.34
4,Personal history of nicotine dependence,9969,22.69
5,Long term (current) use of aspirin,7188,16.36
6,Gastro-esophageal reflux disease without esophagitis,6725,15.31
7,"Acute kidney failure, unspecified",6538,14.88
8,"Major depressive disorder, single episode, unspecified",5719,13.02
9,Long term (current) use of anticoagulants,5456,12.42
10,Atherosclerotic heart disease of native coronary artery without angina pectoris,5429,12.36


In [16]:
# dx present in the ED and identified as the primary problems
dx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/43932, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Suicidal ideation,407,0.93
2,Pneumonia due to organism,302,0.69
3,"Sepsis, due to unspecified organism",277,0.63
4,Hyponatremia,274,0.62
5,Non-ST elevation myocardial infarction (NSTEMI) (CMS-HCC),227,0.52
6,SBO (small bowel obstruction) (CMS-HCC),222,0.51
7,Acute chest pain,211,0.48
8,Small bowel obstruction (CMS-HCC),205,0.47
9,Neutropenic fever (CMS-HCC),194,0.44
10,Acute GI bleeding,190,0.43


In [18]:
105/252

In [17]:
# cohort transfers with diagnoses
dx_diff4 <- left_join(diff4, dx)
nrow(dx_diff4)
length(unique(dx_diff4$pat_enc_csn_id_coded))
length(unique(dx_diff4$anon_id))
dx_diff4 %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/1120, 2)) %>% arrange(-n) %>% head(20)

Joining, by = "anon_id"



Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Long term (current) use of insulin,114,10.18
2,Other long term (current) drug therapy,100,8.93
3,"Hyperlipidemia, unspecified",95,8.48
4,Long term (current) use of aspirin,66,5.89
5,Essential (primary) hypertension,65,5.8
6,"Major depressive disorder, single episode, unspecified",56,5.0
7,Personal history of nicotine dependence,56,5.0
8,Atherosclerotic heart disease of native coronary artery without angina pectoris,55,4.91
9,Type 1 diabetes mellitus with ketoacidosis without coma,54,4.82
10,Acidosis,47,4.2


### ADT table

In [20]:
q = "
SELECT c.anon_id, c.pat_enc_csn_id_coded, c.admit_time, c.first_label, c.death_24hr_recent_label,
        c.death_24hr_max_label, c.pred_first, c.pred_death_24hr_recent,
    adt.event_type_c, adt.event_type, adt.pat_class_c, adt.pat_class, adt.base_pat_class_c, adt.base_pat_class,
    adt.pat_service_c, adt.pat_service, adt.pat_lvl_of_care_c, adt.pat_lv_of_care, adt.accommodation_c, adt.accomodation, 
    adt.in_event_type_c, adt.in_event_type, adt.out_event_type_c, adt.out_event_type, adt.from_base_class_c, adt.from_base_class,
    adt.to_base_class_c, adt.to_base_class, adt.seq_num_in_enc, adt.seq_num_in_bed_min
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.adt` as adt
ON 
    (c.anon_id = adt.anon_id and c.pat_enc_csn_id_coded = adt.pat_enc_csn_id_coded)
"
# submit the query:
update.packages('Rcpp')
df <- dbGetQuery(con, q)
dim(df)
colnames(df)

“NAs produced by integer overflow”


In [21]:
write.csv(df, file.path(datadir, "4_1_diff_adt.csv"), row.names=FALSE)

In [23]:
q = "
SELECT c.anon_id, c.pat_enc_csn_id_coded, 
    x.INTRPTR_NEEDED_YN, x.CHARLSON_SCORE, x.N_HOSPITALIZATIONS, x.DAYS_IN_HOSPITAL
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.demographic` as x
ON 
    (c.anon_id = x.anon_id)
"
# submit the query:
update.packages('Rcpp')
x1 <- dbGetQuery(con, q)
dim(x1)
colnames(x1)

“NAs produced by integer overflow”


In [24]:
write.csv(x1, file.path(datadir, "4_2_diff_demographic.csv"), row.names=FALSE)

In [11]:
# this doesn't seem right, redo with a separate SQL file as it is too large to be in R notebook
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.line, x.dx_name, x.primary, x.chronic, x.principal, x.hospital_pl, x.ed, x.present_on_adm
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)
"
# submit the query:
update.packages('Rcpp')
x2 <- dbGetQuery(con, q)
dim(x2)
colnames(x2)

“NAs produced by integer overflow”


In [12]:
head(x2)

anon_id,pat_enc_csn_id_coded,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
JCe5f7b6,,1,Hyponatremia,Y,N,,,Y,
JCdc571d,,6,Cough,N,N,,,N,
JCe35ee6,,11,"Dementia without behavioral disturbance, unspecified dementia type (CMS-HCC)",N,Y,,,N,
JCd1f333,,3,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),N,N,,,N,
JCe09f7a,,1,"Vomiting, unspecified",,,,,,
JCe439dd,,7,Bipolar 1 disorder (CMS-HCC),N,N,,,N,


In [9]:
# this doesn't seem right, redo with a separate SQL file as it is too large to be in R notebook
q = "
SELECT distinct c.anon_id, c.pat_enc_csn_id_coded, 
    x.line, x.dx_name, x.primary, x.chronic, x.principal, x.hospital_pl, x.ed, x.present_on_adm
FROM 
    `som-nero-phi-jonc101.triageTD.1_4_cohort_24hrpreadmit_diff_full_features` as c
JOIN 
    `som-nero-phi-jonc101.shc_core.diagnosis_code` as x
ON 
    (c.anon_id = x.anon_id and c.pat_enc_csn_id_coded = x.pat_enc_csn_id_jittered)

WHERE (x.primary = 'Y')
"
# submit the query:
update.packages('Rcpp')
x2 <- dbGetQuery(con, q)
dim(x2)
colnames(x2)

“NAs produced by integer overflow”


In [10]:
head(x2)

anon_id,pat_enc_csn_id_coded,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
JCe5f7b6,,1,Hyponatremia,Y,N,,,Y,
JCda6b3f,,1,"MVC (motor vehicle collision), initial encounter",Y,N,,,Y,
JCd1ef38,,1,"Atherosclerosis of native coronary artery of native heart, angina presence unspecified",Y,N,,,N,
JCeb2df6,,3,"Malignant neoplasm of ovary, unspecified laterality (CMS-HCC)",Y,N,,,N,
JCd5f913,,1,Acute respiratory failure with hypoxia and hypercarbia (CMS-HCC),Y,N,,,Y,
JCd69bf6,,2,ALS (amyotrophic lateral sclerosis) (CMS-HCC),Y,N,,,N,


In [42]:
write.csv(x2, file.path(datadir, "4_2_diff_dx_primary.csv"), row.names=FALSE)

### Read the queried file back

In [10]:
adt <- read.csv(file.path(datadir, "4_2_diff_adt.csv"))
dim(adt) # should have 1385721
colnames(adt)

In [12]:
head(adt, 1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,admit_time,first_label,death_24hr_recent_label,death_24hr_max_label,pred_first,pred_death_24hr_recent,event_type_c,event_type,pat_class_c,pat_class,base_pat_class_c,base_pat_class,pat_service_c,pat_service,pat_lvl_of_care_c,pat_lv_of_care,accommodation_c,accomodation,in_event_type_c,in_event_type,out_event_type_c,out_event_type,from_base_class_c,from_base_class,to_base_class_c,to_base_class,seq_num_in_enc,seq_num_in_bed_min
Unnamed: 0_level_1,<chr>,<lgl>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>
1,JC2a1bd68,,2019-12-24 23:20:00,1,1,1,0.5241561,0.2500841,1,Admission,112,Emergency Services,3,Emergency,100,Emergency,,,10001,Ward,1,Admission,,,0,,3,Emergency,1,1


In [None]:
cohort %>% distinct(pat_enc_csn_id_coded, first_label, death_24hr_recent_label) %>% count(first_label, death_24hr_recent_label, sort=T)
adt %>% count(event_type, event_type_c, sort=TRUE)
adt %>% count(pat_class, pat_class_c, sort=TRUE)
adt %>% count(base_pat_class, base_pat_class_c, sort=TRUE)
adt %>% count(pat_service, pat_service_c, sort=TRUE)
adt %>% count(pat_lv_of_care, pat_lvl_of_care_c, sort=TRUE)
adt %>% count(accomodation, accommodation_c, sort=TRUE)
adt %>% count(in_event_type, in_event_type_c, sort=TRUE)
adt %>% count(out_event_type, out_event_type_c, sort=TRUE)
adt %>% count(from_base_class, from_base_class_c, sort=TRUE)
adt %>% count(to_base_class, to_base_class_c, sort=TRUE)
adt %>% count(seq_num_in_enc, sort=TRUE)
adt %>% count(seq_num_in_bed_min, sort=TRUE)

first_label,death_24hr_recent_label,n
<int>,<int>,<int>
1,1,119
1,0,98
0,0,90
0,1,17


event_type,event_type_c,n
<chr>,<int>,<int>
Census,6,419221
Transfer In,3,305337
Transfer Out,4,305337
Patient Update,5,218436
Admission,1,69334
Discharge,2,68056


pat_class,pat_class_c,n
<chr>,<int>,<int>
Inpatient,126,1182999
Emergency Services,112,200344
Observation,128,1727
OP Surgery/Procedure,122,651


base_pat_class,base_pat_class_c,n
<chr>,<int>,<int>
,,1246039
Emergency,3.0,69334
Inpatient,1.0,69334
Outpatient,2.0,1014


pat_service,pat_service_c,n
<chr>,<int>,<int>
Critical Care,151.0,283991
Medicine,39.0,191213
Emergency,100.0,141033
Emergency Medicine,187.0,120906
General Medicine (University),153.0,87235
Trauma,72.0,86612
General Surgery,59.0,81189
Cardiology,155.0,61527
General Medicine (PAMF),154.0,30827
Neurosurgery,62.0,30067


pat_lv_of_care,pat_lvl_of_care_c,n
<chr>,<int>,<int>
Acute Care (Assessment or intervention q4-8),5.0,561858
Critical Care,8.0,404635
IICU/Intermediate Care (Assessment or intervention q2-4),9.0,218827
,,200282
Newborn Nursery - VC Only,68.0,119


accomodation,accommodation_c,n
<chr>,<int>,<int>
Ward,10001,563973
Private,1,556328
Semi-Private,2,265420


in_event_type,in_event_type_c,n
<chr>,<int>,<int>
Census,6.0,419221
,,373393
Transfer In,3.0,305085
Patient Update,5.0,148340
Admission,1.0,139682


out_event_type,out_event_type_c,n
<chr>,<int>,<int>
Census,6.0,419221
,,374671
Transfer Out,4.0,305085
Patient Update,5.0,148340
Discharge,2.0,138404


from_base_class,from_base_class_c,n
<chr>,<int>,<int>
Inpatient,1,1114462
Emergency,3,200344
,0,69334
,4,930
Outpatient,2,651


to_base_class,to_base_class_c,n
<chr>,<int>,<int>
Inpatient,1,1115775
Emergency,3,200092
,0,68056
,4,930
Outpatient,2,868


seq_num_in_enc,n
<int>,<int>
1,69334
2,69334
3,69334
4,69334
5,69334
6,68365
7,67843
8,67487
9,66696
10,64255


seq_num_in_bed_min,n
<int>,<int>
1,1369116
2,16292
3,313


In [23]:
demo <- read.csv(file.path(datadir, "4_1_diff_demographic.csv"))
dim(demo) # should have 69334
colnames(demo)

In [24]:
head(demo,1)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,INTRPTR_NEEDED_YN,CHARLSON_SCORE,N_HOSPITALIZATIONS,DAYS_IN_HOSPITAL
Unnamed: 0_level_1,<chr>,<lgl>,<lgl>,<int>,<int>,<int>
1,JCd5f913,,False,11,14,96


In [20]:
# current diagnosis only, which we did not use for prediction
dx <- read.csv(file.path(datadir, "4_2_diff_dx_all.csv"))
dim(dx) # 8922
colnames(dx)

In [21]:
head(dx)

Unnamed: 0_level_0,anon_id,pat_enc_csn_id_coded,line,dx_name,primary,chronic,principal,hospital_pl,ed,present_on_adm
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,JCe340e5,131265564938,4,"Closed fracture of multiple ribs of right side, initial encounter",N,N,,,N,
2,JC2a28037,131272375066,2,Seizure disorder (CMS-HCC),N,N,,,Y,
3,JCd4a563,131281681345,6,Chronic atrial fibrillation,N,N,,,N,
4,JC2a2f2d2,131277062634,5,Acute pain,N,N,,,N,
5,JCd530d7,131284122975,1,"Chest pain, unspecified",,,,,,
6,JCcf7ee2,131277092263,5,"Intraparenchymal hematoma of brain with loss of consciousness of 30 minutes or less, unspecified laterality, initial encounter (CMS-HCC)",N,N,,,Y,


In [22]:
# only 120 of obs above 0.4

coh_dx <- cohort %>% distinct() %>% filter(abs_diff0_24 >= 0.4) %>% inner_join(dx) # %>% distinct()
nrow(coh_dx) #2152159 same as inner_join for threshold at 0.3 --> distinct 2148938, duplicates comes from cohort
length(unique(coh_dx$pat_enc_csn_id_coded))
length(unique(dx$pat_enc_csn_id_coded))
length(setdiff(coh_dx$pat_enc_csn_id_coded, dx$pat_enc_csn_id_coded))
length(setdiff(dx$pat_enc_csn_id_coded, coh_dx$pat_enc_csn_id_coded))

Joining, by = c("anon_id", "pat_enc_csn_id_coded")



In [23]:
summary(coh_dx$line)

coh_dx %>% count(primary, sort=TRUE)
coh_dx %>% count(chronic, sort=TRUE)
coh_dx %>% count(hospital_pl,  sort=TRUE)
coh_dx %>% count(principal, sort=TRUE)
coh_dx %>% count(ed, sort=TRUE)
coh_dx %>% count(present_on_adm, sort=TRUE)
coh_dx %>% count(dx_name, sort=TRUE) %>% head(20)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   1.00    4.00   10.00   13.01   19.00   68.00    7961 

primary,n
<chr>,<int>
,704027
N,114155
Y,26915


chronic,n
<chr>,<int>
,696946
N,145296
Y,2855


hospital_pl,n
<chr>,<int>
,837136
Y,6830
N,1131


principal,n
<chr>,<int>
,837136
N,6940
Y,1021


ed,n
<chr>,<int>
,704027
N,93512
Y,47558


present_on_adm,n
<chr>,<int>
Yes,443982
Exempt from POA reporting,172122
,169246
No,57782
Unknown,1965


Unnamed: 0_level_0,dx_name,n
Unnamed: 0_level_1,<chr>,<int>
1,"Hyperlipidemia, unspecified",12290
2,Other long term (current) drug therapy,11556
3,Long term (current) use of insulin,10728
4,Atherosclerotic heart disease of native coronary artery without angina pectoris,9241
5,Type 1 diabetes mellitus with ketoacidosis without coma,9107
6,"Major depressive disorder, single episode, unspecified",7726
7,Essential (primary) hypertension,7445
8,Weakness,7238
9,Acidosis,7057
10,Hypertensive emergency,7055


In [24]:
coh_dx %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/120, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,"Hyperlipidemia, unspecified",48,40.0
2,Essential (primary) hypertension,44,36.67
3,Other long term (current) drug therapy,43,35.83
4,Long term (current) use of insulin,40,33.33
5,Weakness,28,23.33
6,Atherosclerotic heart disease of native coronary artery without angina pectoris,26,21.67
7,Personal history of nicotine dependence,26,21.67
8,Long term (current) use of aspirin,25,20.83
9,Acidosis,24,20.0
10,Dehydration,24,20.0


In [25]:
coh_dx %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name) %>% 
    mutate(perc = round(100*n/120, 2)) %>% arrange(-n) %>% head(20)

Unnamed: 0_level_0,dx_name,n,perc
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),6,5.0
2,Diabetic ketoacidosis without coma associated with diabetes mellitus due to underlying condition (CMS-HCC),5,4.17
3,Diabetic ketoacidosis without coma associated with other specified diabetes mellitus (CMS-HCC),5,4.17
4,Diabetic ketoacidosis without coma associated with type 2 diabetes mellitus (CMS-HCC),5,4.17
5,Hypertensive emergency,3,2.5
6,Hyponatremia,3,2.5
7,"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",3,2.5
8,Acute upper GI bleeding,2,1.67
9,"Cerebrovascular accident (CVA), unspecified mechanism (CMS-HCC)",2,1.67
10,Diabetic ketoacidosis with coma associated with type 1 diabetes mellitus (CMS-HCC),2,1.67


In [69]:
cohed %>% filter(ed == "Y", primary=="Y") %>% count(dx_name, sort=TRUE) %>% head(20)

Unnamed: 0_level_0,dx_name,n
Unnamed: 0_level_1,<chr>,<int>
1,Septic shock (CMS-HCC),2358
2,Diabetic ketoacidosis without coma associated with type 1 diabetes mellitus (CMS-HCC),1985
3,Diabetic ketoacidosis without coma associated with diabetes mellitus due to underlying condition (CMS-HCC),1697
4,Hypertensive emergency,1320
5,Flash pulmonary edema (CMS-HCC),1243
6,Diabetic ketoacidosis without coma associated with other specified diabetes mellitus (CMS-HCC),1129
7,Diabetic ketoacidosis without coma associated with type 2 diabetes mellitus (CMS-HCC),930
8,Subdural hematoma (CMS-HCC),892
9,Acute respiratory failure with hypoxia and hypercapnia (CMS-HCC),802
10,"ST elevation myocardial infarction (STEMI), unspecified artery (CMS-HCC)",730
