## Descriptions:
- Check the characteristics of the cohort with predicted probability difference = 0.3) and transfer from 0 to 1

**Inputs**:  
- `1_4_cohort_test_results`
- `1_4_cohort_all_current_dx`
    
**Outputs**: (did not save anything here)
- `4_3_diff23tx01IDs` cohort test of abs_diff from 0.2 to 0.3 and transferred from 0 to 1 --> to re-id

### Importing R libraries

In [1]:
library(bigrquery)  # to query STARR-OMOP (stored in BigQuery) using SQL
library(tidyverse)
library(lubridate)

# options(repr.matrix.max.rows=250, repr.matrix.max.cols=30)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




In [2]:
# directories
datadir = "../../DataTD"
cohortdir = "../../OutputTD/1_cohort"
featuredir = "../../OutputTD/2_features"
modeldir4 = "../../OutputTD/3_models/1_4_cohort"
# modeldir4preadmit = "../../OutputTD/3_models/1_4_cohort_24hrpreadmit"
tabledir = "../../OutputTD/4_tables"

options(repr.matrix.max.rows=200, repr.matrix.max.cols=30)

In [3]:
cohort <- read.csv(file.path(modeldir4, "1_4_cohort_test_results.csv")) %>% distinct() %>% 
            select(anon_id, pat_enc_csn_id_coded, admit_time, 
                   first_label, death_24hr_recent_label, death_24hr_max_label,
                   pred_first, pred_death_24hr_recent, transfer, 
                   abs_diff0_24, diff0_True, diff24_True) 
nrow(cohort) # 2604
nrow(cohort %>% distinct())
nrow(cohort %>% select(anon_id) %>% distinct()) # 2328
nrow(cohort %>% select(pat_enc_csn_id_coded) %>% distinct()) # 2604

# count anon_id and csn, count unique in each group, but possible to have same ids in different groups
cohort %>%  distinct() %>% group_by(abs_diff0_24) %>% 
            summarise(nrows = n(), count_csn = n_distinct(pat_enc_csn_id_coded), count_mrn = n_distinct(anon_id)) %>% 
            arrange(-abs_diff0_24) %>% mutate(cum_csn = cumsum(count_csn), cum_mrn = cumsum(count_mrn))

abs_diff0_24,nrows,count_csn,count_mrn,cum_csn,cum_mrn
<dbl>,<int>,<int>,<int>,<int>,<int>
0.7,3,3,3,3,3
0.6,7,7,7,10,10
0.5,27,27,24,37,34
0.4,80,80,77,117,111
0.3,206,206,204,323,315
0.2,545,545,525,868,840
0.1,1736,1736,1620,2604,2460
0.0,9814,9814,7979,12418,10439


In [7]:
# get cohort with abs_diff from 0.2 to 0.3 and with transfer
sum(cohort$transfer)

nrow(cohort %>% filter(abs_diff0_24 >=0.2, abs_diff0_24 <=0.3, transfer==1) %>% distinct(pat_enc_csn_id_coded))


nrow(cohort %>% filter(abs_diff0_24 >=0.2, abs_diff0_24 <=0.3, first_label == 1, death_24hr_recent_label ==0) %>% 
                distinct(pat_enc_csn_id_coded)) # 139

nrow(cohort %>% filter(abs_diff0_24 >=0.2, abs_diff0_24 <=0.3, first_label == 0, death_24hr_recent_label ==1) %>% 
                distinct(pat_enc_csn_id_coded)) #36

diff23tx01 <- cohort %>% filter(abs_diff0_24 >=0.2, abs_diff0_24 <=0.3, first_label == 0, death_24hr_recent_label ==1) %>% 
                distinct(anon_id) # 89 csn for absdiff from 0.1 to 0.3 and 36 csn for 0.2 to 0.3

nrow(diff23tx01)
# write.csv(diff23tx01, file.path(tabledir, "4_3_diff23tx01IDs_35.csv"), row.names=FALSE)

### Checking diagnosis to compare with total cohort

In [5]:
# all dx from 1_4_cohort, SQL query
dx <- read.csv(file.path(datadir, "1_4_cohort_all_current_dx.csv"))
nrow(dx) #1239918

ndx = length(unique(dx$pat_enc_csn_id_coded)) # full cohort is 43980
ndx

In [6]:
# cohort transfers with diagnoses
dx_diff <- left_join(diff23tx01, dx)
nrow(dx_diff)
ndf = length(unique(dx_diff$pat_enc_csn_id_coded))
ndf
length(unique(dx_diff$anon_id))

dx_diff %>% distinct(pat_enc_csn_id_coded, dx_name) %>% count(dx_name, sort=TRUE) %>% head(20)

Joining, by = "anon_id"



Unnamed: 0_level_0,dx_name,n
Unnamed: 0_level_1,<chr>,<int>
1,Other long term (current) drug therapy,37
2,Type 2 diabetes mellitus with diabetic chronic kidney disease,28
3,"Acute kidney failure, unspecified",26
4,"Hyperlipidemia, unspecified",25
5,Acidosis,24
6,Personal history of nicotine dependence,24
7,Long term (current) use of insulin,23
8,Weakness,22
9,Atherosclerotic heart disease of native coronary artery without angina pectoris,21
10,"Hypertensive heart and chronic kidney disease with heart failure and stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease",21


In [41]:
# only 117 of obs above 0.4
# all visits in diff4 have diagnoses

ndfn = nrow(dx_diff %>% distinct(pat_enc_csn_id_coded, dx_name))
ndfn

length(setdiff(dx_diff$pat_enc_csn_id_coded, dx$pat_enc_csn_id_coded))
length(setdiff(dx$pat_enc_csn_id_coded, dx_diff$pat_enc_csn_id_coded))

colnames(dx_diff)

In [8]:
summary(dx_diff$line)

dx_diff %>% count(primary, sort=TRUE)
dx_diff %>% count(chronic, sort=TRUE)
dx_diff %>% count(hospital_pl,  sort=TRUE)
dx_diff %>% count(principal, sort=TRUE)
dx_diff %>% count(ed, sort=TRUE)
dx_diff %>% count(present_on_adm, sort=TRUE)
# dx_diff %>% count(dx_name, sort=TRUE) %>% head(20)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    1.0     4.0    10.0    13.1    20.0    70.0      67 

primary,n
<chr>,<int>
,2472
N,514
Y,102


chronic,n
<chr>,<int>
,2436
N,641
Y,11


hospital_pl,n
<chr>,<int>
,3021
Y,50
N,17


principal,n
<chr>,<int>
,3021
N,60
Y,7


ed,n
<chr>,<int>
,2503
N,434
Y,151


present_on_adm,n
<chr>,<int>
Yes,1544
,745
Exempt from POA reporting,505
No,276
Unknown,18


In [9]:
dx_diff %>% filter(ed=="Y", primary=="Y") %>% distinct(pat_enc_csn_id_coded, dx_name) %>% 
            count(dx_name, sort=TRUE) %>% filter(n>1)

dx_name,n
<chr>,<int>
Hypoxia,3
"Acute on chronic heart failure, unspecified heart failure type (CMS-HCC)",2
Cholangiocarcinoma (CMS-HCC),2
HCAP (healthcare-associated pneumonia),2
"Motor vehicle collision, initial encounter",2
Syncope and collapse,2
