# Benchmarks

## Initialize

In [1]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

── [1mAttaching core tidyverse packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to fo

In [2]:
# Root locations for this analysis; the data paths below are derived from base_path.
base_path <- "/home/jakobs"
project_path <- glue("{base_path}/data")

# Experiment identifier; it is also the last component of experiment_path.
experiment <- "231117"
experiment_path <- glue("{project_path}/{experiment}")

# Endpoint metadata restricted to rows with n_epic >= 100.
# The same table is bound to both endpoint_defs and endpoints_md.
endpoint_defs <- arrow::read_feather(glue("{base_path}/data/endpoints_epic_md.feather")) %>%
  filter(n_epic >= 100)
endpoints_md <- endpoint_defs
endpoints <- endpoint_defs$endpoint

In [3]:
# Alias of the experiment identifier.
today <- experiment

In [4]:
# Size constants shared by the figures in this notebook.
base_size <- 8
title_size <- 10
facet_size <- 10
geom_text_size <- 3

# Install a customised classic theme as the session default.
# Line widths are given via `linewidth`: the `size` argument of element_line()
# is deprecated as of ggplot2 3.4.0 and triggers a warning.
theme_set(
  theme_classic(base_size = base_size) +
    theme(
      strip.background = element_blank(),
      plot.title = element_text(size = title_size, hjust = 0),
      strip.text.x = element_text(size = facet_size),
      axis.title = element_text(size = 10),
      axis.text = element_text(size = 8, color = "black"),
      legend.position = "bottom",
      axis.line = element_line(linewidth = 0.2),
      axis.ticks = element_line(linewidth = 0.2),
      panel.grid.major = element_line()
    )
)

“[1m[22mThe `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
[36mℹ[39m Please use the `linewidth` argument instead.”


In [6]:
# Named lookup: endpoint id -> phecode_string label.
endpoint_map <- setNames(endpoint_defs$phecode_string, endpoint_defs$endpoint)
# Default ordering: endpoints in the order they appear in endpoint_defs.
endpoint_order <- endpoints

In [7]:
# Hand-picked endpoints of interest. Trailing comments carry the author's
# label for each code; blank lines separate the author's groups.
endpoint_selection <- c(
  # generally very important
  "phecode_202",    # Diabetes mellitus
  "phecode_401",    # Hypertension
  "phecode_404",    # Ischemic heart disease
  "phecode_404-1",  # Myocardial infarction [Heart attack]
  "phecode_431-11", # Cerebral infarction [Ischemic stroke]
  "phecode_424",    # Heart failure

  "phecode_059-1",  # COVID 19
  "phecode_468",    # Pneumonia
  "phecode_474",    # Chronic obstructive pulmonary disease [COPD]

  "phecode_286-2",  # Major depressive disorder
  "phecode_324-11", # Parkinson's Disease
  "phecode_328",    # Dementias and cerebral degeneration

  "phecode_164",    # Anemia
  "phecode_726-1",  # Osteoporosis
  "phecode_371",    # Cataract
  "phecode_374-42", # Diabetic retinopathy
  "phecode_374-5",  # Macular degeneration
  "phecode_375-1",  # Glaucoma

  "phecode_103",    # Malignant neoplasm of the skin
  "phecode_101",    # Malignant neoplasm of the digestive organs
  "phecode_102",    # Lung cancer

  "phecode_583",    # Chronic kidney disease
  "phecode_542",    # Chronic liver disease and sequelae
  "OMOP_4306655"    # All-Cause Death

  # Currently disabled candidates (kept for reference):
  # also generally important and relevant
  # "phecode_440-3", # Pulmonary embolism
  # "phecode_468-1", # Viral pneumonia
  # "phecode_460-2", # Acute lower respiratory infection
  # "phecode_388"    # Blindness and low vision
  # generally important and fun to check
  # "phecode_374-3", # Retinal vascular changes and occlusions
  # "phecode_665",   # Psoriasis
  # "phecode_121",   # Leukemia
  # important for eye
  # "phecode_705-1", # Rheumatoid arthritis
)

# Endpoint subset named "common"; comments give the author's label per code.
endpoints_common <- c(
  "phecode_164",    # Anemia
  "phecode_705-1",  # Rheumatoid arthritis
  "phecode_328",    # Dementias and cerebral degeneration
  "phecode_328-1",  # Alzheimer's disease
  "phecode_401",    # Hypertension
  "phecode_202",    # Diabetes mellitus
  "phecode_416-21", # Atrial fibrillation
  "phecode_404-1",  # Myocardial infarction [Heart attack]
  "phecode_424",    # Heart failure
  "phecode_468",    # Pneumonia
  "phecode_474",    # Chronic obstructive pulmonary disease [COPD]
  "phecode_583",    # Chronic kidney disease
  "OMOP_4306655"    # All-Cause Death
)
    
# First cardio endpoint list.
# NOTE: endpoints_cardio is assigned again further down in this notebook,
# which replaces the value defined here.
endpoints_cardio <- c(
  "phecode_438-11", # Abdominal aortic aneurysm
  "phecode_440-3",  # Pulmonary embolism (intervention)
  "phecode_413-21", # Aortic stenosis (intervention)
  "phecode_400"     # Rheumatic fever and chronic rheumatic heart diseases
)

# Eye-related endpoint subset; comments give the author's label per code.
endpoints_eye <- c(
  "phecode_374-5",  # Macular degeneration
  "phecode_374-51", # Age-related macular degeneration
  "phecode_374-42", # Diabetic retinopathy
  "phecode_371",    # Cataract
  "phecode_388",    # Blindness and low vision
  "phecode_367-5",  # Uveitis
  "phecode_389-1"   # Ocular pain
)

In [8]:
# Add a display `name` per endpoint: a shortened label for the phecode_string
# values listed below, and the unchanged phecode_string for everything else.
# Two entries map a label onto itself; they are kept so the list stays explicit.
endpoint_defs <- endpoint_defs %>%
  mutate(
    name = case_when(
      phecode_string == "Myocardial infarction [Heart attack]" ~ "Myocardial infarction",
      phecode_string == "Cerebral infarction [Ischemic stroke]" ~ "Ischemic stroke",
      phecode_string == "Chronic obstructive pulmonary disease [COPD]" ~ "Chronic obstructive pulmonary disease",
      phecode_string == "Mitral valve insufficiency" ~ "Mitral insufficiency",
      phecode_string == "Parkinson's disease (Primary)" ~ "Parkinson's disease",
      phecode_string == "Suicide ideation and attempt or self harm" ~ "Suicide attempt",
      phecode_string == "Ischemic heart disease" ~ "Coronary heart disease",
      phecode_string == "Chronic kidney disease" ~ "Chronic kidney disease",
      phecode_string == "Rheumatic fever and chronic rheumatic heart diseases" ~ "Rheumatic heart disease",
      phecode_string == "Abdominal aortic aneurysm" ~ "Abdominal aortic aneurysm",
      .default = phecode_string
    )
  )
            
# Rebuild the endpoint id -> label lookup from the display names.
endpoint_map <- setNames(endpoint_defs$name, endpoint_defs$endpoint)
#endpoint_order = (endpoint_defs %>% arrange(as.numeric(phecode)))$endpoint
# Order endpoints as they are listed in endpoint_selection.
endpoint_order <- endpoint_selection

In [9]:
# Cardio endpoint list; this assignment replaces the earlier endpoints_cardio.
endpoints_cardio <- c(
  "phecode_431-11", # Cerebral infarction [Ischemic stroke]
  "phecode_404",    # Ischemic heart disease
  "phecode_404-1",  # Myocardial infarction [Heart attack] (intervention)
  "phecode_424",    # Heart failure (intervention)
  "OMOP_4306655",   # All-Cause Death (intervention)
  "phecode_420"     # Cardiac arrest (intervention)
)

## Load data

# Load Benchmarks

In [10]:
# Load the per-partition hrs_* estimates for this experiment and attach the
# endpoint metadata.
name <- "hrs_endpoints"
hrs_endpoints <- arrow::read_feather(
  glue("{experiment_path}/{name}.feather"),
  col_select = c("endpoint", "score", "partition", "hrs_ret", "hrs_ret_age", "hrs_ret_sex")
) %>%
  # The join key is stated explicitly instead of relying on dplyr's
  # natural-join guess (which also prints a "Joining with ..." message).
  left_join(endpoint_defs, by = "endpoint") %>%
  # Rows without matching metadata have NA n_epic and are dropped here as well.
  filter(n_epic >= 100)

[1m[22mJoining with `by = join_by(endpoint)`


In [11]:
# Preview the joined table.
hrs_endpoints

endpoint,score,partition,hrs_ret,hrs_ret_age,hrs_ret_sex,n_epic,eligable,n,freq,phecode_string,phecode_category,name
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
OMOP_4306655,ASCVD+Retina,0,1.3075194,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,1,1.2531733,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,2,1.3580517,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,3,1.3436590,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,4,1.2810884,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,5,1.2268150,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,6,1.2530525,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,7,0.8693816,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,8,1.3288605,,,111,61213,3471,0.05670364,Death,Event,Death
OMOP_4306655,ASCVD+Retina,9,1.1898374,,,111,61213,3471,0.05670364,Death,Event,Death


In [12]:
# Keep the identifying columns plus the three hrs_* estimates.
# A plain column selection needs no grouping, so none is applied.
# Disabled steps kept for reference:
#filter(score %in% c("Retina", "Age+Sex+Retina", "Age+Sex+Retina+I(Age*ret)")) %>%
#mutate(endpoint = factor(endpoint, levels=endpoints_sorted)) %>%
#mutate(phecode_category = factor(phecode_category, levels=pcats_sorted)) %>%
hrs <- hrs_endpoints %>%
  select(
    endpoint, score,
    hrs_ret, hrs_ret_age, hrs_ret_sex,
    phecode_string, phecode_category
  )
hrs

endpoint,score,hrs_ret,hrs_ret_age,hrs_ret_sex,phecode_string,phecode_category
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
OMOP_4306655,ASCVD+Retina,1.3075194,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.2531733,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.3580517,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.3436590,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.2810884,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.2268150,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.2530525,,,Death,Event
OMOP_4306655,ASCVD+Retina,0.8693816,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.3288605,,,Death,Event
OMOP_4306655,ASCVD+Retina,1.1898374,,,Death,Event


In [13]:
# Distinct score labels present in hrs.
unique(hrs[["score"]])

In [14]:
# Bootstrap: build 100 resamples of hrs, each drawing rows with replacement
# within every endpoint/score group; column `i` holds the resample index.
# NOTE(review): no RNG seed is set in the visible notebook, so the draws
# (and everything derived from them) differ between runs.
temp_hrs <- purrr::map_dfr(
  seq_len(100),
  ~ hrs %>%
    group_by(endpoint, score) %>%
    sample_frac(1, replace = TRUE),
  .id = "i"
)

In [15]:
library(ggdist)

In [16]:
# median_qi() summary of the three hrs_* columns per endpoint/score over the
# bootstrap rows, with missing values removed.
temp_hrs_ret <- temp_hrs %>%
  group_by(endpoint, score, phecode_string, phecode_category) %>%
  median_qi(hrs_ret, hrs_ret_age, hrs_ret_sex, na.rm = TRUE)

In [17]:
# Wide table of formatted hrs_ret summaries: one row per endpoint, one
# "estimate (lower, upper)" string column per retained score.
# Currently disabled scores: SCORE2+Retina, ASCVD+Retina, QRISK3+Retina.
temp_hrs_linear <- temp_hrs %>%
  ungroup() %>%
  filter(score %in% c("Retina", "Age+Sex+Retina")) %>%
  group_by(endpoint, score, phecode_string, phecode_category) %>%
  median_qi(hrs_ret, na.rm = TRUE) %>%
  rename(hr = hrs_ret) %>%
  # Round the point estimate and both interval bounds to two decimals.
  mutate(across(c(hr, .lower, .upper), ~ round(.x, 2))) %>%
  mutate(HR_Retina = as.character(glue("{hr} ({.lower}, {.upper})"))) %>%
  select(endpoint, score, phecode_string, phecode_category, HR_Retina) %>%
  pivot_wider(names_from = "score", values_from = "HR_Retina") %>%
  select(all_of(c(
    "phecode_category", "endpoint", "phecode_string",
    "Retina", "Age+Sex+Retina"
  ))) %>%
  #mutate(endpoint = factor(endpoint, levels = endpoint_order)) %>%
  arrange(endpoint) %>%
  #mutate(endpoint = recode(endpoint, !!!endpoint_map)) %>%
  ungroup()

temp_hrs_linear

phecode_category,endpoint,phecode_string,Retina,Age+Sex+Retina
<chr>,<chr>,<chr>,<chr>,<chr>
Event,OMOP_4306655,Death,"3.29 (2.68, 3.56)","0.86 (0.77, 1.12)"
ID,phecode_089,Infections,"5.14 (4.8, 5.6)","2.29 (2.11, 2.51)"
ID,phecode_089-1,Bacterial infections,"5.24 (4.8, 5.83)","2.36 (2.09, 2.59)"
ID,phecode_089-2,Viral infections,"3.48 (3.26, 3.92)","1.46 (1.18, 1.73)"
ID,phecode_089-3,Fungal infections,"2.95 (2.58, 3.42)","2.47 (1.86, 3.03)"
ID,phecode_092,"Bacteremia, Sepsis, and SIRS","2.74 (2.6, 2.86)","1.21 (1.09, 1.34)"
ID,phecode_092-2,Sepsis,"2.76 (2.61, 2.86)","1.16 (1.08, 1.32)"
Neoplasms,phecode_101,Malignant neoplasm of the digestive organs,"3.08 (2.65, 3.36)","2 (1.74, 2.21)"
Neoplasms,phecode_101-4,Malignant neoplasm of the colon and rectum,"2.54 (2.28, 2.75)","1.56 (1.41, 1.68)"
Neoplasms,phecode_101-41,Malignant neoplasm of the colon,"2.05 (1.76, 2.31)","1.11 (0.95, 1.27)"


In [18]:
# Read the UKB table; the path is relative to the working directory.
ukb <- fread("UKB_SupplTable2_HRsLinearAll_CropRatio-0.66.csv")

In [20]:
# EPIC results keyed by endpoint. Columns are picked and renamed by name
# rather than by position, so a change in the upstream column order cannot
# silently attach the wrong label to a column.
epic <- temp_hrs_linear %>%
  select(
    endpoint,
    epic_unadjusted = Retina,
    epic_agesex = `Age+Sex+Retina`
  )
epic

endpoint,epic_unadjusted,epic_agesex
<chr>,<chr>,<chr>
OMOP_4306655,"3.29 (2.68, 3.56)","0.86 (0.77, 1.12)"
phecode_089,"5.14 (4.8, 5.6)","2.29 (2.11, 2.51)"
phecode_089-1,"5.24 (4.8, 5.83)","2.36 (2.09, 2.59)"
phecode_089-2,"3.48 (3.26, 3.92)","1.46 (1.18, 1.73)"
phecode_089-3,"2.95 (2.58, 3.42)","2.47 (1.86, 3.03)"
phecode_092,"2.74 (2.6, 2.86)","1.21 (1.09, 1.34)"
phecode_092-2,"2.76 (2.61, 2.86)","1.16 (1.08, 1.32)"
phecode_101,"3.08 (2.65, 3.36)","2 (1.74, 2.21)"
phecode_101-4,"2.54 (2.28, 2.75)","1.56 (1.41, 1.68)"
phecode_101-41,"2.05 (1.76, 2.31)","1.11 (0.95, 1.27)"


In [21]:
# Attach the EPIC columns to every UKB row (unmatched endpoints get NA),
# convert to a tibble and sort by endpoint.
st6 <- ukb %>%
  left_join(epic, by = "endpoint") %>%
  as_tibble() %>%
  arrange(endpoint)

In [22]:
# Replace missing values in every column with an empty string; column names
# are left unchanged.
st6_clean <- st6 %>%
  mutate(across(everything(), ~ replace_na(.x, "")))

In [23]:
# Preview the merged table after NA replacement.
st6_clean

endpoint,phecode_string,Retina,Age+Sex+Retina,SCORE2+Retina,ASCVD+Retina,QRISK3+Retina,epic_unadjusted,epic_agesex
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
OMOP_4306655,All-Cause Death,"3.32 (3.22, 3.78)","2.73 (2.61, 3)","2.47 (2.34, 2.67)","2.42 (2.33, 2.64)","2.35 (2.23, 2.51)","3.29 (2.68, 3.56)","0.86 (0.77, 1.12)"
phecode_002,Staphylococcus,"3.39 (3.26, 3.89)","3.67 (3.27, 4.04)","3.26 (3.02, 3.74)","3.21 (2.83, 3.39)","2.85 (2.58, 3.1)",,
phecode_002-1,Staphylococcus aureus,"3.2 (3.12, 4.06)","3.48 (3.24, 4.24)","3.33 (3.06, 3.93)","3.25 (2.91, 3.65)","2.97 (1.47, 3.46)",,
phecode_003,Escherichia coli,"3.24 (3.09, 3.62)","3.59 (3.55, 4.38)","3.41 (3.29, 4.21)","3.29 (3.1, 4.13)","2.91 (2.72, 3.69)",,
phecode_004,Streptococcus,"3.51 (3.34, 4.07)","4.07 (3.91, 4.58)","3.89 (3.55, 4.16)","3.81 (3.33, 4.05)","3.39 (2.92, 3.64)",,
phecode_009,Pseudomonas,"3.58 (3.13, 3.92)","3.35 (2.51, 4.33)","3.16 (2.17, 3.86)","3.16 (2.13, 3.7)","2.65 (1.89, 3.03)",,
phecode_011,Klebsiella,"3.55 (3.15, 3.87)","3.86 (3.48, 5)","3.17 (3.13, 4.76)","3.14 (3.04, 4.7)","2.74 (2.64, 4.2)",,
phecode_015,Clostridium,"3.65 (3.1, 4.19)","4.46 (3.46, 4.93)","4.15 (3.29, 4.49)","3.64 (3.17, 4.34)","3.36 (2.92, 4.02)",,
phecode_015-2,Clostridium difficile,"3.94 (3.01, 4.2)","4.28 (3.38, 5.06)","3.65 (3.23, 4.68)","3.63 (3.13, 4.47)","3.25 (2.87, 4.16)",,
phecode_052,Herpesvirus,"3.38 (3.23, 4.09)","4.82 (4.41, 5.61)","4.7 (4.44, 5.75)","5.24 (4.69, 7.61)","4.56 (1.24, 7.52)",,


In [24]:
# Write the merged table to disk (path relative to the working directory).
write_csv(st6_clean, "outputs/SupplTable6_ukbepic_merged.csv")

In [34]:
# List endpoints whose `Age+Sex+Retina` and epic_agesex intervals overlap.
# Bounds are parsed from the "(lower, upper)" part of each string; rows where
# either interval cannot be parsed (e.g. an empty epic_agesex) get NA and are
# removed by the final filter.
st6_clean %>%
  select(endpoint, phecode_string, `Age+Sex+Retina`, epic_agesex) %>%
  mutate(
    AgeSexRetina_lower = as.numeric(
      str_match(`Age+Sex+Retina`, "\\(([^,]+), ([^)]+)\\)")[, 2]
    ),
    AgeSexRetina_upper = as.numeric(
      str_match(`Age+Sex+Retina`, "\\(([^,]+), ([^)]+)\\)")[, 3]
    ),
    epic_agesex_lower = as.numeric(
      str_match(epic_agesex, "\\(([^,]+), ([^)]+)\\)")[, 2]
    ),
    epic_agesex_upper = as.numeric(
      str_match(epic_agesex, "\\(([^,]+), ([^)]+)\\)")[, 3]
    )
  ) %>%
  mutate(
    overlap = case_when(
      is.na(AgeSexRetina_lower) | is.na(epic_agesex_lower) ~ NA_character_,
      # Disjoint: one interval ends before the other begins.
      AgeSexRetina_upper < epic_agesex_lower | epic_agesex_upper < AgeSexRetina_lower ~ "-",
      AgeSexRetina_lower <= epic_agesex_upper & epic_agesex_lower <= AgeSexRetina_upper ~ "+",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(epic_agesex_upper), overlap == "+") %>%
  select(endpoint, phecode_string, `Age+Sex+Retina`, epic_agesex, overlap)

endpoint,phecode_string,Age+Sex+Retina,epic_agesex,overlap
<chr>,<chr>,<chr>,<chr>,<chr>
phecode_089-3,Fungal infections,"3.07 (2.81, 3.79)","2.47 (1.86, 3.03)",+
phecode_101,Malignant neoplasm of the digestive organs,"2.15 (1.95, 2.26)","2 (1.74, 2.21)",+
phecode_101-4,Malignant neoplasm of the colon and rectum,"1.91 (1.66, 1.98)","1.56 (1.41, 1.68)",+
phecode_105,Malignant neoplasm of the breast,"1.08 (1.04, 1.24)","1.35 (1.22, 1.42)",+
phecode_112,Malignant neoplasm of other and ill-defined sites,"1.69 (1.58, 1.91)","1.57 (1.42, 1.69)",+
phecode_136,Benign neoplasm of the digestive organs,"2 (1.85, 2.18)","1.94 (1.53, 2.31)",+
phecode_138,Benign neoplasm of the skin,"3.41 (2.8, 3.91)","3.43 (1.73, 13.03)",+
phecode_181,Autoimmune disease,"2.91 (2.65, 3.46)","3.88 (3.14, 4.77)",+
phecode_200,Disorders of thyroid gland,"1.85 (1.66, 1.99)","1.92 (1.64, 1.99)",+
phecode_200-1,Hypothyroidism,"1.85 (1.68, 1.98)","1.79 (1.52, 1.88)",+


In [25]:
# new, after bugfix
# Show the rows for the endpoints in endpoint_selection, without the category column.
temp_hrs_linear %>%
  select(-phecode_category) %>%
  filter(endpoint %in% endpoint_selection)

endpoint,phecode_string,Retina,Age+Sex+Retina
<chr>,<chr>,<chr>,<chr>
OMOP_4306655,Death,"3.29 (2.68, 3.56)","0.86 (0.77, 1.12)"
phecode_101,Malignant neoplasm of the digestive organs,"3.08 (2.65, 3.36)","2 (1.74, 2.21)"
phecode_103,Malignant neoplasm of the skin,"3.37 (3.12, 3.53)","1.2 (1.15, 1.49)"
phecode_164,Anemia,"3.22 (3.09, 3.39)","1.71 (1.57, 1.75)"
phecode_202,Diabetes mellitus,"2.39 (2.28, 2.52)","2.03 (1.98, 2.25)"
phecode_286-2,Major depressive disorder,"2.02 (1.7, 2.2)","1.36 (1.08, 1.48)"
phecode_328,Dementias and cerebral degeneration,"3.67 (3.55, 3.94)","1.48 (1.39, 1.61)"
phecode_371,Cataract,"3.02 (2.93, 3.06)","2.72 (2.62, 2.77)"
phecode_401,Hypertension,"2.96 (2.89, 2.99)","1.62 (1.56, 1.7)"
phecode_404,Ischemic heart disease,"2.81 (2.68, 2.96)","1.19 (1.11, 1.27)"
