# Benchmarks

## Initialize

In [None]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

In [None]:
# Choose the storage root from the host name: any node whose name contains
# the literal substring "sc" uses the /sc-projects mount, every other host
# uses the shared analysis mount.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
    "/sc-projects/sc-proj-ukb-cvd"
} else {
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project directory layout, all derived from base_path.
project_label <- "22_medical_records"
project_path <- glue("{base_path}/results/projects/{project_label}")
figure_path <- glue("{project_path}/figures")
output_path <- glue("{project_path}/data")

# Identifier of the experiment whose outputs are read further down; it is
# also the name of the sub-directory below output_path.
experiment <- 220613
experiment_path <- glue("{output_path}/{experiment}")

In [None]:
# Endpoint metadata table. `phecode` is forced to character on read
# (presumably so codes are not parsed as numbers -- confirm), and three
# columns that are not used below are dropped.
endpoints_md <- glue("{output_path}/220531/endpoints.csv") %>%
    fread(colClasses = c("phecode" = "character")) %>%
    select(-c(ICD10_only, phecode_top, leaf)) %>%
    as_tibble()

# Sorted vector of all endpoint identifiers.
endpoints <- sort(endpoints_md[["endpoint"]])

In [None]:
# Load the benchmark results for this experiment from
# <experiment_path>/<name>.feather.
# (A follow-up `left_join(endpoint_defs)` was disabled in the original cell.)
name <- "benchmarks_cindex_220615"
benchmark_endpoints <- glue("{experiment_path}/{name}.feather") %>%
    arrow::read_feather()

In [None]:
# Keep only the two scores being compared, spread them into one column per
# score, and compute delta = (Age+Sex+MedicalHistory) - (Age+Sex).
benchmark_clean <- benchmark_endpoints %>%
    filter(score %in% c("Age+Sex", "Age+Sex+MedicalHistory")) %>%
    pivot_wider(names_from = score, values_from = cindex) %>%
    mutate(delta = `Age+Sex+MedicalHistory` - `Age+Sex`)

# One row per endpoint holding the median of each score column and of delta.
# median() is called without na.rm, so a single NA within an endpoint makes
# that endpoint's median NA.
benchmark_agg <- benchmark_clean %>%
    group_by(endpoint) %>%
    summarise(
        across(c(`Age+Sex+MedicalHistory`, `Age+Sex`, delta), median)
    )

In [None]:
# Attach the per-endpoint median scores and delta to the endpoint metadata.
# Endpoints without benchmark results are kept, with NA in the score columns.
# The join key is stated explicitly: without `by`, left_join() joins on every
# column name the two tables happen to share (and prints a message), so an
# accidental name overlap would silently change the key.
do_md <- endpoints_md %>% #mutate(endpoint = as.character(endpoint)) %>%
    left_join(benchmark_agg, by = "endpoint") %>%
    as_tibble()

In [None]:
## Ad-hoc endpoint lookup (cell originally titled "cardio endpoints with
## interventions"): rows whose phecode contains "438", most frequent first.

# Raise the notebook's display limits so wide/long tables are shown in full.
options(repr.matrix.max.rows = 600, repr.matrix.max.cols = 200)

# Alternative filters / orderings kept for interactive exploration:
#   filter(str_detect(phecode_category, "Cardio"))
#   filter(str_detect(phecode_string, "aneurysm"))
#   filter(!str_detect(phecode, "\\."))
#   filter(freq>0.001)
#   filter(str_detect(phecode_string, "Embolism"))
#   sample_n(10)
#   filter(delta>0.05)
#   arrange(desc(delta))
#   arrange(desc(ratio))
do_md %>%
    filter(str_detect(phecode, "438")) %>%
    arrange(desc(freq))

In [None]:
# List the distinct phecode categories present in the endpoint table.
unique(do_md[["phecode_category"]])

In [None]:
# Hand-picked endpoints, listed in the order in which they should appear.
# Entries are matched against `phecode_string`, so each one must equal a
# value of that column exactly; a name without a match is silently absent
# from the result. Trailing comments are the author's annotations.
endpoint_selection_order <- c(
    "Hypertension", # intervention
    "Diabetes mellitus", # intervention
    "Atrial fibrillation", # intervention
    "Ischemic heart disease",
    "Myocardial infarction [Heart attack]", # intervention
    "Cerebral infarction [Ischemic stroke]",
    "Heart failure", # intervention
    "Pneumonia", # intervention
    "Chronic obstructive pulmonary disease [COPD]", # intervention
    "Chronic kidney disease", # intervention
    "Cardiac arrest", # intervention
    "All-Cause Death", # intervention

    "Aortic stenosis", # intervention
    "Mitral valve insufficiency",
    "Endocarditis",
    "Pulmonary embolism", # intervention
    "Abdominal aortic aneurysm",
    "Rheumatic fever and chronic rheumatic heart diseases",

    "Back pain", # intervention
    "Anemia", # intervention
    "Rheumatoid arthritis", # NEW + intervention
    "Psoriasis", # interesting
    "Parkinson's disease (Primary)",
    "Suicide ideation and attempt or self harm" # intervention
)

# Restrict the endpoint table to the curated names and order the rows by the
# curated list. The preliminary sort by numeric phecode presumably only
# decides the relative order of rows that share a phecode_string.
endpoint_selection <- do_md %>%
    filter(phecode_string %in% endpoint_selection_order) %>%
    arrange(as.numeric(phecode)) %>%
    mutate(
        phecode_string = factor(phecode_string, levels = endpoint_selection_order)
    ) %>%
    arrange(phecode_string)

# Display the selection.
endpoint_selection

In [None]:
# Print the selected endpoint identifiers, one single-quoted name per line
# (convenient for pasting elsewhere). sep = "" is needed because cat()
# separates its elements with a space by default, which would put a stray
# leading blank on every line after the first.
cat(paste0("'", endpoint_selection$endpoint, "'\n"), sep = "")

In [None]:
# List the distinct phecode categories present in the endpoint table.
unique(do_md[["phecode_category"]])

In [None]:
# Endpoints whose median delta is below 0.02, smallest delta first.
# Rows with a missing delta are dropped by filter().
# Disabled alternatives from the original cell:
#   filter(leaf==0)
#   filter(str_detect(phecode_string, "ardio"))
#   filter(phecode_category == "Resp")
do_md %>%
    filter(delta < 0.02) %>%
    arrange(delta)