# Benchmarks

## Initialize

In [None]:
#library(Rmisc)
#library(dtplyr)
library(tidyverse)
library(tidytable)
library(glue)
library(arrow)
library(patchwork)
#library(data.table)
library("jsonlite")
library(ggthemes)

In [None]:
# Pick the storage root from the host name: hosts whose node name contains
# "sc" use the /sc-projects mount, all others the shared analysis mount.
base_path = if (grepl("sc", Sys.info()[["nodename"]], fixed=TRUE)) {
    "/sc-projects/sc-proj-ukb-cvd"
} else {
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project identifiers.
project_label = "22_medical_records"
experiment = 230425

# Derived locations: project root, figure folder, data folder and the
# experiment-specific data folder.
project_path = glue("{base_path}/results/projects/{project_label}")
figure_path = glue("{project_path}/figures")
output_path = glue("{project_path}/data")
experiment_path = glue("{output_path}/{experiment}")

In [None]:
library(data.table)
library(arrow)

In [None]:
# Endpoint metadata of the experiment; the phecode column is read as character
# instead of letting fread guess its type.
endpoints_md = glue("{experiment_path}/endpoints.csv") %>%
    fread(colClasses=c("phecode"="character"))

# Sorted vector of all endpoint identifiers.
endpoints = endpoints_md$endpoint %>% sort()

In [None]:
# Phecode definition table, ordered by endpoint identifier.
endpoint_defs = glue("{output_path}/phecode_defs_220306.feather") %>%
    arrow::read_feather() %>%
    arrange(endpoint)

In [None]:
# Endpoints shown in the figures, listed in display order. The trailing
# comments carry over the labels and remarks of the original annotations.
endpoint_selection = c(
    "phecode_401",     # Hypertension (intervention)
    "phecode_202",     # Diabetes mellitus (intervention)
    "phecode_416-21",  # Atrial fibrillation (intervention)
    "phecode_468",     # Pneumonia (intervention)
    "phecode_474",     # Chronic obstructive pulmonary disease [COPD] (intervention)
    "phecode_583",     # Chronic kidney disease (intervention)

    "phecode_404",     # Ischemic heart disease
    "phecode_404-1",   # Myocardial infarction [Heart attack] (intervention)
    "phecode_431-11",  # Cerebral infarction [Ischemic stroke]
    "phecode_424",     # Heart failure (intervention)
    "phecode_420",     # Cardiac arrest (intervention)
    "OMOP_4306655",    # All-Cause Death (intervention)

    "phecode_438-11",  # Abdominal aortic aneurysm
    "phecode_440-3",   # Pulmonary embolism (intervention)
    "phecode_413-21",  # Aortic stenosis (intervention)
    "phecode_413-11",  # Mitral valve insufficiency
    "phecode_410-2",   # Endocarditis
    "phecode_400",     # Rheumatic fever and chronic rheumatic heart diseases

    "phecode_164",     # Anemia (intervention)
    "phecode_718",     # Back pain (intervention)
    "phecode_324-11",  # Parkinson's disease (Primary)
    "phecode_705-1",   # Rheumatoid arthritis (new, intervention)
    "phecode_665",     # Psoriasis (interesting)
    "phecode_284"      # Suicide ideation and attempt or self harm (intervention)
)

# Display name per endpoint: a short label for the long phecode strings listed
# below, the unchanged phecode string for everything else.
endpoint_defs = endpoint_defs %>%
    mutate(
        name = case_when(
            phecode_string == "Myocardial infarction [Heart attack]" ~ "Myocardial infarction",
            phecode_string == "Cerebral infarction [Ischemic stroke]" ~ "Ischemic stroke",
            phecode_string == "Chronic obstructive pulmonary disease [COPD]" ~ "COPD",
            phecode_string == "Mitral valve insufficiency" ~ "Mitral insufficiency",
            phecode_string == "Parkinson's disease (Primary)" ~ "Parkinson's",
            phecode_string == "Suicide ideation and attempt or self harm" ~ "Suicide attempt",
            phecode_string == "Ischemic heart disease" ~ "Ischemic HD",
            phecode_string == "Chronic kidney disease" ~ "Chronic KD",
            phecode_string == "Rheumatic fever and chronic rheumatic heart diseases" ~ "Rheumatic HD",
            phecode_string == "Abdominal aortic aneurysm" ~ "Abdominal AA",
            TRUE ~ phecode_string
        )
    )

# Named lookup: endpoint identifier -> display name.
endpoint_map = setNames(endpoint_defs$name, endpoint_defs$endpoint)

# Facets follow the hand-curated selection order.
# Alternative kept from the original (order by numeric phecode):
#endpoint_order = (endpoint_defs %>% arrange(as.numeric(phecode)))$endpoint
endpoint_order = endpoint_selection

In [None]:
# Preview the selected endpoint identifiers with every "-" replaced by ".".
endpoint_selection %>% str_replace_all("\\-", "\\.")

In [None]:
# Metadata of the selected endpoints, sorted by n, with display names and the
# frequency expressed in percent.
endpoints_md %>%
    filter(endpoint %in% endpoint_selection) %>%
    as_tibble() %>%
    arrange(n) %>%
    mutate(
        endpoint = recode(endpoint, !!!endpoint_map),
        perc = freq*100
    )

In [None]:
# Eligible (eid, endpoint) pairs for the selected endpoints. Keys are
# normalised (character endpoint, numeric eid) and every row is flagged with
# included = 1 so the flag can be picked up after a join.
eligable_eids = glue("{output_path}/eligable_eids_long_220627.feather") %>%
    arrow::read_feather() %>%
    filter(endpoint %in% endpoint_selection) %>%
    mutate(
        endpoint = as.character(endpoint),
        eid = as.numeric(as.character(eid)),
        included = 1
    )

In [None]:
# Long-format baseline outcomes for the selected endpoints, with the
# eligibility flag (`included`) attached per (eid, endpoint).
#
# The original call passed `as_data_frame=FALSE` to glue() (misplaced closing
# parenthesis). glue() treats named arguments as template variables, so the
# argument never reached read_feather() and had no effect. It is removed here;
# the file is still read into a data frame exactly as before.
data_outcomes = arrow::read_feather(glue("{output_path}/baseline_outcomes_long_220627.feather")) %>%
    filter(endpoint %in% endpoint_selection) %>%
    left_join(eligable_eids, by=c("eid", "endpoint"))

In [None]:
# Record counts per individual.
# NOTE(review): this absolute path bypasses base_path, so it only resolves on
# hosts where /sc-projects is mounted -- confirm whether that is intended.
records_per_individual = "/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/records_per_individual_220627.feather" %>%
    arrow::read_feather()

# eids of all individuals with at least one record.
eids_with_records = records_per_individual %>%
    filter(n_records > 0) %>%
    pull(eid)

# FROM LOGHS

In [None]:
# Full paths of the CoxPH prediction files for the selected endpoints,
# restricted to the "MedicalHistory" score.
prediction_paths = glue("{experiment_path}/prediction_paths.feather") %>%
    arrow::read_feather() %>%
    filter(endpoint %in% endpoint_selection, score == "MedicalHistory") %>%
    mutate(full_path = glue("{experiment_path}/coxph/predictions/{path}")) %>%
    pull(full_path)

In [None]:
# Read the needed columns from every prediction file, stack them into one
# table and keep only individuals that have records. Warnings raised while
# reading are suppressed.
predictions = prediction_paths %>%
    map_df(function(path) {
        suppressWarnings(
            read_feather(path, col_select=c("eid", "endpoint", "features", "partition", "Ft_10"))
        )
    }) %>%
    filter(eid %in% eids_with_records)
    # Earlier wide-format handling, kept for reference:
    #pivot_longer(endpoint_selection, names_to="endpoint", values_to="logh") %>%
    #mutate(eid = as.integer(as.character(eid))) %>%
    #select(endpoint, eid, logh))# %>% arrange(endpoint, eid)
head(predictions)

In [None]:
# Global plot typography (sizes in pt).
base_size = 8
title_size = 10
facet_size = 8.5
geom_text_size=3

# Default theme for all figures: classic theme without facet strip boxes,
# left-aligned titles, black axis text, bottom legend and thin axis lines.
# Line widths use `linewidth`; `size` is deprecated for line elements since
# ggplot2 3.4.0 (the release that also introduced `strip.clip`, used by the
# figures in this notebook).
theme_set(
    theme_classic(base_size = base_size) +
        theme(
            strip.background = element_blank(),
            plot.title = element_text(size = title_size, hjust = 0),
            strip.text.x = element_text(size = facet_size),
            axis.title = element_text(size = 10),
            axis.text = element_text(size = 8, color = "black"),
            legend.position = "bottom",
            axis.line = element_line(linewidth = 0.2),
            axis.ticks = element_line(linewidth = 0.2)
        )
)

# Figure 2: Selected Endpoints

In [None]:
# Outcomes of individuals with records, joined with their predictions and the
# endpoint metadata, restricted to eligible (included == 1) rows.
pred_outcomes = data_outcomes %>%
    filter(eid %in% eids_with_records) %>%
    left_join(predictions, by=c("eid", "endpoint")) %>%
    as_tibble() %>%
    filter(included==1) %>%
    # NOTE(review): natural join on all shared columns (presumably only
    # `endpoint`) -- confirm and consider an explicit `by`.
    left_join(endpoints_md) %>%
    # The original `filter(n==n)` is a tautology whose only effect was to drop
    # rows where `n` is NA (NA == NA is NA, which filter() discards). State
    # that intent explicitly.
    filter(!is.na(n))

In [None]:
#pred_outcomes = predictions %>% left_join(data_outcomes, on=c(eid, endpoint)) %>% as_tibble()

## Endpoint Prevalence + Rate Ratios

In [None]:
# Inspect the eligible rows of endpoint phecode_420 that fall into the highest
# of 100 Ft_10 bins.
pred_outcomes %>%
    filter(endpoint=="phecode_420", included==1) %>%
    mutate(perc = ntile(Ft_10, 100)) %>%
    filter(perc==100) #%>% summarise(mean(event))

In [None]:
# Assign every eligible row to one of 10 Ft_10 bins, computed within endpoint.
logh_inc = pred_outcomes %>%
    filter(included==1) %>%
    group_by(endpoint) %>%
    mutate(logh_perc = ntile(Ft_10, 10)) %>%
    ungroup() %>%
    as_tibble()

In [None]:
# Per endpoint and bin: number of events (n) and observed event rate (ratio).
logh_T_agg = logh_inc %>%
    group_by(endpoint, logh_perc) %>%
    summarise(n=sum(event), ratio = mean(event)) %>%
    as_tibble()

In [None]:
# Aggregates of the selected endpoints, with endpoint as an ordered factor.
temp = logh_T_agg %>%
    mutate(endpoint = factor(endpoint, levels=endpoint_order)) %>%
    filter(endpoint %in% endpoint_selection) %>%
    ungroup() #%>% sample_n(10000)

# Rate ratio between the top (10) and bottom (1) bin per endpoint, shown with
# display names and sorted ascending by that ratio.
temp %>%
    filter(logh_perc %in% c(1, 10)) %>%
    pivot_wider(names_from=logh_perc, values_from=c(n, ratio)) %>%
    mutate(
        ratio = ratio_10/ratio_1,
        endpoint = recode(endpoint, !!!endpoint_map)
    ) %>%
    #mutate(ratio_1=ratio_1*100, ratio_10 = ratio_10*100) %>%
    arrange(ratio)

## Endpoint selection for health state and incident disease rates

In [None]:
# Rebuild pred_outcomes starting from the predictions (already restricted to
# individuals with records) and attach the outcomes per (eid, endpoint).
#
# The original passed `on=c(eid, endpoint)`, which is not a left_join()
# argument, so the join keys were never set as intended. Use `by` with the
# same keys as the other joins in this notebook.
pred_outcomes = predictions %>%
    left_join(data_outcomes, by=c("eid", "endpoint")) %>%
    as.data.table()

## Example illustration with deciles

In [None]:
# Ad-hoc ratio of two hard-coded values.
# NOTE(review): presumably the event rates of the highest and lowest risk bin
# for one endpoint -- confirm where these numbers come from.
0.361170266/0.003323648

## Real Figure with percentiles

In [None]:
## exclude individuals without records!!

In [None]:
# Assign every eligible row to one of 100 Ft_10 bins, computed within endpoint.
# Disabled partition filter kept from the original:
#   filter(partition %in% c(13, 14, 15, 16, 17, 18, 19, 20, 21))
logh_inc = pred_outcomes %>%
    filter(included==1) %>%
    group_by(endpoint) %>%
    mutate(logh_perc = ntile(Ft_10, 100)) %>%
    ungroup() %>%
    as_tibble()

In [None]:
# Per endpoint and percentile bin: number of events (n) and observed event
# rate (ratio).
logh_T_agg = logh_inc %>%
    group_by(endpoint, logh_perc) %>%
    summarise(n=sum(event), ratio = mean(event)) %>%
    as_tibble()

In [None]:
#logh_T_endpoint = logh_inc %>% group_by(endpoint) %>% summarise(n_all=sum(event), ratio_all = mean(event)) %>% as_tibble()

In [None]:
# Show the endpoint order used for the factor levels of the facets below.
endpoint_order

In [None]:
library(lemon)

In [None]:
# Notebook display size and resolution for this figure.
plot_width = 8.25
plot_height = 4
plot_res = 320
options(repr.plot.width = plot_width, repr.plot.height = plot_height, repr.plot.res=plot_res)
#temp_rank = event_rest %>% filter(features=="Metabolomics") %>% arrange(desc(MET10PercvsREST))
#endpoint_order = (endpoint_defs %>% mutate(phecode_rank = as.numeric(phecode)) %>% arrange(phecode_rank) %>% as_tibble())$endpoint

# Percentile aggregates of the selected endpoints, ordered by endpoint factor.
temp = logh_T_agg %>%
    filter(endpoint %in% endpoint_selection) %>%
    mutate(endpoint = factor(endpoint, levels=endpoint_order)) %>%
    ungroup() %>%
    arrange(endpoint) #%>% sample_n(10000)

# Figure 2a: observed event rate (%) per risk percentile, one facet per
# endpoint with a free y axis; points are coloured by percentile.
fig2a = ggplot(temp, aes(x=logh_perc, y=ratio*100, color=logh_perc)) +
    labs(title=NULL, x="Risk Percentile (%)", y="Incident Events (%)") +
    geom_point(alpha=0.7, size=0.3) +
    coord_cartesian(clip="off") +
    scale_colour_gradient(
        low = "#7AC6FF", high = "#023768", space = "Lab",
        na.value = "grey50", guide = "colourbar", aesthetics = "colour"
    ) +
    scale_y_continuous(limits=c(0, NA), expand=expansion(mult=c(0, .05))) + #, limits=c(0, NA))+
    scale_x_continuous(expand=expansion(add=c(0, 1))) +
    facet_rep_wrap(
        ~endpoint,
        scales="free_y",
        labeller = labeller(endpoint = as_labeller(endpoint_map, default=label_wrap_gen(22))),
        ncol=6
    ) +
    theme(
        legend.position="none",
        panel.spacing.y = unit(-0.7, "lines"),
        strip.clip = "off",
        panel.border = element_blank()
    )
fig2a

In [None]:
# Look up endpoint metadata rows whose phecode string contains "arrest".
filter(endpoints_md, str_detect(phecode_string, "arrest"))

In [None]:
# Percentile aggregates of endpoint phecode_420, in ascending percentile order.
temp %>%
    filter(endpoint == "phecode_420") %>%
    arrange(logh_perc) # %>% filter(logh_perc==100)

# Kaplan-Meier

In [None]:
# Risk groups for the cumulative-event curves: rows in percentile bin 100
# ("High"), 50 ("Mid") and 1 ("Low"); all other rows are dropped.
#
# The original labelled the remaining rows with the string "NA" and then
# filtered on MH != "NA". factor() had already turned that label into a real
# NA, so the rows only disappeared because filter() discards NA conditions.
# Selecting the three bins up front gives the same rows without relying on
# that; the row-wise labelling also needs no grouping by endpoint.
logh_mh = logh_inc %>%
    select(endpoint, eid, logh_perc, event, time) %>%
    filter(logh_perc %in% c(1, 50, 100)) %>%
    mutate(MH = case_when(
        logh_perc %in% 100 ~ "High",
        logh_perc %in% 50 ~ "Mid",
        logh_perc %in% 1 ~ "Low"
    )) %>%
    # Reversed level order: High, Mid, Low.
    mutate(MH = fct_rev(factor(MH, levels=c("Low", "Mid", "High")))) #%>% select(eid, endpoint, logh, logh_group)

In [None]:
# ggquickeda provides geom_km / geom_kmticks / geom_kmband. Load it with
# library() so a missing package fails here, instead of require() returning
# FALSE and the plot failing later.
library(ggquickeda)

# Notebook display size and resolution for this figure.
plot_width = 8.25; plot_height=4; plot_res = 320
options(repr.plot.width = plot_width, repr.plot.height = plot_height, repr.plot.res=plot_res)

# Colour per risk group.
met_map = c("High"="#023768", "Mid"="#4F8EC1", "Low"="#7AC6FF")

# Risk-group rows of the selected endpoints, with endpoint as ordered factor.
temp = logh_mh %>%
    mutate(endpoint = factor(endpoint, levels=endpoint_order)) %>%
    filter(endpoint %in% endpoint_selection) %>%
    ungroup()

# Figure 2b: Kaplan-Meier curves on the event scale (trans = "event") with
# tick marks and bands, one facet per endpoint, coloured by risk group.
fig2b = ggplot(temp, aes(time = time, status = event, fill=factor(MH), color=factor(MH), group=factor(MH))) +
    geom_km(trans = "event") +
    geom_kmticks(trans = "event", size=0.2, alpha=0.01) +
    geom_kmband(trans = "event") +
    labs(x="Time (Years)", y="Cumulative Events (%)")+
    coord_cartesian(clip="off")+
    scale_color_manual(values=met_map)+scale_fill_manual(values=met_map)+
    # y values are proportions; label them as percentages.
    scale_y_continuous(labels = function(x) round(x*100, 1), expand=c(0, 0))+
    scale_x_continuous(expand=expansion(add=c(0, .1)), breaks=c(5, 10))+
    facet_rep_wrap(~endpoint, scales="free_y", labeller = labeller(endpoint = as_labeller(endpoint_map, default=label_wrap_gen(22))), ncol=6) +
    theme(legend.position="none") + theme(panel.spacing.y = unit(-0.7, "lines"), strip.clip = "off", panel.border = element_blank())
fig2b

In [None]:
# Notebook display size and resolution for the combined figure.
plot_width = 8.25
plot_height = 8
plot_res = 320
options(
    repr.plot.width = plot_width,
    repr.plot.height = plot_height,
    repr.plot.res = plot_res
)

# Stack fig2a above fig2b (patchwork `/`) and remove the panel border from
# every panel (patchwork `&`).
fig2bc = (fig2a / fig2b) &
    theme(panel.border = element_blank())
fig2bc

In [None]:
library(gt)

# Write the combined figure as PDF using the current plot dimensions.
# NOTE(review): the target is the relative "outputs" folder, not figure_path
# defined at the top of the notebook -- confirm this is intended.
plot_name = "Figure2bc"
ggsave(
    filename = glue("outputs/{plot_name}.pdf"),
    plot = fig2bc,
    device = "pdf",
    width = plot_width,
    height = plot_height,
    dpi = plot_res
)