# Benchmarks

## Initialize

In [None]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

In [None]:
# Pick the storage root from the host name: nodes whose name contains "sc"
# use the /sc-projects mount, all other hosts use the shared analysis volume.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
    "/sc-projects/sc-proj-ukb-cvd"
} else {
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project-level directories derived from the storage root.
project_label <- "22_medical_records"
project_path <- glue("{base_path}/results/projects/{project_label}")
figure_path <- glue("{project_path}/figures")
output_path <- glue("{project_path}/data")

# Experiment identifier and the directory holding its artefacts.
experiment <- 220627
experiment_path <- glue("{output_path}/{experiment}")

In [None]:
library(data.table)
library(arrow)

In [None]:
# Endpoint metadata for this experiment; the phecode column is forced to character.
endpoints_md <- fread(
    glue("{experiment_path}/endpoints.csv"),
    colClasses = c("phecode" = "character")
)
# Sorted vector of all endpoint identifiers.
endpoints <- sort(endpoints_md[["endpoint"]])

In [None]:
# Phecode definitions, ordered by endpoint identifier.
endpoint_defs <- glue("{output_path}/phecode_defs_220306.feather") %>%
    arrow::read_feather() %>%
    arrange(endpoint)

In [None]:
# Curated endpoints, grouped by theme. The trailing comment on each entry gives
# the phecode label and the author's annotation.
endpoints_common = c(
    'phecode_401', #  "Hypertension", # intervention
    'phecode_202', #  "Diabetes mellitus", # intervention
    'phecode_416-21', #  "Atrial fibrillation", # intervention
    'phecode_468', #  "Pneumonia", # intervention
    'phecode_474', #  "Chronic obstructive pulmonary disease [COPD]", # intervention
    'phecode_583', #  "Chronic kidney disease", # intervention

    'phecode_404', #  "Ischemic heart disease",
    'phecode_404-1', #  "Myocardial infarction [Heart attack]", # intervention
    'phecode_431-11', #  "Cerebral infarction [Ischemic stroke]",
    'phecode_424', #  "Heart failure", # intervention
    'phecode_420', #  "Cardiac arrest", # intervention
    'OMOP_4306655' #  "All-Cause Death", # intervention
)

endpoints_cardio = c(
    'phecode_438-11', #  "Abdominal aortic aneurysm",
    'phecode_440-3', #  "Pulmonary embolism", # intervention
    'phecode_413-21', #  "Aortic stenosis", # intervention
    'phecode_413-11', #  "Mitral valve insufficiency",
    'phecode_410-2', #  "Endocarditis",
    'phecode_400' #  "Rheumatic fever and chronic rheumatic heart diseases",
)

endpoints_interesting = c(
    'phecode_164', #  "Anemia", # intervention
    'phecode_718', #  "Back pain", # intervention
    'phecode_324-11', #  "Parkinson's disease (Primary)",
    'phecode_705-1', #  "Rheumatoid arthritis", # NEW + intervention
    'phecode_665', #  "Psoriasis", # interesting
    'phecode_284' #  "Suicide ideation and attempt or self harm" # intervention
)

# The full selection is the three groups in order (common, cardio,
# interesting). It is derived rather than retyped so the groups and the
# selection cannot drift apart.
endpoint_selection = c(endpoints_common, endpoints_cardio, endpoints_interesting)

# Attach a display name to every endpoint definition: a curated short label
# where one is listed below, otherwise the phecode_string itself.
endpoint_defs <- local({
    short_labels <- c(
        "Myocardial infarction [Heart attack]" = "Myocardial infarction",
        "Cerebral infarction [Ischemic stroke]" = "Ischemic stroke",
        "Chronic obstructive pulmonary disease [COPD]" = "Chronic obstructive pulmonary disease",
        "Mitral valve insufficiency" = "Mitral insufficiency",
        "Parkinson's disease (Primary)" = "Parkinson's disease",
        "Suicide ideation and attempt or self harm" = "Suicide attempt",
        "Ischemic heart disease" = "Coronary heart disease",
        "Chronic kidney disease" = "Chronic kidney disease",
        "Rheumatic fever and chronic rheumatic heart diseases" = "Rheumatic heart disease",
        "Abdominal aortic aneurysm" = "Abdominal aortic aneurysm"
    )
    endpoint_defs %>%
        mutate(name = coalesce(
            unname(short_labels[as.character(phecode_string)]),
            as.character(phecode_string)
        ))
})

# Lookup from endpoint identifier to display name.
endpoint_map <- setNames(endpoint_defs$name, endpoint_defs$endpoint)
#endpoint_order = (endpoint_defs %>% arrange(as.numeric(phecode)))$endpoint
endpoint_order <- endpoint_selection

In [None]:
# Selected endpoints, ordered by n, with display names and freq as a percentage.
endpoints_md %>%
    filter(endpoint %in% endpoint_selection) %>%
    as_tibble() %>%
    arrange(n) %>%
    mutate(
        endpoint = recode(endpoint, !!!endpoint_map),
        perc = freq * 100
    )

In [None]:
# Eligible participant ids, flagged with included = 1.
eligable_eids = arrow::read_feather(glue("{output_path}/eligable_eids_long_220627.feather")) %>%
    mutate(included = 1)
# Convert eid to numeric through its character form. For a factor this gives
# the same values as as.numeric(levels(x))[x], but it also works when the
# column is already character or numeric, where the levels() idiom silently
# produces NA.
eligable_eids$eid = as.numeric(as.character(eligable_eids$eid))

In [None]:
# Baseline outcomes, read as a regular data frame (the read_feather default).
# The stray `as_data_frame=FALSE` was an argument to glue(), where it is only an
# unused template variable, so it never reached read_feather().
# NOTE(review): if an Arrow Table is wanted, pass as_data_frame = FALSE to
# read_feather() itself and check that downstream code handles a Table.
data_outcomes = arrow::read_feather(glue("{output_path}/baseline_outcomes_long_220627.feather"))

In [None]:
# Baseline records, read as a regular data frame (the read_feather default).
# plot_records() transposes the filtered rows with t(), which needs a data
# frame. The stray `as_data_frame=FALSE` was an argument to glue(), where it was
# silently ignored, so it is dropped rather than moved.
data_records = arrow::read_feather(glue("{output_path}/baseline_records_220627.feather"))
# Athena concept table, resolved through base_path so the notebook also runs on
# the non-"sc" host (identical path on the "sc" host).
concepts = fread(glue("{base_path}/data/mapping/athena/CONCEPT.csv"))

In [None]:
partitions = 0:1#121

# Build the prediction file paths for one endpoint.
#
# endpoint:   a single endpoint identifier.
# features:   feature-set names; one file per feature set and partition.
# partitions: partition indices.
#
# Returns a character vector ordered feature-major (all partitions of the first
# feature set, then the next, ...), or NULL when `features` or `partitions` is
# empty. Reads the global `experiment_path`.
get_endpoint_paths = function(endpoint, features, partitions){
    feature_rep = rep(as.character(features), each = length(partitions))
    partition_rep = rep(partitions, times = length(features))
    # paste0() would turn zero-length input into a single bogus path, so
    # return early with the same NULL the empty loop used to produce.
    if (length(feature_rep) == 0) {
        return(NULL)
    }
    paste0(
        experiment_path, "/coxph/predictions/",
        endpoint, "_", feature_rep, "_", partition_rep, ".feather"
    )
}

In [None]:
# Load all prediction files of one endpoint and return them stacked, with the
# Ft_10 column renamed to the endpoint value found in the first row and the
# `endpoint` column dropped. Warnings raised while reading are suppressed.
read_endpoint = function(endpoint, features, partitions){
    load_partition = function(path) {
        suppressWarnings(
            read_feather(path, col_select=c("eid", "endpoint", "features", "Ft_10"))
        )
    }
    stacked = map_df(get_endpoint_paths(endpoint, features, partitions), load_partition)
    # Column name comes from the data, not from the `endpoint` argument.
    risk_col = head(stacked, 1)$endpoint
    stacked %>%
        rename(!!risk_col := Ft_10) %>%
        select(-endpoint) #%>% arrange(features, eid)
}

In [None]:
library(foreach)
# Start a PSOCK cluster with 100 workers and register it as the foreach backend.
# NOTE(review): the loading loop in this notebook uses %do%, which runs
# sequentially; only %dopar% would dispatch to this cluster.
my.cluster <- parallel::makeCluster(spec = 100, type = "PSOCK")
doParallel::registerDoParallel(my.cluster)
print(my.cluster)

In [None]:
# Drop a previous result if there is one. The guard avoids the
# "object not found" warning on a fresh session.
if (exists("endpoint_dfs")) rm(endpoint_dfs)

In [None]:
# `features` is not assigned anywhere in the visible notebook. If the session
# has not defined it, fall back to the two feature sets that the delta
# computation further down refers to by name.
# NOTE(review): confirm these are the intended feature sets.
if (!exists("features", inherits = FALSE)) {
    features = c("Age+Sex", "Age+Sex+MedicalHistory")
}

endpoints_list = endpoints
# One data frame per endpoint, collected into a list named by endpoint.
# %do% evaluates the iterations sequentially in this session.
endpoint_dfs <- foreach (e = endpoints_list, .final = function(e) setNames(e, endpoints_list)) %do% {
    read_endpoint(e, features, partitions)
    }

In [None]:
# Shut the worker processes down again.
parallel::stopCluster(my.cluster)

In [None]:
# library() fails loudly when a package is missing; require() would only
# return FALSE and let the join below fail with a less helpful error.
library(purrr)
library(dplyr)

# Fold the per-endpoint frames into one wide table keyed by (features, eid);
# each endpoint contributes one column.
prediction_df <- endpoint_dfs %>% reduce(left_join, by = c("features", "eid"))

In [None]:
# Persist the wide prediction table.
write_feather(prediction_df, glue("{experiment_path}/coxph/predictions_wide.feather"))

In [None]:
# Reload the wide prediction table from disk.
prediction_df <- glue("{experiment_path}/coxph/predictions_wide.feather") %>%
    arrow::read_feather()

In [None]:
# One row per (eid, features, endpoint), with the prediction in Ft_10.
prediction_df_long <- pivot_longer(
    prediction_df,
    all_of(endpoints),
    names_to = "endpoint",
    values_to = "Ft_10"
)

In [None]:
# One column per feature set, holding that feature set's Ft_10.
prediction_df_features <- pivot_wider(
    prediction_df_long,
    names_from = "features",
    values_from = "Ft_10"
)

In [None]:
# Change in predicted value when medical history is added to Age+Sex:
#   delta_abs  absolute difference
#   delta_rel  difference relative to the Age+Sex value
#   prop_rel   delta_rel + 1
prediction_delta <- prediction_df_features %>%
    mutate(
        delta_abs = `Age+Sex+MedicalHistory` - `Age+Sex`,
        delta_rel = delta_abs / `Age+Sex`,
        prop_rel = delta_rel + 1
    )
    # Alternatives that were tried:
    #case_when(delta_rel>0 ~ delta_rel+1, delta_rel<0 ~ delta_rel)
    #mutate(risk_rel = `Age+Sex+MedicalHistory`/`Age+Sex`) %>%
    #mutate(prop_rel = case_when(risk_rel>=1 ~ risk_rel, risk_rel<1 ~ -1/risk_rel))

In [None]:
# Persist the per-individual delta table.
write_feather(
    prediction_delta,
    glue("{experiment_path}/coxph/prediction_deltas_individualchange.feather")
)

In [None]:
# Display the path the delta table is written to by the write_feather() call in this notebook.
glue("{experiment_path}/coxph/prediction_deltas_individualchange.feather")

In [None]:
# Random sample of 3000 rows, ordered by absolute delta.
test <- arrange(sample_n(prediction_delta, 3000), delta_abs)

In [None]:
# Font sizes shared by the figures in this notebook.
base_size <- 8
title_size <- 10
facet_size <- 8.5
geom_text_size <- 3

# Global ggplot theme: classic base, no facet strip background, thin axis
# lines and ticks, legend at the bottom.
theme_set(
    theme_classic(base_size = base_size) +
        theme(
            strip.background = element_blank(),
            plot.title = element_text(size = title_size, hjust = 0),
            strip.text.x = element_text(size = facet_size),
            axis.title = element_text(size = 10),
            axis.text = element_text(size = 8, color = "black"),
            legend.position = "bottom",
            axis.line = element_line(size = 0.2),
            axis.ticks = element_line(size = 0.2)
        )
)

In [None]:
library(ggallin)

In [None]:
# Covariate table for the experiment.
data_covariates <- glue("{experiment_path}/data_covariates_full.feather") %>%
    arrow::read_feather()

In [None]:
# Assemble the per-endpoint plotting table for one participant.
#
# eid:            participant id to select from the global `prediction_delta`.
# endpoint_order: endpoint identifiers in plotting order (factor levels).
# category_order: unused; kept so existing calls keep working. Category levels
#                 come from the global `categories_sorted`.
#
# Returns the participant's rows joined with endpoint metadata from the global
# `endpoints_md`, without the 'Signs/Symptoms', 'Preg', 'Rx' and 'Stat'
# categories, sorted by endpoint, plus a `size` column binned from prop_rel.
prep_eid_df = function(eid, endpoint_order, category_order){

    endpoint_info = endpoints_md %>%
        select(endpoint, phecode, phecode_string, phecode_category, sex) %>%
        as_tibble()

    temp = prediction_delta %>%
        filter(eid==!!eid) %>%
        # `endpoint` is the only shared column; naming it keeps the join
        # stable and silences the "Joining, by" message.
        left_join(endpoint_info, by = "endpoint") %>%
        mutate(phecode_category = factor(phecode_category, levels=categories_sorted)) %>%
        filter(!phecode_category %in% c('Signs/Symptoms','Preg','Rx','Stat')) %>%
        mutate(endpoint = factor(endpoint, levels=endpoint_order)) %>%
        ungroup() %>%
        arrange(endpoint) %>%
        # Bins are checked from the top down so that values of exactly 5 or 10
        # land in a real bin instead of falling through to the smallest size.
        # Missing prop_rel also gets the smallest size.
        mutate(size=case_when(
            prop_rel >= 10 ~ 1,
            prop_rel >= 5 ~ 0.5,
            prop_rel > 2 ~ 0.2,
            TRUE ~ 0.05)
              )

    return(temp)
    }

In [None]:
# Print a text summary for one participant and pick the endpoints to label.
#
# eid:             participant id.
# data_covariates: covariate table with at least eid, age and sex columns.
# temp:            per-endpoint table as returned by prep_eid_df().
#
# Side effects: prints the participant's factor/numeric covariates, the
# endpoints with missing prop_rel that apply to the participant's sex, the
# output of plot_records(eid), and every endpoint with prop_rel > 5.
#
# Returns a list with
#   highestpredip: up to 10 rows with prop_rel > 5 and the largest delta_rel
#   highestrisk:   up to 10 rows with `Age+Sex+MedicalHistory` > 0.01 and the
#                  largest `Age+Sex+MedicalHistory`
# both sorted ascending.
prep_subsets = function(eid, data_covariates, temp){
    eid_covs = data_covariates %>% filter(eid == !!eid)
    if (nrow(eid_covs) == 0) {
        # Fail here with a clear message instead of inside the sex filter below.
        stop("eid ", eid, " not found in data_covariates", call. = FALSE)
    }
    eid_covs_post = eid_covs %>% select(where(is.factor), where(is.numeric)) %>%
        select(eid, age, sex, everything())
    eid_sex = eid_covs_post$sex

    temp_highpredip = temp %>% filter(prop_rel > 5)
    temp_highestpredip = temp_highpredip %>% arrange(delta_rel) %>% tail(10)
    temp_highestrisk = temp %>%
        filter(`Age+Sex+MedicalHistory` > 0.01) %>%
        arrange(`Age+Sex+MedicalHistory`) %>%
        tail(10)

    # Endpoints without a prop_rel value that apply to this participant's sex.
    # NOTE(review): presumably these are endpoints already on record, hence
    # the heading printed below - confirm.
    temp_prevalent = temp %>%
        filter(is.na(prop_rel)) %>%
        filter((sex=="Both")|(sex==!!eid_sex))

    print(glue("eid: {eid}"))

    print("General information")
    eid_covs_post %>% print(width=Inf)

    print("Medical History and prior records")
    temp_prevalent %>% select(phecode_category, phecode_string) %>% print(n=Inf)

    plot_records(eid)

    print("Highest predisposition for:")
    temp_highpredip %>% select(phecode_category, phecode_string, prop_rel, `Age+Sex+MedicalHistory`) %>%
        arrange(desc(`Age+Sex+MedicalHistory`)) %>% print(n=Inf)

    temp_metadata = list(
        highestpredip = temp_highestpredip,
        highestrisk = temp_highestrisk
    )
    return(temp_metadata)
    }

In [None]:
# Lollipop chart of `Age+Sex+MedicalHistory` per endpoint for one participant.
#
# temp:             per-endpoint table (endpoint, `Age+Sex+MedicalHistory`,
#                   delta_rel, size, phecode_category).
# temp_highestrisk: rows that receive a repelled text label.
#
# Colours come from the global `category_fill_map`. Returns a ggplot object.
plot_absolute_risks = function(temp,temp_highestrisk){
    # Labels for the highlighted endpoints: name plus value in percent.
    risk_labels = geom_text_repel(
        data=temp_highestrisk,
        aes(y=`Age+Sex+MedicalHistory`,
            label=str_wrap(glue("{phecode_string} ({round(`Age+Sex+MedicalHistory`*100, 1)}%)"), 30)),
        hjust=0.5, size=3, ylim=c(0.5, 1.2),
        segment.size=0.2, segment.color="black", segment.alpha=0.7,
        max.overlaps=Inf
    )

    # Lollipop heads and stems; transparency follows delta_rel.
    heads = geom_point(aes(alpha=delta_rel, size=factor(size)))
    stems = geom_segment(aes(xend=endpoint, yend=0, alpha=delta_rel))

    # No x-axis decoration, horizontal grid lines, no legend.
    panel_theme = theme(
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        panel.grid.major.y=element_line(),
        strip.text = element_text(angle=270, hjust=1),
        legend.position="none",
        plot.margin = margin(0.1, 0, 0.1, 0, "cm")
    )

    ggplot(temp, aes(x=endpoint, y=`Age+Sex+MedicalHistory`, color=phecode_category)) +
        labs(x="Endpoints", y="10-year risk (%)") +
        risk_labels +
        heads +
        stems +
        scale_size_manual(values=c(0.1, 0.5, 1, 2)) +
        scale_color_manual(values=category_fill_map) +
        scale_x_discrete(expand=expansion(add=20)) +
        scale_y_continuous(expand=c(0, 0), labels=scales::percent) +
        panel_theme +
        coord_cartesian(ylim=c(0, 1), clip = "off")
    }

In [None]:
# Lollipop chart of delta_rel per endpoint for one participant, on a
# pseudo-log10 y axis.
#
# temp:               per-endpoint table (endpoint, delta_rel, size,
#                     phecode_category).
# temp_highestpredip: rows that receive a repelled text label.
#
# Colours come from the global `category_fill_map`. Returns a ggplot object.
plot_predispositions = function(temp, temp_highestpredip){
    overview_rel = ggplot(temp, aes(x=endpoint, y=delta_rel, color=phecode_category)) +

        labs(x="Endpoints", y="Predisposition") +

        # Labels show the risk multiplier, i.e. delta_rel + 1.
        geom_text_repel(data=temp_highestpredip, aes(y=delta_rel+1, label=str_wrap(glue("{phecode_string} (x{round(delta_rel+1, 1)})"), 30)),
                  hjust=0.5, size=3, ylim=c(1.5, 1000), segment.size=0.2, segment.color="black", segment.alpha=0.7, max.overlaps=Inf)+
        geom_point(aes(alpha=delta_rel, size=factor(size)))+
        geom_segment(aes(xend=endpoint, yend=0, alpha=delta_rel))+

        scale_size_manual(values=c(0.1, 0.5, 1, 2))+
        scale_color_manual(values=category_fill_map)+
        scale_fill_manual(values=category_fill_map)+

        scale_x_discrete(expand=expansion(add=20))+
        # The y value is delta_rel, so an N-fold risk sits at N - 1:
        # 2x at 1, 10x at 9, 100x at 99, 1000x at 999. The breaks used to be
        # 11/101/1001, which put the 10x/100x/1000x ticks at 12x/102x/1002x.
        scale_y_continuous(expand=c(0, 0),
                           trans=pseudolog10_trans,
                           breaks=c(-1, 0, 1, 9, 99, 999),
                           labels=c("Reduced risk", "No change", "2x Risk", "10x Risk","100x Risk", "1000x Risk"))+

        # NOTE(review): limits are given high-to-low; confirm the installed
        # ggplot2 renders this the intended way round.
        coord_cartesian(ylim=c(1001, 0))+

        theme(strip.background = element_blank(),
              strip.text.x = element_blank(),
              axis.title.x=element_blank(),
              axis.text.x=element_blank(),
              axis.ticks.x=element_blank(),
              panel.grid.major.y=element_line(),
              strip.text = element_text(angle=270, hjust=1),
              legend.position="none",
              plot.margin = margin(0.1, 0, 0.1, 0, "cm")
         )
    return(overview_rel)
    }

In [None]:
# Print the concept names of all OMOP_* record columns whose value equals 1
# for one participant. Reads the globals `data_records` and `concepts`.
plot_records = function(eid){
    # The participant's rows, transposed so that each column becomes a row.
    participant_rows = data_records %>% filter(eid==!!eid)
    flipped = t(participant_rows)

    # Keep entries equal to 1 and expose the former column names.
    flagged = as_tibble(flipped, rownames=NA) %>%
        filter(V1==1) %>%
        rownames_to_column(var="concept_id_raw")

    # Only OMOP_* columns carry a concept id; strip the prefix to get it.
    omop_flagged = flagged %>%
        filter(str_detect(concept_id_raw, "OMOP_")) %>%
        mutate(concept_id = as.numeric(str_replace_all(concept_id_raw, "OMOP_", "")))

    eid_records = left_join(omop_flagged, concepts)
    print(eid_records$concept_name)
    }

In [None]:
# 1585902 # alright sample

In [None]:
library(ggtext)
library(ggrepel)
library(viridis)

plot_width = 10; plot_height=5; plot_res = 320
options(repr.plot.width = plot_width, repr.plot.height = plot_height, repr.plot.res=plot_res)

# Phecode categories in the order they first appear when endpoints are sorted.
categories_sorted = (endpoints_md %>% arrange(endpoint) %>% distinct(phecode_category) %>% as_tibble())$phecode_category

# One turbo colour per category.
category_color_map <- c(viridis::turbo(n = length(categories_sorted)))
names(category_color_map) = categories_sorted

# Alternating black / dark gray per category; this is the map the plot functions use.
category_fill_map <- rep_len(c("black", "gray20"), length(categories_sorted))
names(category_fill_map) = categories_sorted

endpoint_order = (endpoints_md)$endpoint

# Draw one participant at random. Indexing with sample.int() avoids the
# sample() pitfall where a single numeric eid n is treated as "sample from
# 1:n"; with more than one eid it makes the same draw as sample(x, 1).
eid_selection = unique(prediction_delta$eid)
eid_selection = eid_selection[sample.int(length(eid_selection), 1)]

for (eid in eid_selection){

    # categories_sorted replaces `category_order`, which is not defined
    # anywhere visible (prep_eid_df does not use the argument).
    temp = prep_eid_df(eid, endpoint_order, categories_sorted)

    temp_subsets = prep_subsets(eid, data_covariates, temp)

    overview_abs = plot_absolute_risks(temp, temp_subsets[["highestrisk"]])
    overview_rel = plot_predispositions(temp, temp_subsets[["highestpredip"]])

    # Absolute panel stacked on top of the predisposition panel.
    ov_individual = overview_abs / overview_rel + plot_annotation(title=glue("{eid}"))

    print(ov_individual)
    flush.console()
    }

In [None]:
# Scratch cell.
# NOTE(review): in the visible notebook `eid_records` is only assigned inside
# plot_records(), so it does not exist at top level unless created elsewhere.
eid_matrix = t(eid_records) 
concept_id = eid_matrix[1]

In [None]:
# NOTE(review): `eid_matrix` comes from t(), which returns a matrix; confirm
# mutate() is meant to be applied to it.
eid_matrix %>% mutate(concept_id=concept_id)

In [None]:
# Rows labelled in the absolute-risk panel for the last plotted participant.
temp_subsets[["highestrisk"]]

In [None]:
# Rows labelled in the predisposition panel for the last plotted participant.
temp_subsets[["highestpredip"]]

In [None]:
# Per-endpoint table of the last plotted participant.
temp

In [None]:
# NOTE(review): `temp_highestrisk` is only assigned inside prep_subsets() in
# the visible notebook; temp_subsets[["highestrisk"]] holds the same rows.
temp_highestrisk

In [None]:
# Re-render the last combined figure at half the size.
plot_width = 5; plot_height=2.5; plot_res = 320
options(repr.plot.width = plot_width, repr.plot.height = plot_height, repr.plot.res=plot_res)
ov_individual

In [None]:
# Per-endpoint table of the last plotted participant.
temp

In [None]:
# NOTE(review): `mh` is only assigned inside prep_subsets() in the visible
# notebook and is not returned from it.
mh 

In [None]:
library(plotly)
# Interactive versions of the two panels, stacked in two rows.
# NOTE(review): `overview` is assigned in a later cell of this notebook, so
# this cell depends on that cell having been run first.
f1 <- plotly::ggplotly(overview)
f2 <- plotly::ggplotly(overview_rel)
f <- plotly::subplot(f1, f2, nrows = 2)

In [None]:
# Write the interactive figure to an HTML file in the working directory.
htmlwidgets::saveWidget(widget = f, file = "ind_test.html")

In [None]:
# NOTE(review): `t` is not assigned anywhere in the visible notebook, so it
# resolves to base::t (a function) unless defined elsewhere; the upper panel
# was presumably meant to be a plot object such as overview_abs - confirm.
t / overview_rel

In [None]:
plot_width = 10; plot_height=2.5; plot_res = 320
options(repr.plot.width = plot_width, repr.plot.height = plot_height, repr.plot.res=plot_res)

library(ggtext)
# Phecode categories in the order they first appear when endpoints are sorted.
categories_sorted = (endpoints_md %>% arrange(endpoint) %>% distinct(phecode_category) %>% as_tibble())$phecode_category

# Per-endpoint rows of one participant ('XXXXX' is a placeholder id), joined
# with endpoint labels and categories.
temp = prediction_delta %>%
    #filter(eid=='XXXXX') %>%
    filter(eid=='XXXXX') %>%
    left_join(endpoints_md %>% select(endpoint, phecode_string, phecode_category) %>% as_tibble()) %>%
    mutate(phecode_category = factor(phecode_category, levels=categories_sorted)) #%>%
    #mutate(highlight = case_when(endpoint %in% endpoint_selection ~ "YES", TRUE ~ "NO"))


endpoint_order = (endpoints_md)$endpoint

# Replace the endpoint by its running position within its category, which
# gives each facet its own x positions. ungroup() keeps the grouping from
# leaking into later use of `temp`.
temp = temp %>% mutate(endpoint = factor(endpoint, levels=endpoint_order)) %>% ungroup() %>%
    arrange(endpoint) %>% group_by(phecode_category) %>% mutate(endpoint = row_number()) %>%
    ungroup()

# The closing parenthesis of geom_point() used to sit inside the trailing
# comment, which left the call open and made the whole cell a syntax error.
# NOTE(review): the y label ("Rate Ratio") and the log-based breaks look like
# leftovers from another plot, since y is `Age+Sex+MedicalHistory` - confirm.
overview = ggplot(temp) +
    #geom_ribbon(aes(x=endpoint, ymin=0, ymax=logh), fill="black", alpha=0.2)+
    geom_point(aes(x=endpoint, y=`Age+Sex+MedicalHistory`)) + #, color=highlight, size=highlight, alpha=highlight)) +
    #geom_text(data=temp %>% filter(highlight=="YES"), aes(x=endpoint, y=log_ratio+0.5, label="↓", vjust=0), color="black", size=5, alpha=0.7) +
    #geom_segment(aes(x=endpoint, xend=endpoint, y=0, yend=delta, color=highlight, size=highlight), alpha=0.5)+#+
    labs(x="Endpoints", y="Rate Ratio") +# (Top vs. Bottom 10%)")+
    #scale_color_manual(values=c("NO"="black", "YES"="firebrick"))+
    #scale_alpha_manual(values=c("NO"=0.1, "YES"=1))+
    #scale_size_manual(values=c("NO"=0.01, "YES"=1))+
    scale_y_continuous(breaks = c(log(0.1), log(1), log(10)), labels=c(0.1, 1, 10))+
    scale_x_discrete(expand=expansion(add=20))+
    facet_grid(~phecode_category, scales="free_x", space="free_x")+#, switch=TRUE)+
    #facet_grid2(~phecode_category, scales = "free", independent = "all") +
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        panel.grid.major.y=element_line(),
         strip.text = element_text(angle=270, hjust=0)) +
    theme(legend.position="none")

    #geom_ribbon(aes(x=id, ymin=AgeSex, ymax=`Age+Sex+MedicalHistory`), fill="red", alpha=0.2)
#geom_violin(size=0.1)
overview