# Benchmarks

## Initialize

In [None]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

In [None]:
# Choose the storage root from the host name: nodes whose name contains "sc"
# use the /sc-projects mount, every other host uses the shared cardioRS
# directory.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
  "/sc-projects/sc-proj-ukb-cvd"
} else {
  "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project directories derived from the storage root.
project_label <- "22_medical_records"
project_path <- glue("{base_path}/results/projects/{project_label}")
figure_path <- glue("{project_path}/figures")
output_path <- glue("{project_path}/data")

# Experiment identifier; it names the sub-directory of `output_path` that is
# read from later in the notebook.
experiment <- 230425
experiment_path <- glue("{output_path}/{experiment}")

In [None]:
# Size constants shared by the figures in this notebook.
base_size <- 8
title_size <- 10
facet_size <- 10
geom_text_size <- 3

# Make a customised theme_classic() the session-wide ggplot2 default:
# blank facet strips, left-aligned titles, black axis text, thin axis lines
# and ticks, legend at the bottom, and major grid lines switched back on.
# NOTE(review): `element_line(size = )` is deprecated in favour of
# `linewidth` from ggplot2 3.4.0 on; switch once the installed version is
# confirmed.
theme_set(
  theme_classic(base_size = base_size) +
    theme(
      strip.background = element_blank(),
      plot.title = element_text(size = title_size, hjust = 0),
      strip.text.x = element_text(size = facet_size),
      axis.title = element_text(size = 10),
      axis.text = element_text(size = 8, color = "black"),
      legend.position = "bottom",
      axis.line = element_line(size = 0.2),
      axis.ticks = element_line(size = 0.2),
      panel.grid.major = element_line()
    )
)

In [None]:
# Colour palette definitions, read from a JSON file in the working directory.
colors_dict <- read_json("colors.json")

# Model label -> plot colour. Every label currently resolves to the same
# palette entry (pastel / red / mid).
color_map <- c(
  "Identity(AgeSex)+MLP"         = colors_dict$pastel$red$mid,
  "Identity(Records)+MLP"        = colors_dict$pastel$red$mid,
  "GNN(Records)+MLP"             = colors_dict$pastel$red$mid,
  "Identity(AgeSex+Records)+MLP" = colors_dict$pastel$red$mid,
  "GNN(AgeSex+Records)+MLP"      = colors_dict$pastel$red$mid
)

In [None]:
# Read the phecode endpoint definitions and sort the rows by `endpoint`.
endpoint_defs <- glue("{output_path}/phecode_defs_220306.feather") %>%
  arrow::read_feather() %>%
  arrange(endpoint)

In [None]:
# Lookup vector: names are endpoint identifiers, values are the matching
# `phecode_string` entries.
endpoint_map <- setNames(endpoint_defs$phecode_string, endpoint_defs$endpoint)
#endpoint_order_freq = (outcome_freq %>% arrange(desc(freq)))$endpoint

In [None]:
# Endpoint identifiers ordered by the numeric value of `phecode`.
# NOTE(review): if `phecode` is character, entries that do not parse as a
# number become NA (with a warning) and are sorted last by arrange() —
# confirm the column format.
endpoint_order <- endpoint_defs %>%
  arrange(as.numeric(phecode)) %>%
  pull(endpoint)

In [None]:
# Hand-picked subset of endpoint identifiers (23 phecodes plus one OMOP
# concept).
endpoint_selection <- c(
  "phecode_164",
  "phecode_179",
  "phecode_202-2",
  "phecode_284",
  "phecode_401",
  "phecode_404-1",
  "phecode_413-21",
  "phecode_413-32",
  "phecode_416-21",
  "phecode_420",
  "phecode_424",
  "phecode_431-1",
  "phecode_438-6",
  "phecode_440-3",
  "phecode_444-13",
  "phecode_448",
  "phecode_468",
  "phecode_474",
  "phecode_542-4",
  "phecode_583",
  "phecode_665-1",
  "phecode_705-1",
  "phecode_718",
  "OMOP_4306655"
)

## Load data

# Load Benchmarks

In [None]:
# Read the selected columns of this experiment's `hrs_endpoints` feather file
# (presumably hazard ratios per endpoint, score and partition — confirm) and
# attach the endpoint definitions.
name <- "hrs_endpoints"
hrs_endpoints <- glue("{experiment_path}/{name}.feather") %>%
  arrow::read_feather(
    col_select = c("endpoint", "score", "partition", "hrs_mh", "hrs_mh_age", "hrs_mh_sex")
  ) %>%
  # No `by` is supplied, so the join uses every column name the two tables
  # have in common.
  left_join(endpoint_defs)

In [None]:
# Display the joined table for a visual check.
hrs_endpoints

In [None]:
# Reduce the joined table to the columns used downstream: endpoint and score
# identifiers, the three `hrs_*` value columns, and the endpoint's descriptive
# string and category. No grouping is needed for a plain column selection.
hrs <- hrs_endpoints %>%
  # Disabled refinements kept for reference:
  #filter(score %in% c("MedicalHistory", "Age+Sex+MedicalHistory", "Age+Sex+MedicalHistory+I(Age*MH)")) %>%
  #mutate(endpoint = factor(endpoint, levels=endpoints_sorted)) %>%
  #mutate(phecode_category = factor(phecode_category, levels=pcats_sorted)) %>%
  select(
    endpoint, score,
    hrs_mh, hrs_mh_age, hrs_mh_sex,
    phecode_string, phecode_category
  ) %>%
  # Defensive: guarantees an ungrouped result whatever the input's grouping.
  ungroup()
hrs

In [None]:
# List the distinct score labels present in `hrs`.
unique(hrs[["score"]])

In [None]:
# Bootstrap the rows of `hrs`: each replicate resamples, with replacement,
# as many rows as each (endpoint, score) group contains. Replicates are
# stacked row-wise and labelled by their index in column `i`.
n_bootstrap <- 100L

# Fix the RNG seed so the resamples, and every table derived from them, are
# identical across runs.
set.seed(42)

temp_hrs <- purrr::map_dfr(
  seq_len(n_bootstrap),
  ~ hrs %>%
    group_by(endpoint, score) %>%
    sample_frac(1, replace = TRUE),
  .id = "i"
)

In [None]:
library(ggdist)

In [None]:
# Median and quantile interval (median_qi() defaults) of the three `hrs_*`
# columns for every endpoint/score combination, ignoring missing values.
# NOTE(review): `i` is not a grouping variable, so rows from all bootstrap
# replicates are pooled before the quantiles are taken — confirm this is the
# intended summary.
temp_hrs_mh <- temp_hrs %>%
  group_by(endpoint, score, phecode_string, phecode_category) %>%
  median_qi(hrs_mh, hrs_mh_age, hrs_mh_sex, na.rm = TRUE)

In [None]:
# Scores reported in the table, in the order of the output columns.
linear_scores <- c(
  "MedicalHistory",
  "Age+Sex+MedicalHistory",
  "Age+Sex+Comorbidities+MedicalHistory",
  "ASCVD+MedicalHistory"
)

# One row per endpoint and one column per score, each cell formatted as
# "median (lower, upper)" of `hrs_mh` over the resampled rows. Rows are
# ordered by `endpoint_order`.
temp_hrs_linear <- temp_hrs %>%
  ungroup() %>%
  filter(score %in% linear_scores) %>%
  group_by(endpoint, score, phecode_string, phecode_category) %>%
  median_qi(hrs_mh, na.rm = TRUE) %>%
  rename(hr = hrs_mh) %>%
  mutate(
    hr = round(hr, 2),
    .lower = round(.lower, 2),
    .upper = round(.upper, 2)
  ) %>%
  # sprintf("%.2f") always prints two decimals; plain string interpolation of
  # a rounded number drops trailing zeros (1.50 -> "1.5"), which made the
  # table cells inconsistent.
  mutate(HR_MedicalHistory = sprintf("%.2f (%.2f, %.2f)", hr, .lower, .upper)) %>%
  select(endpoint, score, phecode_string, phecode_category, HR_MedicalHistory) %>%
  pivot_wider(names_from = "score", values_from = "HR_MedicalHistory") %>%
  # all_of() errors if any expected score column is absent.
  select(all_of(c("phecode_category", "endpoint", "phecode_string", linear_scores))) %>%
  mutate(endpoint = factor(endpoint, levels = endpoint_order)) %>%
  arrange(endpoint) %>%
  #mutate(endpoint = recode(endpoint, !!!endpoint_map)) %>%
  ungroup()

temp_hrs_linear

In [None]:
# Show the table sorted alphabetically by endpoint identifier instead of by
# factor level.
arrange(temp_hrs_linear, as.character(endpoint))

In [None]:
# Export the table as CSV. The target directory is created first so the
# write does not fail on a fresh checkout; dir.create() is silent when the
# directory already exists.
dir.create("outputs", showWarnings = FALSE, recursive = TRUE)
temp_hrs_linear %>%
  write_csv("outputs/SupplTable6_HRsAll.csv")