# 2. Exploration

In [None]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(glue)
library(cowplot)
library(survminer)
library(survival)
library(ggsci)
library(arsenal)
library(yaml)

#setwd("/")
#path = "/home/steinfej/projects/uk_biobank/"
#dataset_path = "data/datasets/cvd_massive_excl_emb_ind"

In [None]:
# Root directory of the user account that holds the project data.
base_path <- "/home/jakobs"

# Data directory, experiment tag, and the experiment's sub-directory.
project_path <- glue("{base_path}/data")
experiment <- "230323"
experiment_path <- glue("{project_path}/{experiment}")

## Load data

In [None]:
# Read the endpoint definition table and keep its `endpoint` column
# as a plain vector for later filtering.
endpoint_defs <- arrow::read_feather(
  glue("{base_path}/data/endpoints_epic_md.feather")
)
endpoints <- endpoint_defs$endpoint

In [None]:
# Read the outcomes table from its feather file.
data_outcomes <- arrow::read_feather(
  glue("{base_path}/data/data_outcomes_long_230320.feather")
)

In [None]:
# Preview the first rows of the outcomes table.
head(data_outcomes)

In [None]:
# Coerce the `prev` and `event` indicator columns to integer.
data_outcomes <- mutate(
  data_outcomes,
  prev = as.integer(prev),
  event = as.integer(event)
)

In [None]:
# Per-endpoint mean of `event` among rows with `prev == 0`.
data_outcomes_agg <- data_outcomes %>%
  filter(prev == 0) %>%
  group_by(endpoint) %>%
  summarise(ratio = mean(event))

In [None]:
# Covariate table of the current experiment, without the `eid` column.
table_data <- arrow::read_feather(
  glue("{experiment_path}/data_covariates_full.feather")
) %>%
  select(-eid)

In [None]:
library(gtsummary)
# Summary table of selected covariates, split by sex, with an overall
# column appended last and bold variable labels.
table1 <- table_data %>%
  select(
    age, sex, smoking_status,
    systolic_blood_pressure, cholesterol, hdl_cholesterol
  ) %>%
  # Disabled for now: recoding alcohol_intake_frequency to a 0/1 flag for
  # "Daily or almost daily", plus labels for alcohol intake and BMI.
  tbl_summary(
    by = "sex",
    label = list(
      age = "Age at Recruitment",
      smoking_status = "Smoker status",
      systolic_blood_pressure = "Systolic Blood Pressure (mmHg)",
      cholesterol = "Total Cholesterol (mmol/L)",
      hdl_cholesterol = "HDL Cholesterol (mmol/L)",
      # NOTE(review): `diabetes` is not among the selected columns above;
      # confirm whether this label (or the column) is still wanted.
      diabetes = "Diabetes mellitus"
    ),
    missing = "no"
  ) %>%
  add_overall(last = TRUE) %>%
  bold_labels()

In [None]:
library(gt)
# Convert the summary table to a gt object and write it out as HTML.
plot_name <- "SupplTable8_EPIC_Table1"
gt::gtsave(as_gt(table1), glue("outputs/{plot_name}.html"))
# Disabled alternative: add tab_header(title = "Table 1") and save under
# glue("{figure_path}/{plot_name}.html") instead.

In [None]:
# Render the summary table inline as a gt object.
as_gt(table1)

# BASELINE

In [None]:
# Set the notebook's inline plot dimensions.
options(
  repr.plot.width = 20,
  repr.plot.height = 8
)

# OBSERVATION TIME

In [None]:
# Font sizes shared by the themes and text layers defined further down.
base_size <- 25
title_size <- 35
facet_size <- 25
geom_text_size <- 6
library(ggplot2); 

In [None]:
# Plot dimensions for the next figure, also applied to inline rendering.
plot_width <- 10
plot_height <- 3
plot_dpi <- 300
options(repr.plot.width = plot_width, repr.plot.height = plot_height)

# Global ggplot theme: classic base, blank facet strips, left-aligned title,
# legend at the bottom, thin axis lines and ticks.
theme_set(
  theme_classic(base_size = base_size) +
    theme(
      strip.background = element_blank(),
      plot.title = element_text(size = 25, hjust = 0),
      strip.text.x = element_text(size = facet_size)
    ) +
    theme(
      legend.position = "bottom",
      axis.line = element_line(size = 0.2),
      axis.ticks = element_line(size = 0.2)
    )
)

#expand = c(0.01, 0.8)

In [None]:
# Density of MACE_event_time for rows with MACE_event == 0, with the median
# marked by a dashed line and a text annotation.
# NOTE(review): `data` is not created in the visible cells; presumably a wide
# data frame loaded elsewhere in the notebook. Confirm.
temp <- data %>%
  filter(MACE_event == 0) %>%
  select(c(eid, MACE_event_time))

# Exact median positions the reference line; the rounded value is used for the
# label. The value is no longer stored in a variable called `mean`, which
# shadowed base::mean and mislabelled a median.
median_time <- median(temp$MACE_event_time)
median_label <- round(median_time, 1)

obs_time <- ggplot(temp, aes(x = MACE_event_time)) +
  ggtitle("Observation Time") +
  geom_density(fill = "gray70", alpha = 0.5) +
  labs(x = "Years", y = "Density") +
  # Previously drawn at mean(MACE_event_time) while the annotation reported the
  # median; the line now marks the same statistic as the label.
  geom_vline(
    xintercept = median_time,
    color = "black", linetype = "dashed", size = 1
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_cartesian(xlim = c(0, 15)) +
  annotate(
    "text",
    x = median_label - 5, y = 0.2,
    label = paste0("Median: ", median_label, " years"),
    size = geom_text_size
  )
obs_time

# ENDPOINTS

## Kaplan-Meier estimates

In [None]:
library(ggthemes)
# Display label for each endpoint identifier.
# NOTE(review): the "obstructuve" spelling in the COPD key is kept as is;
# presumably it matches the column names in the data. Confirm before fixing.
endpoint_map <- c(
  "M_MACE" = "MACE",
  "M_all_cause_dementia" = "Dementia",
  "M_type_2_diabetes" = "T2 Diabetes",
  "M_liver_disease" = "Liver Disease",
  "M_renal_disease" = "Renal Disease",
  "M_atrial_fibrillation" = "Atrial Fibrillation",
  "M_heart_failure" = "Heart Failure",
  "M_coronary_heart_disease" = "CHD",
  "M_venous_thrombosis" = "Ven. Thrombosis",
  "M_cerebral_stroke" = "Cerebral Stroke",
  "M_abdominal_aortic_aneurysm" = "AAA",
  "M_peripheral_arterial_disease" = "PAD",
  "M_chronic_obstructuve_pulmonary_disease" = "COPD",
  "M_asthma" = "Asthma",
  "M_parkinsons_disease" = "Parkinson's",
  "M_lung_cancer" = "Lung Cancer",
  "M_non_melanoma_skin_cancer" = "Skin Cancer",
  "M_colon_cancer" = "Colon Cancer",
  "M_rectal_cancer" = "Rectal Cancer",
  "M_prostate_cancer" = "Prostate Cancer",
  "M_breast_cancer" = "Breast Cancer",
  "M_cataracts" = "Cataracts",
  "M_glaucoma" = "Glaucoma",
  "M_fractures" = "Fractures"
)

# Order in which endpoints are used as factor levels.
endpoint_order <- c(
  "M_MACE",
  "M_coronary_heart_disease",
  "M_cerebral_stroke",
  "M_all_cause_dementia",
  "M_heart_failure",
  "M_atrial_fibrillation",
  "M_type_2_diabetes",
  "M_liver_disease",
  "M_renal_disease",
  "M_peripheral_arterial_disease",
  "M_venous_thrombosis",
  "M_abdominal_aortic_aneurysm",
  "M_chronic_obstructuve_pulmonary_disease",
  "M_asthma",
  "M_parkinsons_disease",
  "M_cataracts",
  "M_glaucoma",
  "M_fractures",
  "M_lung_cancer",
  "M_non_melanoma_skin_cancer",
  "M_colon_cancer",
  "M_rectal_cancer",
  "M_prostate_cancer",
  "M_breast_cancer"
)

In [None]:
# Mean of every "M_*_event" column, reshaped to one row per endpoint and
# restricted to the endpoints listed in `endpoints`.
# NOTE(review): contains("-time") uses a hyphen while the other column names
# in this notebook use underscores; it may be a typo for "_time". Confirm.
temp_endpoints <- data %>%
  select(starts_with("M_")) %>%
  select(ends_with("_event"), -contains("-time"), -contains("comp")) %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(frequency = mean(value)) %>%
  arrange(frequency) %>%
  ungroup() %>%
  as.data.frame() %>%
  mutate(name = str_remove_all(name, "_event")) %>%
  filter(name %in% endpoints) %>%
  mutate(name = factor(name, levels = endpoint_order))

In [None]:
# Long table of the "<endpoint>_event" columns: one row per eid and endpoint.
inc_data <- data %>%
  select("eid", all_of(paste0(names(endpoint_map), "_event"))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "Incident") %>%
  mutate(endpoint = str_remove_all(endpoint, "_event"))

# Long table of the columns named after the endpoints themselves.
prev_data <- data %>%
  select(eid, all_of(names(endpoint_map))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "Prevalent") %>%
  mutate(Prevalent = as.integer(Prevalent))

# Join both tables and replace endpoint identifiers with display labels.
agg_data <- prev_data %>%
  left_join(inc_data, by = c("eid", "endpoint")) %>%
  mutate(endpoint = recode(endpoint, !!!endpoint_map))
head(agg_data)

In [None]:
# Share of rows with Prevalent == 0 that have an incident event, per endpoint,
# sorted ascending; `f` is the same share as a percentage with two decimals.
agg_data %>%
  filter(Prevalent == 0) %>%
  group_by(endpoint) %>%
  summarise(frequency = sum(Incident) / n()) %>%
  arrange(frequency) %>%
  mutate(f = round(frequency * 100, 2))

In [None]:
# Distribution of the number of incident events per eid: `n` counts the eids
# for each event count and `freq` is that count as a percentage of all eids.
agg_data %>%
  filter(Prevalent == 0) %>%
  group_by(eid) %>%
  summarise(frequency = sum(Incident)) %>%
  mutate(sum = n()) %>%
  group_by(frequency, sum) %>%
  tally() %>%
  mutate(freq = n * 100 / sum)

In [None]:
# NOTE(review): hard-coded counts, presumably copied from a tally printed
# earlier in the notebook; confirm their source before reusing the result.
117981-67735

In [None]:
# NOTE(review): hard-coded percentage, presumably a `freq` value from a tally
# printed earlier in the notebook; confirm its source before reusing the result.
100-57.411786644

# person years

In [None]:
# One row per eid: the row with the largest `time` value.
tmp <- data_outcomes %>%
  select(eid, time) %>%
  group_by(eid) %>%
  arrange(desc(time)) %>%
  slice(1) %>%
  ungroup()

In [None]:
# Sum of `time` over the eids listed in eids_with_retina$retina_eids.
tmp %>%
  filter(eid %in% eids_with_retina$retina_eids) %>%
  summarise(sum_years = sum(time))

In [None]:
# Median of `time` over the eids listed in eids_with_retina$retina_eids.
# The output column was previously called `sum_years`, which mislabelled the
# statistic; it is now named after what it holds.
tmp %>%
  filter(eid %in% eids_with_retina$retina_eids) %>%
  summarise(median_years = median(time))

In [None]:
# Long table of the "<endpoint>_event_time" columns: one row per eid and
# endpoint.
# NOTE(review): only "_event" is stripped from the column names, so the
# `endpoint` values keep a "_time" suffix; confirm whether that is intended.
py_data <- data %>%
  select("eid", all_of(paste0(names(endpoint_map), "_event_time"))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "person_years") %>%
  mutate(endpoint = str_remove_all(endpoint, "_event"))

In [None]:
# Take each eid's largest person_years value and sum those values.
py_data %>%
  group_by(eid) %>%
  arrange(desc(person_years)) %>%
  slice(1) %>%
  ungroup() %>%
  summarise(sum_years = sum(person_years))

In [None]:
# Mean of Incident per endpoint among rows with Prevalent == 0, ascending.
agg_data %>%
  filter(Prevalent == 0) %>%
  group_by(endpoint) %>%
  summarise(frequency = mean(Incident)) %>%
  arrange(frequency) %>%
  ungroup()

In [None]:
library(ggrepel)

In [None]:
# Show the endpoint frequencies with a display label column added.
mutate(temp_endpoints, label = recode(name, !!!endpoint_map))

In [None]:
# Plot dimensions for the endpoint frequency figure.
plot_width <- 10
plot_height <- 12
plot_dpi <- 320
options(repr.plot.width = plot_width, repr.plot.height = plot_height)

# Horizontal bar chart of endpoint frequencies. Bars are ordered by frequency
# and the endpoint label is printed next to each bar in place of axis text.
endpoint_freq <- ggplot(
  mutate(temp_endpoints, label = recode(name, !!!endpoint_map))
) +
  ggtitle("Endpoint Frequency") +
  labs(x = "Endpoint", y = "Frequency") +
  geom_bar(
    stat = "identity",
    aes(x = fct_rev(fct_reorder(name, desc(frequency))), y = frequency),
    size = 1,
    alpha = 0.7
  ) +
  coord_flip(ylim = c(0, 0.16)) +
  theme(legend.position = "none") +
  scale_x_discrete(labels = endpoint_map) +
  scale_y_continuous(expand = c(0, 0), labels = scales::percent) +
  geom_text(
    aes(x = name, y = frequency + 0.003, label = stringr::str_wrap(label, 20)),
    size = 7,
    hjust = 0
  ) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
endpoint_freq

In [None]:
library(patchwork)
# Global ggplot theme for the composite figure: classic base, blank facet
# strips, larger title, axis text at base size, legend at the bottom, thin
# axis lines and ticks.
theme_set(
  theme_classic(base_size = base_size) +
    theme(
      strip.background = element_blank(),
      plot.title = element_text(size = title_size, hjust = 0),
      axis.text.y = element_text(size = base_size),
      axis.text.x = element_text(size = base_size),
      strip.text.x = element_text(size = facet_size)
    ) +
    theme(
      legend.position = "bottom",
      axis.line = element_line(size = 0.2),
      axis.ticks = element_line(size = 0.2)
    )
)

In [None]:
# Load the flowchart image as a native raster.
flowchart <- png::readPNG(
  source = "UKB_Flowchart1.png",
  native = TRUE
)

In [None]:
options(repr.plot.width = plot_width, repr.plot.height = plot_height)

# Right-hand panel: the endpoint frequency plot with a centred title.
# Stacking obs_time above it via plot_layout(heights = c(2, 10)) is disabled.
right <- endpoint_freq +
  theme(plot.title = element_text(size = 25, hjust = 0.5))

In [None]:
library(patchwork)

In [None]:
# Dimensions of the composite figure.
plot_width <- 20
plot_height <- 12
plot_dpi <- 320
options(repr.plot.width = plot_width, repr.plot.height = plot_height)

# Empty spacer on the left, endpoint panel on the right.
# Title, subtitle and tag annotations via plot_annotation() are disabled.
fig1 <- plot_spacer() | right
fig1

In [None]:
# Write the composite figure to PDF.
# NOTE(review): `figure_path` is not defined in the visible cells; confirm it
# is set before this cell runs.
plot_name <- "Figure1"
ggsave(
  filename = glue("{figure_path}/{plot_name}.pdf"),
  plot = fig1,
  width = plot_width,
  height = plot_height,
  dpi = plot_dpi,
  device = "pdf"
)