# 2. Exploration

In [None]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(glue)
library(cowplot)
library(survminer)
library(survival)
library(ggsci)
library(arsenal)
library(yaml)

In [None]:
# Choose the project root from the host name: hosts whose node name contains
# the literal substring "sc" use the /sc-projects mount, all others use the
# shared analysis mount.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
  "/sc-projects/sc-proj-ukb-cvd"
} else {
  "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Input dataset locations.
dataset_name <- "210714_metabolomics"
path <- "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"
data_path <- glue("{base_path}/data")
dataset_path <- glue("{data_path}/3_datasets_post/{dataset_name}")

# Output locations for this project.
project_label <- "21_metabolomics_multitask"
project_path <- glue("{base_path}/results/projects/{project_label}")
figures_path <- glue("{project_path}/figures")
data_results_path <- glue("{project_path}/data")

figure_path <- glue("{figures_path}/Figure1")

In [None]:
library(data.table)
# Metabolite lookup table; later cells read its `abbreviation`, `UKB_names`,
# `BBMRI_names`, `machine_readable_name` and `name` columns.
metabolite_map <- fread(
  input = "ExternalCohorts/metabolite_map.csv",
  header = TRUE
)

## UKB

In [None]:
# Load the merged UKB dataset from the feather file under `dataset_path`.
data <- arrow::read_feather(
  file = glue("{dataset_path}/data_merged.feather")
)

In [None]:
# Lookup: UKB column name -> shared metabolite abbreviation.
ukb_abbreviation_map <- setNames(
  metabolite_map$abbreviation,
  metabolite_map$UKB_names
)

# NMR measurements of UKB participants flagged with NMR_FLAG, one row per
# participant, with metabolite columns renamed to the shared abbreviations.
# The columns are renamed in place instead of going through a
# pivot_longer() -> recode() -> pivot_wider() round trip, which would build a
# (participants x metabolites)-row intermediate table only to rename columns.
nmr_ukb <- data %>%
  filter(NMR_FLAG == TRUE) %>%
  mutate(id = as.character(eid)) %>%
  select(id, all_of(metabolite_map$UKB_names)) %>%
  rename_with(
    ~ unname(ukb_abbreviation_map[.x]),
    .cols = all_of(metabolite_map$UKB_names)
  ) %>%
  mutate(biobank = "UKB")
nmr_ukb

## WHII

In [None]:
library("readxl")

In [None]:
# Biomarker naming sheet; later cells use its `CSV column name`,
# `Excel column name`, `Biomarker name` and `Unit` columns.
metabolite_map_whii <- read_excel(
  path = "ExternalCohorts/csv-excel-biomarker names.xlsx"
)

In [None]:
# Attach the WHII CSV column name and unit to every metabolite in the map by
# matching on the biomarker name, then keep only metabolites that found a
# match. The NA check is explicit: the previous `WHII_name == WHII_name`
# dropped unmatched rows only as a side effect of `NA == NA` being NA.
metabolite_map_units <- metabolite_map %>%
  left_join(
    metabolite_map_whii %>%
      mutate(WHII_name = `CSV column name`) %>%
      select(WHII_name, `Biomarker name`, Unit),
    by = c("name" = "Biomarker name")
  ) %>%
  filter(!is.na(WHII_name))

In [None]:
# Read the anonymized WHII metabolite export and add a running row index as
# `id`.
nmr_whii_raw <- fread(
  "/sc-projects/sc-proj-ukb-cvd/data/3_datasets_post/210714_metabolomics/review_round1/whii_metabolites_anonymized.csv"
) %>%
  mutate(id = row_number())
head(nmr_whii_raw)

In [None]:
# Lookup: WHII (machine readable) column name -> shared metabolite
# abbreviation.
whii_abbreviation_map <- setNames(
  metabolite_map$abbreviation,
  metabolite_map$machine_readable_name
)

In [None]:
# WHII NMR measurements, one row per sample, with metabolite columns renamed
# to the shared abbreviations. `id` is taken from the `V1` column of the raw
# file. Columns are renamed in place rather than through a
# pivot_longer() -> recode() -> pivot_wider() round trip; as_tibble() keeps
# the result a tibble, as the pivot-based version returned.
nmr_whii <- nmr_whii_raw %>%
  mutate(id = as.character(V1)) %>%
  select(id, all_of(metabolite_map$machine_readable_name)) %>%
  as_tibble() %>%
  rename_with(
    ~ unname(whii_abbreviation_map[.x]),
    .cols = all_of(metabolite_map$machine_readable_name)
  ) %>%
  mutate(biobank = "WHII")
head(nmr_whii)

## BBMRI

In [None]:
# The file carries an .RData extension but is read with readRDS(); the BBMRI
# measurements are stored under `inputs$data` of the loaded object.
nmr_bbmri_raw <- readRDS(
  file = "/home/steinfej/code/MetabolomicsCommonDiseases/2_analysis/ExternalCohorts/metabolites_summary_statistics_220408.RData"
)$inputs$data

In [None]:
# Expose the row names as an explicit `id` column.
nmr_bbmri_raw[["id"]] <- rownames(nmr_bbmri_raw)

In [None]:
# Number of rows per BBMRI biobank.
nmr_bbmri_raw %>%
  group_by(biobank) %>%
  tally()

In [None]:
# Lookup: BBMRI column name -> shared metabolite abbreviation.
bbmri_abbreviation_map <- setNames(
  metabolite_map$abbreviation,
  metabolite_map$BBMRI_names
)

# BBMRI measurements with metabolite columns renamed to the shared
# abbreviations; all other columns are kept. Columns are renamed in place
# rather than through a pivot_longer() -> recode() -> pivot_wider() round
# trip. as_tibble() and relocate() reproduce the shape the pivot-based version
# returned: a tibble with the metabolite columns placed after all other
# columns, in metabolite_map order.
nmr_bbmri <- nmr_bbmri_raw %>%
  as_tibble() %>%
  select(id, everything()) %>%
  rename_with(
    ~ unname(bbmri_abbreviation_map[.x]),
    .cols = all_of(metabolite_map$BBMRI_names)
  ) %>%
  relocate(all_of(unname(bbmri_abbreviation_map)), .after = last_col())

### All cohorts

In [None]:
# Stack the three cohorts; columns missing in a cohort are filled with NA by
# bind_rows().
nmr_all <- bind_rows(list(nmr_ukb, nmr_bbmri, nmr_whii))

In [None]:
# Per-biobank distribution table: for every metabolite, the median and the
# 25%/75% quantiles within each biobank, spread into one column per
# "<biobank> (n=<rows>)_<statistic>" and labelled "<metabolite> (<unit>)".
#
# Changes from the previous version:
# - each quantile is its own scalar summary, instead of a multi-row
#   summarise() (deprecated since dplyr 1.1.0) followed by pivot_wider();
# - the deprecated tidyselect `current_vars()` is replaced by `names(.)`;
# - the `IQR` string column, which was computed and then dropped by the
#   following select(), is no longer computed.
nmr_all_calc <- nmr_all %>%
  pivot_longer(
    all_of(metabolite_map$abbreviation),
    names_to = "metabolite",
    values_to = "values"
  ) %>%
  group_by(biobank, metabolite) %>%
  summarise(
    # n counts rows (including missing values) per biobank and metabolite.
    n = n(),
    Q25 = quantile(values, 0.25, na.rm = TRUE, names = FALSE),
    Median = quantile(values, 0.5, na.rm = TRUE, names = FALSE),
    Q75 = quantile(values, 0.75, na.rm = TRUE, names = FALSE),
    .groups = "drop"
  ) %>%
  mutate(bb_label = glue("{biobank} (n={n})")) %>%
  select(bb_label, metabolite, Median, Q25, Q75) %>%
  pivot_wider(
    names_from = "bb_label",
    values_from = c("Median", "Q25", "Q75"),
    names_glue = "{bb_label}_{.value}"
  ) %>%
  # Sort columns alphabetically so each biobank's statistics sit together,
  # then move the metabolite key to the front.
  select(all_of(sort(names(.)))) %>%
  select(metabolite, everything()) %>%
  arrange(metabolite) %>%
  left_join(
    metabolite_map_whii %>% select(`Excel column name`, Unit),
    by = c("metabolite" = "Excel column name")
  ) %>%
  mutate(label = glue("{metabolite} ({Unit})")) %>%
  select(-metabolite, -Unit) %>%
  select(label, everything())

In [None]:
# Persist the distribution table as CSV.
write_csv(nmr_all_calc, "outputs/nmr_dist.csv")

In [None]:
library(gtsummary)

In [None]:
# Summary table of all columns, stratified by biobank.
# The per-sample `id` column is dropped first: tbl_summary() summarises every
# remaining column, and a character identifier would be tabulated as a
# categorical variable with one level per sample.
nmr_table <- nmr_all %>%
  select(-id) %>%
  tbl_summary(by = "biobank", missing = "no") %>%
  bold_labels()

In [None]:
library(gt)
# Render the summary table with gt and save it as an HTML file.
plot_name <- "nmr_dist"
nmr_table %>%
  as_gt() %>%
  tab_header(title = "NMR metabolites") %>%
  gt::gtsave(filename = glue("outputs/{plot_name}.html"))

# Observation time

In [None]:
# Notebook plot dimensions.
plot_width <- 10
plot_height <- 3
plot_dpi <- 300
options(repr.plot.width = plot_width, repr.plot.height = plot_height)

# Global ggplot theme.
# NOTE(review): `base_size` and `facet_size` are not defined in this part of
# the notebook; they must already exist in the session when this cell runs.
theme_set(
  theme_classic(base_size = base_size) +
    theme(
      strip.background = element_blank(),
      plot.title = element_text(size = 25, hjust = 0),
      strip.text.x = element_text(size = facet_size),
      legend.position = "bottom",
      axis.line = element_line(size = 0.2),
      axis.ticks = element_line(size = 0.2)
    )
)


In [None]:
# Density of the observation time among participants with MACE_event == 0,
# with the median marked by a dashed line and printed as text.
#
# Changes from the previous version:
# - the dashed line is drawn at the median, so it agrees with the "Median"
#   annotation (it was previously drawn at the mean);
# - the line position is a constant `xintercept` rather than an aes() mapping,
#   which produced one overlapping line per data row;
# - the statistic is no longer stored in a variable called `mean`, which
#   shadowed base::mean.
# NOTE(review): `geom_text_size` is not defined in this part of the notebook;
# it must already exist in the session.
temp <- data %>%
  filter(MACE_event == 0) %>%
  select(c(eid, MACE_event_time))
median_obs_time <- median(temp$MACE_event_time)
median_label <- round(median_obs_time, 1)
obs_time <- ggplot(temp, aes(x = MACE_event_time)) +
  ggtitle("Observation Time") +
  geom_density(fill = "gray70", alpha = 0.5) +
  labs(x = "Years", y = "Density") +
  geom_vline(
    xintercept = median_obs_time,
    color = "black", linetype = "dashed", size = 1
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_cartesian(xlim = c(0, 15)) +
  annotate(
    "text",
    x = median_label - 5, y = 0.2,
    label = paste0("Median: ", median_label, " years"),
    size = geom_text_size
  )
obs_time

# Aggregated person years

In [None]:
# Long table of per-endpoint event times: one row per participant and
# endpoint, with the time stored as `person_years`.
# NOTE(review): uses `endpoint_map`, which this notebook only defines in the
# "Endpoint frequencies" section further down; that cell has to run first.
py_data <- data %>%
  select("eid", all_of(paste0(names(endpoint_map), "_event_time"))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "person_years") %>%
  mutate(endpoint = str_remove_all(endpoint, "_event"))

In [None]:
# Total person-years: take each participant's largest `person_years` across
# endpoints (rows are sorted descending, then the first row per participant is
# kept) and sum those values.
py_data %>%
  group_by(eid) %>%
  arrange(desc(person_years)) %>%
  slice(1) %>%
  ungroup() %>%
  summarise(sum_years = sum(person_years))

# Endpoint frequencies

In [None]:
library(ggthemes)
# Display labels keyed by endpoint column prefix. The keys are used verbatim
# to build column names elsewhere, so their spelling must not be altered.
endpoint_map <- c(
  "M_MACE" = "MACE",
  "M_all_cause_dementia" = "Dementia",
  "M_type_2_diabetes" = "T2 Diabetes",
  "M_liver_disease" = "Liver Disease",
  "M_renal_disease" = "Renal Disease",
  "M_atrial_fibrillation" = "Atrial Fibrillation",
  "M_heart_failure" = "Heart Failure",
  "M_coronary_heart_disease" = "CHD",
  "M_venous_thrombosis" = "Ven. Thrombosis",
  "M_cerebral_stroke" = "Cerebral Stroke",
  "M_abdominal_aortic_aneurysm" = "AAA",
  "M_peripheral_arterial_disease" = "PAD",
  "M_chronic_obstructuve_pulmonary_disease" = "COPD",
  "M_asthma" = "Asthma",
  "M_parkinsons_disease" = "Parkinson's",
  "M_lung_cancer" = "Lung Cancer",
  "M_non_melanoma_skin_cancer" = "Skin Cancer",
  "M_colon_cancer" = "Colon Cancer",
  "M_rectal_cancer" = "Rectal Cancer",
  "M_prostate_cancer" = "Prostate Cancer",
  "M_breast_cancer" = "Breast Cancer",
  "M_cataracts" = "Cataracts",
  "M_glaucoma" = "Glaucoma",
  "M_fractures" = "Fractures"
)

# Order in which endpoints are laid out when used as factor levels.
endpoint_order <- c(
  "M_MACE",
  "M_coronary_heart_disease",
  "M_cerebral_stroke",
  "M_all_cause_dementia",
  "M_heart_failure",
  "M_atrial_fibrillation",
  "M_type_2_diabetes",
  "M_liver_disease",
  "M_renal_disease",
  "M_peripheral_arterial_disease",
  "M_venous_thrombosis",
  "M_abdominal_aortic_aneurysm",
  "M_chronic_obstructuve_pulmonary_disease",
  "M_asthma",
  "M_parkinsons_disease",
  "M_cataracts",
  "M_glaucoma",
  "M_fractures",
  "M_lung_cancer",
  "M_non_melanoma_skin_cancer",
  "M_colon_cancer",
  "M_rectal_cancer",
  "M_prostate_cancer",
  "M_breast_cancer"
)

In [None]:
# Mean of every `M_*_event` indicator column (columns containing "comp" are
# excluded), restricted to the endpoints of interest and ordered as a factor
# by `endpoint_order`.
# NOTE(review): `endpoints` is not defined in this part of the notebook; it
# must already exist in the session — confirm it should not be
# `names(endpoint_map)`.
# NOTE(review): `-contains("-time")` uses a hyphen; column names elsewhere use
# "_time", so this exclusion presumably never matches — confirm intent.
temp_endpoints <- data %>%
  select(starts_with("M_")) %>%
  select(ends_with("_event"), -contains("-time"), -contains("comp")) %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(frequency = mean(value)) %>%
  arrange(frequency) %>%
  ungroup() %>%
  as.data.frame() %>%
  mutate(name = str_remove_all(name, "_event")) %>%
  filter(name %in% endpoints) %>%
  mutate(name = factor(name, levels = endpoint_order))

In [None]:
# Incident indicator per participant and endpoint (from the `*_event`
# columns), in long format.
inc_data <- data %>%
  select("eid", all_of(paste0(names(endpoint_map), "_event"))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "Incident") %>%
  mutate(endpoint = str_remove_all(endpoint, "_event"))

# Prevalent indicator per participant and endpoint (from the bare endpoint
# columns), coerced to integer.
prev_data <- data %>%
  select(eid, all_of(names(endpoint_map))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "Prevalent") %>%
  mutate(Prevalent = as.integer(Prevalent))

# Combine both indicators and replace endpoint keys by their display labels.
agg_data <- prev_data %>%
  left_join(inc_data, by = c("eid", "endpoint")) %>%
  mutate(endpoint = recode(endpoint, !!!endpoint_map))
head(agg_data)

In [None]:
# Incident event frequency per endpoint among participants with
# Prevalent == 0; `f` is the same value as a percentage rounded to two
# decimals.
agg_data %>%
  filter(Prevalent == 0) %>%
  group_by(endpoint) %>%
  summarise(frequency = sum(Incident) / n()) %>%
  arrange(frequency) %>%
  mutate(f = round(frequency * 100, 2))