# 2. Exploration

In [None]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(glue)
library(cowplot)
library(survminer)
library(survival)
library(ggsci)
library(arsenal)
library(yaml)

In [None]:
# Pick the project root from the host name: hosts whose node name contains
# "sc" use the sc-projects mount, every other host uses the shared cardioRS mount.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
  "/sc-projects/sc-proj-ukb-cvd"
} else {
  "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Input dataset location (all derived from base_path).
dataset_name <- "210714_metabolomics"
# NOTE(review): `path` is not referenced by any visible cell — confirm it is still needed.
path <- "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"
data_path <- glue("{base_path}/data")
dataset_path <- glue("{data_path}/3_datasets_post/{dataset_name}")

# Output locations for this project's results.
project_label <- "21_metabolomics_multitask"
project_path <- glue("{base_path}/results/projects/{project_label}")
figures_path <- glue("{project_path}/figures")
data_results_path <- glue("{project_path}/data")

figure_path <- glue("{figures_path}/Figure1")

## Load data

In [None]:
# Merged dataset, keeping only the rows where NMR_FLAG is TRUE.
data <- glue("{dataset_path}/data_merged.feather") %>%
  arrow::read_feather() %>%
  filter(NMR_FLAG == TRUE)

# Variable description table stored next to the merged dataset.
data_description <- glue("{dataset_path}/description.feather") %>%
  arrow::read_feather()

In [None]:
# Participant ids (eid) to exclude; rows whose eid appears here are filtered out
# of `data` in the following cell.
# NOTE(review): "XXX" looks like a redacted placeholder — confirm the real id
# list is supplied before running.
eids_withdraws = c("XXX")

In [None]:
# Remove excluded participants, then set erectile_dysfunction to FALSE for rows
# with sex == "Female"; every other row keeps its recorded value.
data <- data %>%
  filter(!eid %in% eids_withdraws) %>%
  mutate(
    erectile_dysfunction = case_when(
      sex == "Female" ~ FALSE,
      TRUE ~ erectile_dysfunction
    )
  )
data

In [None]:
# Non-target variables that are not PGS-based; the first entry is dropped.
# NOTE(review): the reason for dropping the first entry is not visible here
# (presumably an id column) — confirm.
covariates <- data_description %>%
  filter(isTarget == FALSE, based_on != "PGS") %>%
  pull(covariate) %>%
  tail(-1)

# Target variables; the first entry is dropped here as well.
targets <- data_description %>%
  filter(isTarget == TRUE) %>%
  pull(covariate) %>%
  tail(-1)

# PGS-based non-target variables, excluding those with dtype "Date".
pgs <- data_description %>%
  filter(isTarget == FALSE, based_on == "PGS", dtype != "Date") %>%
  pull(covariate)

In [None]:
# Convert the categorical columns to factors, then move the listed levels to
# the front of sex, overall_health_rating and smoking_status (in the given order).
data <- data %>%
  mutate(across(
    all_of(c("sex", "overall_health_rating", "smoking_status", "ethnic_background")),
    as.factor
  )) %>%
  mutate(
    sex = fct_relevel(sex, c("Male", "Female")),
    overall_health_rating = fct_relevel(
      overall_health_rating,
      c("Excellent", "Good", "Fair", "Poor")
    ),
    smoking_status = fct_relevel(smoking_status, c("Current", "Previous", "Never"))
  )

In [None]:
# Named feature groups: each element holds the column names of one group.
f <- list(
  basics = c(
    "age_at_recruitment", "sex", "ethnic_background",
    "townsend_deprivation_index_at_recruitment"
  ),
  questionnaire = c("overall_health_rating", "smoking_status"),
  measurements = c(
    "body_mass_index_bmi", "weight", "standing_height",
    "systolic_blood_pressure", "diastolic_blood_pressure"
  ),
  labs = c("cholesterol", "hdl_cholesterol", "ldl_direct", "triglycerides"),
  family_history = c("fh_heart_disease"),
  diagnoses = c(
    "diabetes1", "diabetes2", "chronic_kidney_disease", "atrial_fibrillation",
    "migraine", "rheumatoid_arthritis", "systemic_lupus_erythematosus",
    "severe_mental_illness", "erectile_dysfunction"
  ),
  medications = c("antihypertensives", "ass", "atypical_antipsychotics", "glucocorticoids")
)
# NMR metabolomics feature columns: 168 names, each carrying the "NMR_" prefix.
# The prefix is stripped again in the name-matching cell further down before the
# names are joined to the biomarker metadata.
f$metabolomics = c(
'NMR_3hydroxybutyrate',
 'NMR_acetate',
 'NMR_acetoacetate',
 'NMR_acetone',
 'NMR_alanine',
 'NMR_albumin',
 'NMR_apolipoprotein_a1',
 'NMR_apolipoprotein_b',
 'NMR_average_diameter_for_hdl_particles',
 'NMR_average_diameter_for_ldl_particles',
 'NMR_average_diameter_for_vldl_particles',
 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl',
 'NMR_cholesterol_in_idl',
 'NMR_cholesterol_in_large_hdl',
 'NMR_cholesterol_in_large_ldl',
 'NMR_cholesterol_in_large_vldl',
 'NMR_cholesterol_in_medium_hdl',
 'NMR_cholesterol_in_medium_ldl',
 'NMR_cholesterol_in_medium_vldl',
 'NMR_cholesterol_in_small_hdl',
 'NMR_cholesterol_in_small_ldl',
 'NMR_cholesterol_in_small_vldl',
 'NMR_cholesterol_in_very_large_hdl',
 'NMR_cholesterol_in_very_large_vldl',
 'NMR_cholesterol_in_very_small_vldl',
 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl',
 'NMR_cholesteryl_esters_in_hdl',
 'NMR_cholesteryl_esters_in_idl',
 'NMR_cholesteryl_esters_in_ldl',
 'NMR_cholesteryl_esters_in_large_hdl',
 'NMR_cholesteryl_esters_in_large_ldl',
 'NMR_cholesteryl_esters_in_large_vldl',
 'NMR_cholesteryl_esters_in_medium_hdl',
 'NMR_cholesteryl_esters_in_medium_ldl',
 'NMR_cholesteryl_esters_in_medium_vldl',
 'NMR_cholesteryl_esters_in_small_hdl',
 'NMR_cholesteryl_esters_in_small_ldl',
 'NMR_cholesteryl_esters_in_small_vldl',
 'NMR_cholesteryl_esters_in_vldl',
 'NMR_cholesteryl_esters_in_very_large_hdl',
 'NMR_cholesteryl_esters_in_very_large_vldl',
 'NMR_cholesteryl_esters_in_very_small_vldl',
 'NMR_citrate',
 'NMR_clinical_ldl_cholesterol',
 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles',
 'NMR_concentration_of_hdl_particles',
 'NMR_concentration_of_idl_particles',
 'NMR_concentration_of_ldl_particles',
 'NMR_concentration_of_large_hdl_particles',
 'NMR_concentration_of_large_ldl_particles',
 'NMR_concentration_of_large_vldl_particles',
 'NMR_concentration_of_medium_hdl_particles',
 'NMR_concentration_of_medium_ldl_particles',
 'NMR_concentration_of_medium_vldl_particles',
 'NMR_concentration_of_small_hdl_particles',
 'NMR_concentration_of_small_ldl_particles',
 'NMR_concentration_of_small_vldl_particles',
 'NMR_concentration_of_vldl_particles',
 'NMR_concentration_of_very_large_hdl_particles',
 'NMR_concentration_of_very_large_vldl_particles',
 'NMR_concentration_of_very_small_vldl_particles',
 'NMR_creatinine',
 'NMR_degree_of_unsaturation',
 'NMR_docosahexaenoic_acid',
 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl',
 'NMR_free_cholesterol_in_hdl',
 'NMR_free_cholesterol_in_idl',
 'NMR_free_cholesterol_in_ldl',
 'NMR_free_cholesterol_in_large_hdl',
 'NMR_free_cholesterol_in_large_ldl',
 'NMR_free_cholesterol_in_large_vldl',
 'NMR_free_cholesterol_in_medium_hdl',
 'NMR_free_cholesterol_in_medium_ldl',
 'NMR_free_cholesterol_in_medium_vldl',
 'NMR_free_cholesterol_in_small_hdl',
 'NMR_free_cholesterol_in_small_ldl',
 'NMR_free_cholesterol_in_small_vldl',
 'NMR_free_cholesterol_in_vldl',
 'NMR_free_cholesterol_in_very_large_hdl',
 'NMR_free_cholesterol_in_very_large_vldl',
 'NMR_free_cholesterol_in_very_small_vldl',
 'NMR_glucose',
 'NMR_glutamine',
 'NMR_glycine',
 'NMR_glycoprotein_acetyls',
 'NMR_hdl_cholesterol',
 'NMR_histidine',
 'NMR_isoleucine',
 'NMR_ldl_cholesterol',
 'NMR_lactate',
 'NMR_leucine',
 'NMR_linoleic_acid',
 'NMR_monounsaturated_fatty_acids',
 'NMR_omega3_fatty_acids',
 'NMR_omega6_fatty_acids',
 'NMR_phenylalanine',
 'NMR_phosphatidylcholines',
 'NMR_phosphoglycerides',
 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl',
 'NMR_phospholipids_in_hdl',
 'NMR_phospholipids_in_idl',
 'NMR_phospholipids_in_ldl',
 'NMR_phospholipids_in_large_hdl',
 'NMR_phospholipids_in_large_ldl',
 'NMR_phospholipids_in_large_vldl',
 'NMR_phospholipids_in_medium_hdl',
 'NMR_phospholipids_in_medium_ldl',
 'NMR_phospholipids_in_medium_vldl',
 'NMR_phospholipids_in_small_hdl',
 'NMR_phospholipids_in_small_ldl',
 'NMR_phospholipids_in_small_vldl',
 'NMR_phospholipids_in_vldl',
 'NMR_phospholipids_in_very_large_hdl',
 'NMR_phospholipids_in_very_large_vldl',
 'NMR_phospholipids_in_very_small_vldl',
 'NMR_polyunsaturated_fatty_acids',
 'NMR_pyruvate',
 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol',
 'NMR_saturated_fatty_acids',
 'NMR_sphingomyelins',
 'NMR_total_cholesterol',
 'NMR_total_cholesterol_minus_hdlc',
 'NMR_total_cholines',
 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine',
 'NMR_total_concentration_of_lipoprotein_particles',
 'NMR_total_esterified_cholesterol',
 'NMR_total_fatty_acids',
 'NMR_total_free_cholesterol',
 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl',
 'NMR_total_lipids_in_hdl',
 'NMR_total_lipids_in_idl',
 'NMR_total_lipids_in_ldl',
 'NMR_total_lipids_in_large_hdl',
 'NMR_total_lipids_in_large_ldl',
 'NMR_total_lipids_in_large_vldl',
 'NMR_total_lipids_in_lipoprotein_particles',
 'NMR_total_lipids_in_medium_hdl',
 'NMR_total_lipids_in_medium_ldl',
 'NMR_total_lipids_in_medium_vldl',
 'NMR_total_lipids_in_small_hdl',
 'NMR_total_lipids_in_small_ldl',
 'NMR_total_lipids_in_small_vldl',
 'NMR_total_lipids_in_vldl',
 'NMR_total_lipids_in_very_large_hdl',
 'NMR_total_lipids_in_very_large_vldl',
 'NMR_total_lipids_in_very_small_vldl',
 'NMR_total_phospholipids_in_lipoprotein_particles',
 'NMR_total_triglycerides',
 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl',
 'NMR_triglycerides_in_hdl',
 'NMR_triglycerides_in_idl',
 'NMR_triglycerides_in_ldl',
 'NMR_triglycerides_in_large_hdl',
 'NMR_triglycerides_in_large_ldl',
 'NMR_triglycerides_in_large_vldl',
 'NMR_triglycerides_in_medium_hdl',
 'NMR_triglycerides_in_medium_ldl',
 'NMR_triglycerides_in_medium_vldl',
 'NMR_triglycerides_in_small_hdl',
 'NMR_triglycerides_in_small_ldl',
 'NMR_triglycerides_in_small_vldl',
 'NMR_triglycerides_in_vldl',
 'NMR_triglycerides_in_very_large_hdl',
 'NMR_triglycerides_in_very_large_vldl',
 'NMR_triglycerides_in_very_small_vldl',
 'NMR_tyrosine',
 'NMR_vldl_cholesterol',
 'NMR_valine')

In [None]:
library(ggforestplot)
# Biomarker metadata with a join key derived from the description:
# lower-cased, with spaces replaced by underscores.
ng_names <- df_NG_biomarker_metadata %>%
  mutate(
    metabolite = description %>%
      tolower() %>%
      str_replace_all(" ", "_")
  )
# Show ten random rows as a quick sanity check of the derived key.
ng_names %>% sample_n(10)

In [None]:
library(fuzzyjoin)
# Fuzzy-match the rows of `candidates` that still lack a metadata `name`
# against ng_names, allowing a string distance of up to `max_dist` on the
# metabolite key. The left-hand spelling of the key is kept.
rematch_unresolved <- function(candidates, max_dist) {
  candidates %>%
    filter(is.na(name)) %>%
    select(metabolite) %>%
    stringdist_left_join(ng_names, by = "metabolite", max_dist = max_dist) %>%
    rename(metabolite = metabolite.x) %>%
    select(-metabolite.y) %>%
    distinct()
}

# Pass 1: exact join on the feature name with the "NMR_" prefix removed.
mets1 <- tibble(metabolite = f$metabolomics) %>%
  mutate(metabolite = str_remove_all(metabolite, "NMR_")) %>%
  distinct() %>%
  left_join(ng_names, by = "metabolite")
# Pass 2: near-exact match (distance <= 1) for names pass 1 left unmatched.
mets2 <- rematch_unresolved(mets1, max_dist = 1)
# Pass 3: loose match (distance <= 8) for names pass 2 left unmatched.
# NOTE(review): a distance of 8 may pair one name with several metadata rows,
# and rows still unmatched are kept with NA metadata — confirm this is intended.
mets3 <- rematch_unresolved(mets2, max_dist = 8)

# Matched rows of passes 1 and 2 plus everything from pass 3, sorted by
# group, subgroup and description.
mets <- bind_rows(
  filter(mets1, !is.na(name)),
  filter(mets2, !is.na(name)),
  mets3
) %>%
  arrange(group, subgroup, description)
mets %>% head()

# Prefixed column names, named by their metadata description.
metabolites <- paste0("NMR_", mets$metabolite)
names(metabolites) <- mets$description

In [None]:
# Column names selected into table_data (the input of the Table 1 summary below).
# NOTE(review): the original header said "38 clinical predictors", but 39 names
# are listed (ethnic_background is commented out) — confirm the intended count.
PANEL = c( # clinical predictors (see count note above)
    # basics
    "age_at_recruitment", 
    "sex", 
   # "ethnic_background", # added
    "education_years",
    'smoking_status', # current smoker
    'alcohol_intake_frequency', # 'Daily or almost daily'
    "daily_physical_activity",

    "daily_healthy_food",
    
    # family history
    "fh_diabetes",
    
    # diagnoses
    "diabetes2",
        
    # physical
    "weight", 
    "standing_height", 
    "body_mass_index_bmi", 
    'waist_hip_ratio',
    "waist_circumference",
    "systolic_blood_pressure", 
    
    # lipids
    "cholesterol", 
    "ldl_direct", 
    "hdl_cholesterol",
    "triglycerides",
    
    # diabetes
    'glucose',
    'glycated_haemoglobin_hba1c',
    
    # kidney
    'creatinine',
    'cystatin_c',
    'urea',
    'urate',
        
    # liver
    'aspartate_aminotransferase',
    'alanine_aminotransferase',
    'alkaline_phosphatase',
    'albumin',
    
    # inflammation
    'creactive_protein',
    
    # Blood counts
    'red_blood_cell_erythrocyte_count',
    'white_blood_cell_leukocyte_count',
    'platelet_count',
    'haemoglobin_concentration',
    'haematocrit_percentage',
        'mean_corpuscular_volume',
    'mean_corpuscular_haemoglobin',
    'mean_corpuscular_haemoglobin_concentration',
    
    # medications
    'antihypertensives'
        )

In [None]:
# Additional per-participant columns stored with the project results.
extra_data <- arrow::read_feather(
  glue("{data_results_path}/extra_data_211015.feather")
)

# Attach the extra columns by eid and keep only the PANEL variables.
table_data <- data %>%
  left_join(extra_data, by = "eid") %>%
  select(all_of(PANEL))

In [None]:
library(gtsummary)

In [None]:
# Table 1: summary of the PANEL variables stratified by sex, with an overall
# column appended last and bold variable labels. Missing-value rows are hidden.
table1 <- table_data %>%
  mutate(
    # Convert via character so factor/character codes become their numeric value.
    education_years = as.numeric(as.character(education_years)),
    # Recode to 0/1 indicators: current smoker, daily alcohol intake.
    # NOTE(review): the `TRUE ~ 0` fallback also maps missing values to 0 —
    # confirm that missing answers should count as "no".
    smoking_status = case_when(smoking_status == "Current" ~ 1, TRUE ~ 0),
    alcohol_intake_frequency = case_when(
      alcohol_intake_frequency == "Daily or almost daily" ~ 1,
      TRUE ~ 0
    )
  ) %>%
  tbl_summary(
    by = "sex",
    # Summarise education_years as continuous explicitly instead of adding
    # random jitter to the data, so the table is reproducible.
    type = list(education_years ~ "continuous"),
    label = list(
      age_at_recruitment ~ "Age at Recruitment",
      # ethnic_background ~ "Ethnicity",
      smoking_status ~ "Current Smoker",
      alcohol_intake_frequency ~ "Daily Alcohol Intake",
      daily_physical_activity ~ "Daily Moderate to Vigorous Physical Activity",
      education_years ~ "Education years",
      daily_healthy_food ~ "Daily Healthy Food",
      fh_diabetes ~ "Family History Diabetes",
      diabetes2 ~ "Type 2 Diabetes",
      body_mass_index_bmi ~ "BMI",
      waist_hip_ratio ~ "Waist-Hip-Ratio",
      waist_circumference ~ "Waist Circumference",
      weight ~ "Weight (kg)",
      standing_height ~ "Standing Height (cm)",
      systolic_blood_pressure ~ "Systolic Blood Pressure (mmHg)",
      cholesterol ~ "Total Cholesterol (mmol/L)",
      hdl_cholesterol ~ "HDL Cholesterol (mmol/L)",
      ldl_direct ~ "LDL Cholesterol (mmol/L)",
      triglycerides ~ "Triglycerides (mmol/L)",
      glucose ~ "Glucose (mmol/L)",
      glycated_haemoglobin_hba1c ~ "Glycated Hemoglobin (%)",
      creatinine ~ "Creatinine (umol/L)",
      cystatin_c ~ "Cystatin C (mg/L)",
      urea ~ "Urea (mmol/L)",
      urate ~ "Urate (umol/L)",
      aspartate_aminotransferase ~ "Aspartate Aminotransferase (U/L)",
      alanine_aminotransferase ~ "Alanine Aminotransferase (U/L)",
      alkaline_phosphatase ~ "Alkaline Phosphatase (U/L)",
      albumin ~ "Albumin (g/L)",
      creactive_protein ~ "C-Reactive Protein (mg/L)",
      red_blood_cell_erythrocyte_count ~ "Erythrocytes (10^12 cells/L)",
      white_blood_cell_leukocyte_count ~ "Leucocytes (10^9 cells/L)",
      platelet_count ~ "Platelets (10^9 cells/L)",
      haemoglobin_concentration ~ "Haemoglobin (g/dL)",
      haematocrit_percentage ~ "Haematocrit (%)",
      mean_corpuscular_volume ~ "Mean Corpuscular Volume",
      # The two labels below were misspelled ("Haemaglobin"), and the
      # concentration row lacked "Concentration", making it look like a
      # duplicate of the row above.
      mean_corpuscular_haemoglobin ~ "Mean Corpuscular Haemoglobin (pg)",
      mean_corpuscular_haemoglobin_concentration ~
        "Mean Corpuscular Haemoglobin Concentration (g/dL)",
      antihypertensives ~ "Antihypertensives"
    ),
    missing = "no"
  ) %>%
  add_overall(last = TRUE) %>%
  bold_labels()

In [None]:
library(gt)
# Render Table 1 as a gt table with a title and save it as HTML.
# The file name is relative, so it is written relative to the working directory.
plot_name <- "Table1"
table1 %>%
  as_gt() %>%
  tab_header(title = "Table 1") %>%
  gt::gtsave(glue("{plot_name}.html"))

# Observation time

In [None]:
# Plot dimensions handed to the notebook's repr.plot.* options.
plot_width <- 10
plot_height <- 3
plot_dpi <- 300
options(repr.plot.width = plot_width, repr.plot.height = plot_height)

# Global ggplot theme: classic base, no facet strip background, left-aligned
# title, legend at the bottom, thin axis lines and ticks.
# NOTE(review): base_size and facet_size are not defined in the visible cells —
# confirm they are set before this cell runs.
theme_set(
  theme_classic(base_size = base_size) +
    theme(
      strip.background = element_blank(),
      plot.title = element_text(size = 25, hjust = 0),
      strip.text.x = element_text(size = facet_size),
      legend.position = "bottom",
      axis.line = element_line(size = 0.2),
      axis.ticks = element_line(size = 0.2)
    )
)


In [None]:
# Density of MACE_event_time among participants without a MACE event
# (MACE_event == 0), with the median marked and annotated.
temp <- data %>%
  filter(MACE_event == 0) %>%
  select(c(eid, MACE_event_time))

# The dashed line and the label both use the median. Previously the line sat at
# the arithmetic mean while the label reported the median, and the value was
# stored in a variable called `mean`, shadowing base::mean.
median_obs_time <- median(temp$MACE_event_time, na.rm = TRUE)
median_label <- round(median_obs_time, 1)

# NOTE(review): geom_text_size is not defined in the visible cells — confirm it
# is set before this cell runs.
obs_time <- ggplot(temp, aes(x = MACE_event_time)) +
  ggtitle("Observation Time") +
  geom_density(fill = "gray70", alpha = 0.5) +
  labs(x = "Years", y = "Density") +
  geom_vline(
    xintercept = median_obs_time,
    color = "black", linetype = "dashed", size = 1
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_cartesian(xlim = c(0, 15)) +
  # The label is placed 5 units to the left of the median line.
  annotate(
    "text",
    x = median_label - 5, y = 0.2,
    label = paste0("Median: ", median_label, " years"),
    size = geom_text_size
  )
obs_time

# Aggregated person years

In [None]:
# Long table of follow-up time: one row per participant and endpoint.
# NOTE: endpoint_map is defined in the "Endpoint frequencies" cell further down;
# that cell has to be run before this one.
py_data <- data %>%
  select("eid", all_of(paste0(names(endpoint_map), "_event_time"))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "person_years") %>%
  # Strip the whole "_event_time" suffix so `endpoint` holds the plain endpoint
  # key; removing only "_event" left a dangling "_time" (e.g. "M_MACE_time").
  mutate(endpoint = str_remove_all(endpoint, "_event_time$"))

In [None]:
# Total person-years: keep each participant's largest person_years value
# (first row per eid after sorting descending), then sum across participants.
py_data %>%
  arrange(desc(person_years)) %>%
  group_by(eid) %>%
  slice(1) %>%
  ungroup() %>%
  summarise(sum_years = sum(person_years))

# Endpoint frequencies

In [None]:
library(ggthemes)
# Endpoint key -> display label. The keys are used as column names of `data`
# (plain, and with "_event" / "_event_time" suffixes) in the surrounding cells.
# NOTE(review): "M_chronic_obstructuve_pulmonary_disease" is misspelled, but it
# presumably matches the column name in the dataset — do not correct it here
# without renaming the column.
endpoint_map = c(
    'M_MACE'='MACE',
    'M_all_cause_dementia'='Dementia',
    'M_type_2_diabetes'='T2 Diabetes',
    'M_liver_disease'='Liver Disease',
    'M_renal_disease'='Renal Disease',
    'M_atrial_fibrillation'='Atrial Fibrillation',
    'M_heart_failure'= 'Heart Failure',
    'M_coronary_heart_disease'='CHD',
    'M_venous_thrombosis'='Ven. Thrombosis',
    'M_cerebral_stroke'='Cerebral Stroke',
    'M_abdominal_aortic_aneurysm'='AAA',
    'M_peripheral_arterial_disease'='PAD',
    "M_chronic_obstructuve_pulmonary_disease" = "COPD",
    "M_asthma" = "Asthma",
    'M_parkinsons_disease' = "Parkinson's",    
    "M_lung_cancer" = "Lung Cancer",
    "M_non_melanoma_skin_cancer" = "Skin Cancer",
    "M_colon_cancer"= "Colon Cancer",
    "M_rectal_cancer" = "Rectal Cancer",
    "M_prostate_cancer"= "Prostate Cancer",
    "M_breast_cancer" = "Breast Cancer",
    'M_cataracts' = "Cataracts", 
    'M_glaucoma' = "Glaucoma",
    'M_fractures' = "Fractures"
)

# Display order of the endpoint keys; used as factor levels for the endpoint
# frequency table in the next cell.
endpoint_order = c("M_MACE", "M_coronary_heart_disease", "M_cerebral_stroke", "M_all_cause_dementia", "M_heart_failure", "M_atrial_fibrillation",
                   "M_type_2_diabetes", "M_liver_disease", "M_renal_disease", "M_peripheral_arterial_disease", "M_venous_thrombosis",  "M_abdominal_aortic_aneurysm",
                   "M_chronic_obstructuve_pulmonary_disease", "M_asthma", 'M_parkinsons_disease', 'M_cataracts', 'M_glaucoma', 'M_fractures',
                    "M_lung_cancer","M_non_melanoma_skin_cancer","M_colon_cancer","M_rectal_cancer","M_prostate_cancer","M_breast_cancer"
                   
)

In [None]:
# Mean of every "M_*_event" column (excluding names containing "comp"), one row
# per endpoint, sorted ascending, with endpoint keys as an ordered factor.
# NOTE(review): the "-time" pattern uses a hyphen and so likely matches nothing;
# ends_with("_event") already excludes the "_event_time" columns.
# NOTE(review): `endpoints` is not defined in the visible cells — confirm it
# exists (names(endpoint_map) or endpoint_order may have been intended).
temp_endpoints <- data %>%
  select(starts_with("M_")) %>%
  select(ends_with("_event"), -contains("-time"), -contains("comp")) %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(frequency = mean(value)) %>%
  arrange(frequency) %>%
  ungroup() %>%
  as.data.frame() %>%
  mutate(name = str_remove_all(name, "_event")) %>%
  filter(name %in% endpoints) %>%
  mutate(name = factor(name, levels = endpoint_order))

In [None]:
# Incident status: the "<endpoint>_event" columns in long form, with the
# "_event" suffix removed so the keys match those of prev_data.
inc_data <- data %>%
  select("eid", all_of(paste0(names(endpoint_map), "_event"))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "Incident") %>%
  mutate(endpoint = str_remove_all(endpoint, "_event"))

# Prevalent status: the plain "<endpoint>" columns in long form, as integers.
prev_data <- data %>%
  select(eid, all_of(names(endpoint_map))) %>%
  pivot_longer(-eid, names_to = "endpoint", values_to = "Prevalent") %>%
  mutate(Prevalent = as.integer(Prevalent))

# One row per participant and endpoint with both flags; endpoint keys are
# replaced by their display labels from endpoint_map.
agg_data <- prev_data %>%
  left_join(inc_data, by = c("eid", "endpoint")) %>%
  mutate(endpoint = recode(endpoint, !!!endpoint_map))
agg_data %>% head()

In [None]:
# Incidence per endpoint among rows with Prevalent == 0: share of incident
# events, sorted ascending, plus the same value as a percentage (2 decimals).
agg_data %>%
  filter(Prevalent == 0) %>%
  group_by(endpoint) %>%
  summarise(frequency = sum(Incident) / n()) %>%
  arrange(frequency) %>%
  mutate(f = round(frequency * 100, 2))
# %>% mutate(sum=n()) %>% group_by(frequency, sum) %>% tally() %>% mutate(freq=n*100/sum)