# 2. Exploration

In [None]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(glue)
library(cowplot)
library(survminer)
library(survival)
library(ggsci)
library(arsenal)
library(yaml)

#setwd("/")
#path = "/home/steinfej/projects/uk_biobank/"
#dataset_path = "data/datasets/cvd_massive_excl_emb_ind"

In [None]:
# Choose the storage root from the host name: hosts whose node name contains
# the literal substring "sc" use the sc-projects share, all others the
# ag-reils share.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
    "/sc-projects/sc-proj-ukb-cvd"
} else {
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project directories, all derived from the storage root.
project_label <- "22_medical_records"
project_path <- glue("{base_path}/results/projects/{project_label}")
figure_path <- glue("{project_path}/figures")
output_path <- glue("{project_path}/data")

# Experiment identifier and the directory holding its results.
experiment <- 220531
experiment_path <- glue("{output_path}/{experiment}")

## Load data

In [None]:
# Endpoint (phecode) definitions, sorted by endpoint identifier.
endpoint_defs <- glue("{output_path}/phecode_defs_220306.feather") %>%
    arrow::read_feather() %>%
    arrange(endpoint)

In [None]:
# Baseline outcome table, read as a regular data frame.
# The original call passed `as_data_frame=FALSE` to glue() instead of
# read_feather(); glue() only treats named arguments as interpolation
# variables, so it had no effect. It is dropped rather than moved because
# later cells use `data_outcomes$eid` and dplyr verbs on this object, which
# rely on the data-frame result.
data_outcomes <- arrow::read_feather(
    glue("{output_path}/baseline_outcomes_220531.feather")
)

In [None]:
# Directory of the post-processed dataset (absolute path).
# NOTE(review): this path does not go through `base_path`, so it is identical
# on every host — confirm that is intended.
dataset_path <- "/sc-projects/sc-proj-ukb-cvd/data/3_datasets_post/210616_centres_dask"

In [None]:
# Merged covariate table, restricted to participants present in the outcome
# table and stripped of every column ending in "_event" or "_event_time".
data <- glue("{dataset_path}/data_merged.feather") %>%
    arrow::read_feather() %>%
    filter(eid %in% unique(data_outcomes$eid)) %>%
    select(!(ends_with("_event") | ends_with("_event_time")))

# Column description table shipped alongside the merged dataset.
data_description <- arrow::read_feather(glue("{dataset_path}/description.feather"))

In [None]:
# Force `erectile_dysfunction` to FALSE for rows with sex == "Female"; every
# other row (including rows with missing sex) keeps its existing value.
data <- mutate(
    data,
    erectile_dysfunction = case_when(
        sex == "Female" ~ FALSE,
        TRUE ~ erectile_dysfunction
    )
)

In [None]:
# Convert the categorical covariates to factors, then move the listed levels
# to the front so they appear in this order in tables and plots.
# `across(all_of(...))` replaces the superseded `mutate_at()`; the set of
# converted columns and the conversion function are unchanged.
data <- data %>%
    mutate(across(
        all_of(c("sex", "overall_health_rating", "smoking_status", "ethnic_background")),
        as.factor
    )) %>%
    mutate(
        sex = fct_relevel(sex, c("Male", "Female")),
        overall_health_rating = fct_relevel(
            overall_health_rating,
            c("Excellent", "Good", "Fair", "Poor")
        ),
        smoking_status = fct_relevel(smoking_status, c("Current", "Previous", "Never"))
    )

In [None]:
# Covariate names grouped by category; each element is a character vector of
# column names.
f <- list(
    basics = c(
        "age_at_recruitment", "sex", "ethnic_background",
        "townsend_deprivation_index_at_recruitment"
    ),
    questionnaire = c("overall_health_rating", "smoking_status"),
    measurements = c(
        "body_mass_index_bmi", "weight", "standing_height",
        "systolic_blood_pressure", "diastolic_blood_pressure"
    ),
    labs = c("cholesterol", "hdl_cholesterol", "ldl_direct", "triglycerides"),
    family_history = "fh_heart_disease",
    diagnoses = c(
        "diabetes1", "diabetes2", "chronic_kidney_disease",
        "atrial_fibrillation", "migraine", "rheumatoid_arthritis",
        "systemic_lupus_erythematosus", "severe_mental_illness",
        "erectile_dysfunction"
    ),
    medications = c(
        "antihypertensives", "ass", "atypical_antipsychotics", "glucocorticoids"
    )
)

In [None]:
# Predictor columns used for the baseline table. Ten entries are currently
# active; candidates that are switched off are kept as comments under the
# group they belong to so they can be re-enabled easily.
PANEL <- c(
    # basics
    "age_at_recruitment",
    "sex",
    "ethnic_background",  # added
    "smoking_status",     # current smoker
    # off: "education_years",
    #      "alcohol_intake_frequency" ('Daily or almost daily'),
    #      "daily_physical_activity", "daily_healthy_food"

    # family history
    # off: "fh_diabetes"

    # diagnoses
    # off: "diabetes2"

    # physical
    "weight",
    "standing_height",
    "body_mass_index_bmi",
    "systolic_blood_pressure",
    # off: "waist_hip_ratio", "waist_circumference"

    # lipids
    "ldl_direct",
    # off: "cholesterol", "hdl_cholesterol", "triglycerides"

    # diabetes
    "glucose"
    # off: "glycated_haemoglobin_hba1c"

    # kidney
    # off: "creatinine", "cystatin_c", "urea", "urate"

    # liver
    # off: "aspartate_aminotransferase", "alanine_aminotransferase",
    #      "alkaline_phosphatase", "albumin"

    # inflammation
    # off: "creactive_protein"

    # blood counts
    # off: "red_blood_cell_erythrocyte_count",
    #      "white_blood_cell_leukocyte_count", "platelet_count",
    #      "haemoglobin_concentration", "haematocrit_percentage",
    #      "mean_corpuscular_volume", "mean_corpuscular_haemoglobin",
    #      "mean_corpuscular_haemoglobin_concentration"

    # medications
    # off: "antihypertensives"
)

# Baseline Characteristics - Table 1

In [None]:
# Keep only the PANEL columns; all_of() errors if any of them is missing.
table_data <- select(data, all_of(PANEL))

In [None]:
# Display the selected columns as the cell output for a quick visual check.
table_data

In [None]:
library(gtsummary)

# Baseline characteristics table: one column per sex plus a trailing
# "Overall" column, with bold variable labels and no missing-value rows.
# NOTE(review): the unit parentheses in the LDL and glucose labels are empty
# placeholders — fill them in once the units are confirmed.
table1 <- table_data %>%
    # Collapse smoking status to a 0/1 "current smoker" indicator. Every value
    # other than "Current" — including a missing status — becomes 0.
    # (A matching indicator for alcohol_intake_frequency ==
    # "Daily or almost daily" is currently disabled.)
    mutate(smoking_status = as.numeric(smoking_status %in% "Current")) %>%
    tbl_summary(
        by = "sex",
        label = list(
            age_at_recruitment ~ "Age at Recruitment",
            ethnic_background ~ "Ethnicity",
            smoking_status ~ "Current Smoker",
            # alcohol_intake_frequency ~ "Daily Alcohol Intake",
            body_mass_index_bmi ~ "BMI",
            weight ~ "Weight (kg)",
            standing_height ~ "Standing Height (cm)",
            systolic_blood_pressure ~ "Systolic Blood Pressure (mmHg)",
            ldl_direct ~ "LDL Cholesterol ()",
            glucose ~ "Glucose ()"
        ),
        missing = "no"
    ) %>%
    add_overall(last = TRUE) %>%
    bold_labels()

In [None]:
library(gt)

# Convert the gtsummary table to a gt table and write it as HTML into the
# relative "outputs" directory.
plot_name <- "Table1"
gt::gtsave(as_gt(table1), glue("outputs/{plot_name}.html"))

# Disabled alternative: add a title and save under `figure_path` instead.
# table1 %>% as_gt() %>% tab_header(title = "Table 1") %>%
#     # opt_align_table_header(align = "left") %>%
#     gt::gtsave(glue("{figure_path}/{plot_name}.html"))

# Cumulative Person Years

In [None]:
# Cumulative follow-up: for endpoint "OMOP_4306655", keep one row per
# participant (the one with the largest `time`) and sum `time` over all
# participants.
# NOTE(review): `time` is presumably measured in years, as the output column
# name suggests — confirm against the outcome table.
# Stray pasted text ("Nature Biomedical Engineering") after the assignment
# made the original line a syntax error; it has been removed.
py_data <- data_outcomes %>%
    filter(endpoint == "OMOP_4306655") %>%
    group_by(eid) %>%
    arrange(desc(time)) %>%  # sorts the whole table; groups are kept
    slice(1) %>%             # first row per eid = largest time for that eid
    ungroup() %>%
    summarise(sum_years = sum(time))

In [None]:
# Display the one-row summary holding the summed follow-up time.
py_data 

# Endpoint Characteristics

In [None]:
library(data.table)

# Endpoint metadata for the current experiment. `phecode` is read as
# character so the codes are not parsed as numbers.
endpoints_md <- fread(
    glue("{experiment_path}/endpoints.csv"),
    colClasses = c(phecode = "character")
)

# Sorted endpoint identifiers.
endpoints <- sort(endpoints_md$endpoint)

In [None]:
# Hand-picked endpoints: 23 phecode endpoints (built from their code
# suffixes) followed by one OMOP endpoint.
endpoint_selection <- c(
    paste0(
        "phecode_",
        c(
            "008", "092-2", "105", "107-2", "164", "202-2", "284", "292",
            "324-11", "328", "371", "401", "404", "424", "440-11", "468",
            "474", "522-1", "542-1", "581-1", "583", "665", "705-1"
        )
    ),
    "OMOP_4306655"
)

In [None]:
# Read the phecode definitions afresh (sorted by endpoint) so the renaming
# in the next cell starts from the unmodified table.
endpoint_defs <- glue("{output_path}/phecode_defs_220306.feather") %>%
    arrow::read_feather() %>%
    arrange(endpoint)

In [None]:
# Add a display name per endpoint: a short hand-written label for the
# endpoints listed below, the original `phecode_string` for all others.
endpoint_defs <- mutate(
    endpoint_defs,
    name = case_when(
        endpoint == "phecode_008" ~ "H. pylori",
        endpoint == "phecode_092-2" ~ "Sepsis",
        endpoint == "phecode_105" ~ "Breast cancer",
        endpoint == "phecode_107-2" ~ "Prostate cancer",
        endpoint == "phecode_123" ~ "Malignant plasma cell neoplasms",
        endpoint == "phecode_164" ~ "Anemia",
        endpoint == "phecode_200-1" ~ "Hypothyroidism",
        endpoint == "phecode_232" ~ "Vitamin deficiencies",
        endpoint == "phecode_284" ~ "Suicide attempt or self harm",
        # endpoint == "phecode_287-5" ~ "Drug-induced psychosis",
        endpoint == "phecode_324-11" ~ "Parkinson's",
        endpoint == "phecode_328" ~ "Dementia",
        # endpoint == "phecode_404" ~ "Coronary heart disease",
        endpoint == "phecode_424" ~ "Heart failure",
        endpoint == "phecode_440-11" ~ "Deep vein thrombosis",
        endpoint == "phecode_468" ~ "Pneumonia",
        endpoint == "phecode_474" ~ "COPD",
        endpoint == "phecode_518" ~ "Appendicitis",
        endpoint == "phecode_542-1" ~ "Fibrosis and cirrhosis of liver",
        endpoint == "phecode_583" ~ "Chronic kidney disease",
        endpoint == "phecode_705-1" ~ "Rheumatoid arthritis",
        endpoint == "phecode_908-1" ~ "(Pre)eclampsia",
        # endpoint == "phecode_976" ~ "Complication of anesthesia",
        TRUE ~ phecode_string
    )
)

# Named vector: endpoint identifier -> display name.
endpoint_map <- setNames(endpoint_defs$name, endpoint_defs$endpoint)

# Endpoint identifiers ordered by the numeric value of their phecode.
endpoint_order <- endpoint_defs %>%
    arrange(as.numeric(phecode)) %>%
    pull(endpoint)

In [None]:
# One row per phecode, with all ICD-10 codes that map to it joined into a
# single comma-separated string. `phecode` is read as character.
icd_to_phecode_map <-
    "/sc-projects/sc-proj-ukb-cvd/data/mapping/phecodes/ICD10_to_phecode_V2.csv" %>%
    fread(colClasses = c(phecode = "character")) %>%
    group_by(phecode) %>%
    summarise(ICD = str_c(icd10, collapse = ", "))

In [None]:
# Per-endpoint summary (code, description, eligible count, n, frequency)
# with the ICD-10 codes for each phecode attached.
# The join key is now explicit. `phecode` is the only column the two tables
# share after the select(), so the joined rows are the same as with the
# implicit natural join; only the "Joining, by = ..." message disappears.
endpoint_summary <- endpoints_md %>%
    # filter(endpoint %in% endpoint_selection) %>%
    select(phecode, phecode_string, eligable, n, freq) %>%
    left_join(icd_to_phecode_map, by = "phecode")

In [None]:
# Write the endpoint summary as CSV into the relative "outputs" directory.
write_csv(endpoint_summary, "outputs/endpoints.csv")