# 4. Scores

In [None]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(data.table)
library(glue)
library(jsonlite)

In [None]:
# Choose the project root from the host name: nodes whose name contains "sc"
# use the /sc-projects mount, every other machine uses the shared cardioRS tree.
base_path = if (grepl("sc", Sys.info()[["nodename"]], fixed=TRUE)) {
    "/sc-projects/sc-proj-ukb-cvd"
} else {
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Identifiers of the dataset snapshot and of the project being analysed.
dataset_name = "210714_metabolomics"
project_label="21_metabolomics_multitask"

# Fixed code location; not derived from base_path.
path = "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"

# Input side: <base_path>/data/3_datasets_post/<dataset_name>
data_path = glue("{base_path}/data")
dataset_path = glue("{data_path}/3_datasets_post/{dataset_name}")

# Output side: <base_path>/results/projects/<project_label>/{figures,data}
project_path = glue("{base_path}/results/projects/{project_label}")
figures_path = glue("{project_path}/figures")
data_results_path = glue("{project_path}/data")

In [None]:
# Only the description table is loaded here; reading the merged data set stays
# disabled.
#data = arrow::read_feather(glue("{dataset_path}/data_merged.feather"))
description = glue("{dataset_path}/description.feather") %>% arrow::read_feather()

In [None]:
# filter() without any predicate keeps every row, so this simply displays the
# whole description table.
filter(description)

In [None]:
# Paths of the imputed test split for partitions 0 through 21.
# glue() is vectorised over `0:21`, so all 22 paths are built in a single call
# instead of growing `files` one element per loop iteration. as.character()
# drops the glue class so `files` is a plain character vector.
files = as.character(glue("{dataset_path}/partition_{0:21}/test/data_imputed.feather"))

In [None]:
# Read every partition file and stack the resulting tables row-wise.
data_master = bind_rows(map(files, arrow::read_feather))

In [None]:
# Sanity check on participant ids: `sum` is the total number of rows and `max`
# the largest number of rows sharing a single eid.
data_master %>%
    count(eid) %>%
    summarise(sum = sum(n), max = max(n))

In [None]:
# Work on a copy so that data_master itself is left as loaded.
data = data.table::copy(data_master)

## Additional data

In [None]:
# Field table stored next to the decoded data (snapshot 210517).
fields = glue("{base_path}/data/1_decoded/ukb_data_field_210517.feather") %>%
    arrow::read_feather()

In [None]:
# Decoded data table (snapshot 210517).
decoded = glue("{base_path}/data/1_decoded/ukb_data_210517.feather") %>%
    arrow::read_feather()

In [None]:
# Show the fields whose column name contains both "school" and "_0_0".
fields %>%
    filter(
        str_detect(col.name, "school"),
        str_detect(col.name, "_0_0")
    )

In [None]:
# Stream the application link file from the remote host over ssh.
# The shell command is handed to fread through its dedicated `cmd` argument, so
# it is always executed as a command rather than being inferred from the shape
# of a positional `input` string.
eid_link = fread(cmd = "ssh steinfej@172.16.120.129 cat /data/analysis/uk_bb/UKBB.application.link.file.20210608.txt")

In [None]:
# Peek at the first rows of the link table.
eid_link %>% head()

In [None]:
# Sanity check on EID.44448: `sum` is the total number of rows and `max` the
# largest number of rows sharing a single id.
eid_link %>%
    count(EID.44448) %>%
    summarise(sum = sum(n), max = max(n))

In [None]:
# Number of distinct link rows per value of sex.49966.
eid_link %>%
    distinct() %>%
    group_by(sex.49966) %>%
    tally()

### APOE4 - CAIDE (Dementia)

In [None]:
# Stream the APOE genotype csv from the remote host over ssh.
# The shell command is handed to fread through its dedicated `cmd` argument, so
# it is always executed as a command rather than being inferred from the shape
# of a positional `input` string.
apoe4 = fread(cmd = "ssh steinfej@172.16.120.129 cat /data/analysis/uk_bb/intergenics/development/hollmann/data/apoe4.csv")

In [None]:
# Map the genotype rows onto the participant ids used in `data`:
#   apoe4$index -> eid_link$EID.49966 -> eid_link$EID.51157 -> data$eid
# NOTE(review): the filter used to read `sex == sex`, a self-comparison that is
# TRUE for every non-missing `sex` and therefore only drops rows whose `sex` is
# missing after the join (for example ids with no match in `data`). That effect
# is spelled out as `!is.na(sex)`. If a cross-check against `sex.49966` was
# intended, confirm both columns share an encoding before adding it.
apoe4_clean = apoe4 %>%
    distinct() %>%
    left_join(
        eid_link %>% select(EID.49966, sex.49966, EID.51157),
        by=c("index"="EID.49966")
    ) %>%
    left_join(data %>% select(eid, sex), by=c("EID.51157"="eid")) %>%
    filter(!is.na(sex)) %>%
    rename("eid"="EID.51157") %>%
    select(eid, rs429358, rs7412)
head(apoe4_clean)

In [None]:
# Attach the two genotype columns to the full data set; participants without a
# row in apoe4_clean keep missing genotypes.
data = left_join(data_master, apoe4_clean, by="eid")

In [None]:
# The imputed genotypes are aligned to the + strand of the reference and the positions are in GRCh37 coordinates
# rs7412 C>T
# rs429358 T>C

In [None]:
# Label every row from the two variant columns. The four genotype branches are
# mutually exclusive; any row matching none of them (including rows with a
# missing genotype) receives the literal string "NA", not a missing value.
# NOTE(review): rows with both variants present are labelled "e1"; the same
# pattern can presumably also come from an e2/e4 genotype -- confirm that this
# grouping is intended.
data = data %>%
    mutate(
        apoe = case_when(
            rs429358 == 0 & rs7412 == 0 ~ "e3",
            rs429358 == 0 & rs7412 >= 1 ~ "e2",
            rs429358 >= 1 & rs7412 == 0 ~ "e4",
            rs429358 >= 1 & rs7412 >= 1 ~ "e1",
            TRUE ~ "NA"
        )
    )

In [None]:
# Allele frequency check: mean of each variant column, ignoring missing values.
data %>%
    select(eid, rs429358, rs7412) %>%
    summarise(
        rs429358 = mean(rs429358, na.rm=TRUE),
        rs7412 = mean(rs7412, na.rm=TRUE)
    )

In [None]:
# Carrier frequency check: keep the ids, both variants and the dementia event
# column, recompute the apoe label and group by it. No counts are computed
# here; the result is only grouped.
# Rows matching none of the genotype branches (including missing genotypes)
# receive the literal string "NA".
data_check = data %>%
    select(eid, rs429358, rs7412, M_all_cause_dementia_event) %>%
    mutate(
        apoe = case_when(
            rs429358 == 0 & rs7412 == 0 ~ "e3",
            rs429358 == 0 & rs7412 >= 1 ~ "e2",
            rs429358 >= 1 & rs7412 == 0 ~ "e4",
            rs429358 >= 1 & rs7412 >= 1 ~ "e1",
            TRUE ~ "NA"
        )
    ) %>%
    group_by(apoe)

In [None]:
# Sanity check after the genotype join: `sum` is the total number of rows and
# `max` the largest number of rows sharing a single eid.
data %>%
    count(eid) %>%
    summarise(sum = sum(n), max = max(n))

In [None]:
# Reduce the labelled table to one logical flag per row: TRUE when the label is
# "e4", FALSE for every other label.
# NOTE(review): rows without genotype data carry the "NA" label and therefore
# end up FALSE rather than missing -- confirm this is what consumers expect.
data_apoe4 = data_check %>%
    ungroup() %>%
    transmute(eid, apoe4 = apoe %in% "e4")
# %>% summarise(mean(apoe4))

In [None]:
# Display the eid / apoe4 table before it is written out.
data_apoe4

In [None]:
# Persist the eid / apoe4 table to the project's data results folder.
arrow::write_feather(
    data_apoe4,
    glue("{data_results_path}/data_apoe4_220104.feather")
)