# 1. Standardize Codes into our Standard OMOP Concepts

In [None]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(glue)
library(data.table)
library(tidyfast)
library("magrittr")
# NOTE(review): every path built below is absolute, so this working-directory
# change looks unnecessary -- confirm before removing.
setwd("/")

# Project data root and the sub-directories derived from it.
base_path <- "/sc-projects/sc-proj-ukb-cvd/data"
data_path <- glue("{base_path}/0_raw/showcase_48024/tables_220317")
mapping_path <- glue("{base_path}/mapping")
out_path <- glue("{base_path}/1_decoded")
# Alternative data location, kept for reference:
#data_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data"

In [None]:
library(dtplyr)

## Schema

## Load Athena Vocabulary

In [None]:
# Directory holding the Athena vocabulary tables.
vocab_dir <- glue("{mapping_path}/athena")

# CONCEPT table: tab-separated; quote handling is switched off (quote = "")
# so that quote characters inside fields are read literally.
concept <- fread(
  glue("{vocab_dir}/CONCEPT.csv"),
  sep = "\t",
  quote = ""
)

In [None]:
# RELATIONSHIP table (tab-separated) from the vocabulary directory.
relationship <- glue("{vocab_dir}/RELATIONSHIP.csv") %>%
  fread(sep = "\t")

In [None]:
# VOCABULARY table (tab-separated) from the vocabulary directory.
vocabulary <- glue("{vocab_dir}/VOCABULARY.csv") %>%
  fread(sep = "\t")

In [None]:
# CONCEPT_RELATIONSHIP table (tab-separated); used further down to follow
# "Maps to" links between concept ids.
concept_relationship <- glue("{vocab_dir}/CONCEPT_RELATIONSHIP.csv") %>%
  fread(sep = "\t")

# Load all data and check standard adherence

## GP

In [None]:
# GP clinical codes, read from the decoded feather file.
gp_clinical <- arrow::read_feather(
  glue("{out_path}/codes_gp_clinical_220317.feather")
)

In [None]:
# Preview the first rows of the GP clinical table.
head(gp_clinical)

In [None]:
# GP prescription codes, read from the decoded feather file.
gp_scripts <- arrow::read_feather(
  glue("{out_path}/codes_gp_presciptions_220407.feather")
)

In [None]:
# Preview the first rows of the GP prescriptions table.
head(gp_scripts)

## HES/ONS

In [None]:
# HES diagnosis codes: load the decoded feather file, then preview it.
hes_diagnoses <- arrow::read_feather(
  glue("{out_path}/codes_hes_diagnoses_220328.feather")
)
head(hes_diagnoses)

In [None]:
# HES procedure codes: load the decoded feather file, then preview it.
hes_procedures <- arrow::read_feather(
  glue("{out_path}/codes_hes_procedures_220328.feather")
)
head(hes_procedures)

In [None]:
# Death-record codes: load the decoded feather file, then preview it.
death_diagnoses <- arrow::read_feather(
  glue("{out_path}/codes_death_records_220328.feather")
)
head(death_diagnoses)

### bind data

In [None]:
# Stack all record sources into one long table, keep only the shared columns,
# and store the categorical columns as factors.
records_raw <- bind_rows(
  hes_diagnoses,
  hes_procedures,
  death_diagnoses,
  gp_clinical,
  gp_scripts
) %>%
  select(eid, origin, vocabulary_origin, code_origin, vocabulary, code, date) %>%
  mutate(
    origin = factor(origin),
    vocabulary_origin = factor(vocabulary_origin),
    code_origin = factor(code_origin),
    vocabulary = factor(vocabulary),
    code = factor(code)
  )

In [None]:
# Show ten randomly drawn rows of the combined table.
sample_n(records_raw, 10)

In [None]:
# Sanity check: collect records dated after 2022-03-28 and display them.
# NOTE(review): the cutoff presumably marks the latest extract date, so this
# table is expected to be empty -- confirm.
records_future <- filter(records_raw, date > "2022-03-28")
records_future

In [None]:
# Unique (vocabulary, code) pairs that occur anywhere in the records.
record_codes <- distinct(select(records_raw, vocabulary, code))

In [None]:
# Attach concept metadata to every observed code by matching on
# vocabulary and code; codes without a match keep NA metadata.
record_codes_md <- left_join(
  record_codes,
  concept,
  by = c(vocabulary = "vocabulary_id", code = "concept_code")
)

In [None]:
# List codes whose standard_concept flag is neither "S" nor "C"; this also
# includes codes that found no match in `concept` (flag is NA).
filter(record_codes_md, !(standard_concept %in% c("S", "C")))

In [None]:
# Source side of the mapping: concept ids of the codes seen in the records.
concept_ids_in <- as_tibble(
  select(record_codes_md, vocabulary, code, concept_id)
)

# Target side of the mapping: concepts from the SNOMED, RxNorm and CVX
# vocabularies whose standard_concept flag is "S" or "C".
concept_ids_out <- as_tibble(
  filter(
    concept,
    vocabulary_id %in% c("SNOMED", "RxNorm", "CVX"),
    standard_concept %in% c("S", "C")
  )
)

In [None]:
# Copy of the source ids with concept_id duplicated as concept_id_1.
# NOTE(review): not referenced again in the visible code -- possibly unused.
concept_ids <- mutate(concept_ids_in, concept_id_1 = concept_id)

# Keep only "Maps to" relationships that lead from one of our source concepts
# to one of the accepted target concepts, ordered by source concept id.
cr_filtered <- concept_relationship %>%
  filter(relationship_id %in% c("Maps to")) %>%
  filter(concept_id_1 %in% concept_ids_in$concept_id) %>%
  filter(concept_id_2 %in% concept_ids_out$concept_id) %>%
  arrange(concept_id_1) %>%
  as_tibble()

In [None]:
# Build the code -> OMOP concept lookup:
#   1. follow the filtered "Maps to" links from each source concept,
#   2. attach descriptive columns of the target concept,
#   3. call the target id `omop_id`, drop codes without a target, and
#      remove duplicate rows.
mapping_omop <- concept_ids_in %>%
  left_join(
    select(cr_filtered, concept_id_1, relationship_id, concept_id_2),
    by = c(concept_id = "concept_id_1")
  ) %>%
  left_join(
    select(
      concept_ids_out,
      concept_id, vocabulary_id, concept_code, concept_name, domain_id
    ),
    by = c(concept_id_2 = "concept_id")
  ) %>%
  rename(omop_id = concept_id_2) %>%
  filter(!is.na(omop_id)) %>%
  distinct()

In [None]:
# Annotate every record with its mapped OMOP concept id (NA when the code has
# no mapping).
records_omop <- left_join(
  records_raw,
  select(mapping_omop, vocabulary, code, omop_id),
  by = c("vocabulary", "code")
)

In [None]:
# Slimmed-down concept table: keep the id plus descriptive columns and store
# every descriptive column as a factor.
concept_factor <- concept %>%
  select(
    concept_id, concept_name, domain_id, vocabulary_id,
    concept_class_id, standard_concept, concept_code
  ) %>%
  mutate(
    concept_name = factor(concept_name),
    domain_id = factor(domain_id),
    vocabulary_id = factor(vocabulary_id),
    concept_class_id = factor(concept_class_id),
    standard_concept = factor(standard_concept),
    concept_code = factor(concept_code)
  ) %>%
  as_tibble()

In [None]:
# Add the target concept's metadata to each record, matching omop_id against
# concept_id.
record_omop_md <- left_join(
  records_omop,
  concept_factor,
  by = c(omop_id = "concept_id")
)

In [None]:
# Drop records that could not be mapped to an OMOP concept.
record_omop_md <- filter(record_omop_md, !is.na(omop_id))

In [None]:
# Domains (values of domain_id) that are kept in the final export.
domain_selection <- c(
  "Observation",
  "Condition",
  "Procedure",
  "Drug",
  "Device"
)

In [None]:
# Final table: restrict to the selected domains, re-encode `code` as a factor,
# and keep the export columns (omop_id is exported under the name concept_id).
record_omop_filtered <- record_omop_md %>%
  filter(domain_id %in% domain_selection) %>%
  mutate(code = factor(code)) %>%
  select(
    eid, origin, vocabulary_origin, code_origin, date,
    concept_id = omop_id,
    code, concept_name, domain_id, concept_class_id, vocabulary_id
  )

In [None]:
# Persist the mapped and filtered records as a feather file in out_path.
arrow::write_feather(
  record_omop_filtered,
  glue("{out_path}/dataportal_records_omop_220407.feather")
)