# 1. Data Portal Preprocessing

In [None]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(glue)
library(data.table)
library(tidyfast)
library("magrittr")
setwd("/")
base_path = "/sc-projects/sc-proj-ukb-cvd/data"
data_path = glue("{base_path}/0_raw/showcase_48024/tables_220317")
mapping_path = glue("{base_path}/mapping")
out_path = glue("{base_path}/1_decoded")
#data_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data"

## Schema

In [None]:
list.files(path = data_path)

In [None]:
print(system(glue("ls -ll {data_path}"), intern=TRUE))

## Load Athena Vocabulary

In [None]:
vocab_dir = glue("{mapping_path}/athena")
concept =fread(glue("{vocab_dir}/CONCEPT.csv"), sep='\t', quote = "")

In [None]:
relationship = fread(glue("{vocab_dir}/RELATIONSHIP.csv"), sep='\t')

In [None]:
vocabulary =  fread(glue("{vocab_dir}/VOCABULARY.csv"), sep='\t')

In [None]:
concept_relationship = fread(glue("{vocab_dir}/CONCEPT_RELATIONSHIP.csv"), sep='\t')

In [None]:
concept_ancestor = fread(glue("{vocab_dir}/CONCEPT_ANCESTOR.csv"), sep='\t')

In [None]:
rxconso = fread(glue("{mapping_path}/umls/RXNCONSO_220328.RRF"))

In [None]:
rxconso# %>% filter(V12=="RxNorm")

## Hospital Episode Statistics

In [None]:
hesin = fread(glue("{data_path}/hesin.txt"))

In [None]:
hesin_diag = fread(glue("{data_path}/hesin_diag.txt"))

In [None]:
hesin_critical = fread(glue("{data_path}/hesin_critical.txt")) #fread("/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_critical.txt")

In [None]:
#hesin_psych = fread(glue("{data_path}/hesin_psych.txt"))

In [None]:
#hesin_delivery = fread(glue("{data_path}/hesin_delivery.txt"))
#hesin_maternity = fread(glue("{data_path}/hesin_maternity.txt"))

### Diagnoses - ICD10

In [None]:
## icd9 to icd10 mapping
# Read the coding table and turn it into a lookup list: one element per distinct
# `coding` value, holding the `meaning` value(s) recorded for that coding.
# NOTE(review): assumes `coding` is the ICD9 code and `meaning` the ICD10 code - confirm against coding1836.tsv
icd9to10_df = fread(glue("{mapping_path}/codings/coding1836.tsv"))
icd9to10_mapping = split(icd9to10_df$meaning, icd9to10_df$coding)
# Rows that carry an ICD9 code: look the code up row by row and store the result as a
# list-column `diag_icd10`; drop_na() then removes rows whose lookup came back empty
# (presumably codes missing from the coding table - confirm).
hesin_diag_icd9 = hesin_diag %>% filter(diag_icd9!="") %>% dplyr::rowwise() %>% mutate(diag_icd10 = list(icd9to10_mapping[[diag_icd9]])) %>% drop_na(diag_icd10)
# Recombine both parts and tag the provenance: rows without an ICD9 code become
# "hes_icd10", the mapped ICD9 rows become "hes_icd9".
hesin_diag = rbind(hesin_diag %>% filter(diag_icd9=="") %>% mutate(origin="hes_icd10"), hesin_diag_icd9  %>% mutate(origin="hes_icd9"))

In [None]:
# Join the episode table onto the diagnosis rows by participant and episode index.
hes_join = hesin[hesin_diag, on = c("eid", "ins_index")]
# Keep only the columns needed downstream ...
hes_join = hes_join[, c("eid", "origin", "ins_index", "arr_index", "level", "epistart", "diag_icd10"), with = FALSE]
# ... and sort by participant, episode and position within the episode.
hes_join = hes_join[order(eid, ins_index, arr_index)]

In [None]:
hes_join_date = hes_join %>% rename(date="epistart") %>% mutate(date = ymd(as.Date(fast_strptime(date, "%d/%m/%Y"))))

In [None]:
hes_diagnoses_pre = hes_join_date %>% drop_na(date) %>% rename(code = "diag_icd10") %>% mutate(instance=ins_index) %>% group_by(eid) %>% mutate(n = arr_index)

In [None]:
hes_diagnoses_pre = hes_diagnoses_pre %>% 
    mutate(vocabulary_origin="ICD10", code_origin=unlist(code), level_origin=level) %>% 
    select(eid, origin, vocabulary_origin, code_origin, level_origin, date) %>% ungroup()

In [None]:
nrow(hes_diagnoses_pre)
head(hes_diagnoses_pre %>% arrange(desc(date)))

In [None]:
# Source side: ICD10 concepts; the first "." is stripped from concept_code
# (presumably to match the dot-less codes in the HES tables - confirm).
concept_ids_in = concept %>% filter(vocabulary_id == "ICD10") %>% mutate(concept_code = str_replace(concept_code, "\\.", ""))
# Target side: SNOMED concepts whose standard_concept flag is "S" or "C".
concept_ids_out = concept %>% filter(vocabulary_id == "SNOMED" & standard_concept %in% c("S", "C")) 

# ICD10 concepts with concept_id duplicated as concept_id_1
# NOTE(review): concept_ids is not read again in the visible notebook
concept_ids = concept_ids_in %>% mutate(concept_id_1 = concept_id)

# Keep only relationships that go from an ICD10 concept to one of the SNOMED target concepts.
cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_in$concept_id) %>% filter(concept_id_2 %in% concept_ids_out$concept_id) %>% arrange(concept_id_1)

In [None]:
mapping_out = concept_ids_in %>% 
    left_join(cr_filtered %>% select(concept_id_1, concept_id_2), by=c("concept_id"="concept_id_1")) %>% 
    left_join(concept_ids_out %>% select(concept_id, concept_code, concept_name), by=c("concept_id_2"="concept_id")) %>% 
    mutate(code_origin = concept_code.x, code=concept_code.y, name=concept_name.y)

In [None]:
# Attach the mapped SNOMED code and name to every HES diagnosis via its ICD10 origin code.
hes_diagnoses = left_join(
        hes_diagnoses_pre,
        mapping_out %>% select(code_origin, code, name),
        by = "code_origin"
    ) %>%
    ungroup() %>%
    # keep only records that have both a date and a mapped target code
    filter(!is.na(date), !is.na(code)) %>%
    mutate(vocabulary = "SNOMED") %>%
    select(eid, origin, vocabulary_origin, code_origin, level_origin, vocabulary, code, name, date) %>%
    # store the categorical columns as factors
    mutate(across(c(origin, vocabulary_origin, code_origin, vocabulary, code, name), factor))

In [None]:
nrow(hes_diagnoses)
head(hes_diagnoses %>% arrange(desc(date)))

In [None]:
arrow::write_feather(hes_diagnoses, glue("{out_path}/codes_hes_diagnoses_220328.feather"))

### Procedures - SNOMED CT

In [None]:
# Only OPCS4 is handled for now: no good OPCS3 mapping is available, so mapping to SNOMED CT is probably the most reasonable option.

In [None]:
hesin_oper = fread(glue("{data_path}/hesin_oper.txt"))#fread("/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_oper.txt")

In [None]:
hesin_oper[hesin_oper == ""] <- NA

In [None]:
library(visdat)
df = hesin_oper %>% sample_n(1000)
vis_miss(df)

In [None]:
hesin_oper %>% ungroup() %>% sample_n(1000000) %>% group_by(is.na(opdate)) %>% tally()#summarise(mean = nrow(~is.na(opdate))/nrow())

In [None]:
hesin_oper_pre = hesin_oper %>% rename(date="opdate", code="oper4") %>% 
    mutate(date = ymd(as.Date(fast_strptime(date, "%d/%m/%Y"))))  %>%
    mutate(origin="hes_opcs4", instance=ins_index, code_origin=code) %>% group_by(eid) %>% mutate(n = arr_index) %>% select(eid, origin, instance, n, level, code_origin, date)

In [None]:
concept_ids_opcs4 = concept %>% filter(vocabulary_id == "OPCS4") %>% mutate(concept_code = str_replace(concept_code, "\\.", ""))
concept_ids_snomed = concept %>% filter(vocabulary_id == "SNOMED" & domain_id=="Procedure" & standard_concept %in% c("S", "C")) 

# check necessary opcs4 concept ids
concept_ids = concept_ids_opcs4 %>% mutate(concept_id_1 = concept_id)

cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_opcs4$concept_id) %>% filter(concept_id_2 %in% concept_ids_snomed$concept_id) %>% arrange(concept_id_1)

In [None]:
mapping_opcs4_snomed = concept_ids_opcs4 %>% 
    left_join(cr_filtered %>% select(concept_id_1, concept_id_2), by=c("concept_id"="concept_id_1")) %>% 
    left_join(concept_ids_snomed %>% select(concept_id, concept_code, concept_name), by=c("concept_id_2"="concept_id")) %>% 
    mutate(code_origin = concept_code.x, code=concept_code.y, name=concept_name.y)

In [None]:
hes_procedures = hesin_oper_pre %>% left_join(mapping_opcs4_snomed %>% select(code_origin, code, name), by="code_origin") %>% 
    ungroup() %>% filter(!is.na(date)&!is.na(code)) %>% 
    mutate(vocabulary_origin="OPCS4", vocabulary="SNOMED", level_origin=level) %>% 
    select(eid, origin, vocabulary_origin, code_origin, level_origin, vocabulary, code,	name, date) %>%
    mutate(origin=factor(origin), vocabulary_origin=factor(vocabulary_origin), 
           code_origin=factor(code_origin), vocabulary=factor(vocabulary), 
           code=factor(code), name=factor(name))
nrow(hes_procedures)
hes_procedures %>% arrange(desc(date)) %>% head()#sample_n(5)

In [None]:
arrow::write_feather(hes_procedures, glue("{out_path}/codes_hes_procedures_220328.feather"))

In [None]:
# Round-trip check: read back the procedures file written in the previous cell and preview it.
# The file name now matches the one that is written (..._220328); the old ..._220317
# name pointed at a file this notebook never produces.
test = arrow::read_feather(glue("{out_path}/codes_hes_procedures_220328.feather"))
test %>% head()

## Mortality Records - ICD10

In [None]:
death = fread(glue("{data_path}/death.txt")) #fread("/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/death.txt")
death_cause =  fread(glue("{data_path}/death_cause.txt"))

In [None]:
death_join = death[death_cause, on=c("eid", "ins_index")]

In [None]:
death_join = death_join[, c("eid", "ins_index", "arr_index", "level", "date_of_death", "cause_icd10")][order(eid, ins_index, arr_index),]

In [None]:
death_join_date = death_join %>% rename(date="date_of_death") %>% rename(code = "cause_icd10") %>% mutate(date = ymd(as.Date(fast_strptime(date, "%d/%m/%Y"))))

In [None]:
codes_death = death_join_date  %>% mutate(instance=0) %>% mutate(origin="death_records") %>% group_by(eid) %>% mutate(n=row_number())
codes_death = codes_death %>% mutate(meaning=str_sub(code, 1, 3)) %>% select(c(eid, origin, instance, n, level, code, meaning, date)) %>% ungroup()

In [None]:
# Source side: ICD10 concepts; the first "." is stripped from concept_code
# (presumably to match the dot-less cause_icd10 codes in the death records - confirm).
concept_ids_in = concept %>% filter(vocabulary_id == "ICD10") %>% mutate(concept_code = str_replace(concept_code, "\\.", ""))
# Target side: SNOMED concepts whose standard_concept flag is "S" or "C".
concept_ids_out = concept %>% filter(vocabulary_id == "SNOMED" & standard_concept %in% c("S", "C")) 

# ICD10 concepts with concept_id duplicated as concept_id_1
# NOTE(review): concept_ids is not read again in the visible notebook
concept_ids = concept_ids_in %>% mutate(concept_id_1 = concept_id)

# Keep only relationships that go from an ICD10 concept to one of the SNOMED target concepts.
cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_in$concept_id) %>% filter(concept_id_2 %in% concept_ids_out$concept_id) %>% arrange(concept_id_1)

In [None]:
mapping_out = concept_ids_in %>% 
    left_join(cr_filtered %>% select(concept_id_1, concept_id_2), by=c("concept_id"="concept_id_1")) %>% 
    left_join(concept_ids_out %>% select(concept_id, concept_code, concept_name), by=c("concept_id_2"="concept_id")) %>% 
    mutate(code_origin = concept_code.x, code=concept_code.y, name=concept_name.y)

In [None]:
codes_death = codes_death %>% 
    mutate(vocabulary_origin="ICD10", code_origin=code, level_origin=level) %>%
    select(eid, origin, vocabulary_origin, code_origin, level_origin, date) %>%
    left_join(mapping_out %>% select(code_origin, code, name), by="code_origin") %>% 
    ungroup() %>% filter(!is.na(date)&!is.na(code)) %>% 
    mutate(vocabulary="SNOMED") %>%
    select(eid, origin, vocabulary_origin, code_origin, level_origin, vocabulary, code, name, date) %>%
    mutate(origin=factor(origin), vocabulary_origin=factor(vocabulary_origin), 
           code_origin=factor(code_origin), vocabulary=factor(vocabulary), 
           code=factor(code), name=factor(name))

In [None]:
nrow(codes_death)

In [None]:
head(codes_death %>% arrange(desc(date)))

In [None]:
arrow::write_feather(codes_death, glue("{out_path}/codes_death_records_220328.feather"))

## GP Records

In [None]:
library(dtplyr)
library(dplyr, warn.conflicts = FALSE)

In [None]:
# Coding table for the GP `code_type` column.
gp_code_types = fread(glue("{mapping_path}/codings/coding3175.tsv"))
# Named lookup vector: names are the `coding` values, elements the matching `meaning` labels.
gp_code_type_map = setNames(gp_code_types$meaning, gp_code_types$coding)

#### Clinical

In [None]:
## emis

In [None]:
gp_clinical_emis_raw = fread(
    glue("{data_path}/covid19_emis_gp_clinical.txt"), 
    colClasses = c("eid"="integer", 
                   "event_dt"="character", 
                   "code_type"="character", 
                   "code"="character",
                   "value"="character", 
                   "unit"="character"))

In [None]:
nrow(gp_clinical_emis_raw)
# Preview five randomly chosen rows. base::sample() is not suitable here: on a
# data.table it draws indices from the number of columns and uses them as row
# indices, so it only ever shows rows from the very top of the table.
gp_clinical_emis_raw %>% sample_n(5)

In [None]:
gp_clinical_emis_raw = lazy_dt(gp_clinical_emis_raw)

In [None]:
gp_clinical_emis = gp_clinical_emis_raw %>%
    mutate(code_type=as.factor(code_type), code=as.factor(code)) %>%
    mutate(date = ymd(as.Date(fast_strptime(event_dt, "%d/%m/%Y")))) %>% 
    select(eid, code_type, code, date)

In [None]:
## tpp

In [None]:
gp_clinical_tpp_raw = fread(
    glue("{data_path}/covid19_tpp_gp_clinical.txt"), 
    colClasses = c("eid"="integer", 
                   "event_dt"="character", 
                   "code_type"="character", 
                   "code"="character",
                   "value"="character")
    )

In [None]:
nrow(gp_clinical_tpp_raw)
# Preview five randomly chosen rows. base::sample() is not suitable here: on a
# data.table it draws indices from the number of columns and uses them as row
# indices, so it only ever shows rows from the very top of the table.
gp_clinical_tpp_raw %>% sample_n(5)

In [None]:
gp_clinical_tpp_raw = lazy_dt(gp_clinical_tpp_raw)

In [None]:
gp_clinical_tpp = gp_clinical_tpp_raw %>%
    mutate(code_type=as.factor(code_type), code=as.factor(code)) %>%
    mutate(date = ymd(as.Date(fast_strptime(event_dt, "%d/%m/%Y")))) %>% 
    select(eid, code_type, code, date)

In [None]:
gp_clinical = bind_rows(gp_clinical_emis %>% as.data.table(), 
                        gp_clinical_tpp %>% as.data.table()
                       )

In [None]:
gp_clinical$code_type = recode(gp_clinical$code_type, !!!gp_code_type_map)#%>% sample_n(100)

In [None]:
gp_clinical %>% head()

In [None]:
# write intermediate results
arrow::write_feather(gp_clinical, glue("{out_path}/codes_gp_clinical_raw_220317.feather"))

In [None]:
gp_clinical = gp_clinical %>% filter(code_type %in% c("SNOMED CT", "CTV3"))

In [None]:
# get snomed records
gp_clinical_sct = gp_clinical %>% 
    filter(code_type=="SNOMED CT") %>% 
    mutate(origin="gp_sct") %>% 
    mutate(vocabulary_origin="SNOMED", code_origin = code) %>%
    mutate(vocabulary="SNOMED") %>%
    select(eid, origin, vocabulary_origin, code_origin, vocabulary, code, date) %>%
    as.data.table()

In [None]:
# get and map ctv3 records
readv3_sct = fread(glue("{mapping_path}/gp_codings/CTV3SCTMAP.csv")) %>% 
    rename(SCUI="V1", STUI="V2", TCUI="V3", TTUI="V4")%>% rename(code_origin="SCUI", code="TCUI") %>% 
    select(code_origin, code)

In [None]:
gp_clinical_ctv3 = gp_clinical %>% 
    filter(code_type=="CTV3") %>% 
    mutate(origin="gp_ctv3") %>% 
    mutate(vocabulary_origin="CTV3", code_origin = code) %>%
    select(eid, origin, vocabulary_origin, code_origin, date) 

In [None]:
# map read3 records to snomed
gp_clinical_ctv3_sct = gp_clinical_ctv3 %>% 
    left_join(readv3_sct, by="code_origin") %>% 
    mutate(vocabulary="SNOMED") %>%
    select(eid, origin, vocabulary_origin, code_origin, vocabulary, code, date) %>%
    as.data.table()

In [None]:
gp_clinical_all = bind_rows(gp_clinical_sct, gp_clinical_ctv3_sct)

In [None]:
gp_clinical_all = gp_clinical_all %>% 
    mutate(origin = factor(origin),
           vocabulary_origin = factor(vocabulary_origin),
           code_origin = factor(code_origin),
           vocabulary = factor(vocabulary),
           code = factor(code)
          )

In [None]:
# clean impossible dates

In [None]:
gp_clinical_cleaned = gp_clinical_all %>% filter(date>="1902-02-02"&date<"2037-07-07") %>% as.data.table()

In [None]:
gp_clinical_cleaned %>% arrange(date) %>% head()

In [None]:
# write intermediate results
arrow::write_feather(gp_clinical_cleaned, glue("{out_path}/codes_gp_clinical_220317.feather"))

In [None]:
### prepare for later

In [None]:
gp_clinical_cleaned = arrow::read_feather(glue("{out_path}/codes_gp_clinical_220317.feather"))

In [None]:
gp_clinical_cleaned %>% head()

In [None]:
concept_join = concept %>% filter(vocabulary_id=="SNOMED") %>% select(concept_code, domain_id, concept_name) %>% mutate(concept_code = factor(concept_code))

In [None]:
test = gp_clinical_cleaned  %>% left_join(concept_join, by=c("code"="concept_code"))

In [None]:
test %>% head()

#### Prescriptions

In [None]:
# Coding table for the GP `code_type` column.
gp_code_types = fread(glue("{mapping_path}/codings/coding3175.tsv"))
# Named lookup vector: names are the `coding` values, elements the matching `meaning` labels.
gp_code_type_map = setNames(gp_code_types$meaning, gp_code_types$coding)

In [None]:
# EMIS

In [None]:
gp_emis_scripts = fread(glue("{data_path}/covid19_emis_gp_scripts.txt"))
nrow(gp_emis_scripts)

In [None]:
gp_emis_scripts$code_type = recode(gp_emis_scripts$code_type, !!!gp_code_type_map)#%>% sample_n(100)

In [None]:
gp_emis_scripts = gp_emis_scripts %>% mutate(date = ymd(as.Date(fast_strptime(issue_date, "%d/%m/%Y"))))

In [None]:
gp_emis_scripts = gp_emis_scripts %>% select(eid, code_type, code, date)

In [None]:
# Number of EMIS prescription records per code_type.
# tally() has to run while the data is still grouped; converting to a plain
# data.frame first drops the grouping and leaves only a single grand total.
gp_emis_scripts %>% group_by(code_type) %>% tally() %>% as.data.frame()

In [None]:
gp_emis_scripts = gp_emis_scripts %>% filter(code_type=="dm+d") %>% 
    select(-code_type) %>% mutate(origin="gp_scripts_emis", vocabulary_origin="dm+d", code_origin=code) %>% 
    select(eid, origin, vocabulary_origin, code_origin, date)

In [None]:
gp_emis_scripts %>% head() 

In [None]:
# TPP

In [None]:
# tpp
gp_tpp_scripts = fread(glue("{data_path}/covid19_tpp_gp_scripts.txt"), 
                       colClasses=c("integer", "character", "character")) 
nrow(gp_tpp_scripts)

In [None]:
gp_tpp_scripts = gp_tpp_scripts %>% 
    mutate(date = ymd(as.Date(fast_strptime(issue_date, "%d/%m/%Y")))) %>% 
    mutate(origin="gp_scripts_tpp", vocabulary_origin="dm+d", code_origin=dmd_code)  %>%
    select(eid, origin, vocabulary_origin, code_origin, date)
nrow(gp_tpp_scripts)

In [None]:
gp_scripts = bind_rows(gp_emis_scripts, gp_tpp_scripts)

In [None]:
arrow::write_feather(gp_scripts, glue("{out_path}/codes_gp_scripts_raw_220317.feather"))

In [None]:
gp_scripts = arrow::read_feather(glue("{out_path}/codes_gp_scripts_raw_220317.feather"))

In [None]:
# Map source concepts to target concepts through the global `concept_relationship` table.
#
# Both arguments are expected to be subsets of the Athena `concept` table: the
# `.x`/`.y` suffixed columns used below only exist when both sides carry
# concept_code, concept_name, vocabulary_id, domain_id and concept_class_id.
#
# @param concept_ids_in  source concepts (left side of the relationship, concept_id_1)
# @param concept_ids_out target concepts (right side of the relationship, concept_id_2)
# @return one distinct row per (source code, relationship, target concept) with columns
#         code_origin (source concept_code), vocabulary / code / name / domain /
#         concept_class (taken from the target concept), relationship_id, and
#         id_origin (the target's concept_id, i.e. concept_id_2).
#         Source concepts without any matching target are dropped.
map_codes = function(concept_ids_in, concept_ids_out){
    # relationships that start at a source concept and end at a target concept
    cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_in$concept_id) %>% filter(concept_id_2 %in% concept_ids_out$concept_id) %>% arrange(concept_id_1)
    
    mapping =  concept_ids_in %>% 
        left_join(cr_filtered %>% select(concept_id_1, relationship_id, concept_id_2), by=c("concept_id"="concept_id_1")) %>%
        left_join(concept_ids_out %>% select(concept_id, vocabulary_id, concept_code, concept_name, domain_id, concept_class_id), by=c("concept_id_2"="concept_id")) %>% 
        # .x columns come from the source concept, .y columns from the target concept
        mutate(code_origin = concept_code.x, vocabulary=vocabulary_id.y, code=concept_code.y, name=concept_name.y, id_origin=concept_id_2, domain=domain_id.y, concept_class=concept_class_id.y) %>%
        # id_origin is NA exactly for source concepts that found no target
        select(code_origin, vocabulary, code, name, relationship_id, id_origin, domain, concept_class) %>% distinct() %>% filter(!is.na(id_origin))
    return(mapping)
    }

In [None]:
# Attach every ancestor concept to the rows of a mapping table.
#
# @param mapping_in table with an `id_origin` column holding concept ids
# @return `mapping_in` left-joined to the global `concept_ancestor` table
#         (id_origin matched against descendant_concept_id) and then to the
#         global `concept` table for the ancestor's metadata; one row per
#         (input row, ancestor) pair.
find_ancestors = function(mapping_in){
    with_ancestor_ids = left_join(mapping_in, concept_ancestor, by=c("id_origin"="descendant_concept_id"))
    left_join(with_ancestor_ids, concept, by=c("ancestor_concept_id"="concept_id"))
    }

In [None]:
# Attach every descendant concept to the rows of a mapping table.
#
# @param mapping_in table with an `id_origin` column holding concept ids
# @return `mapping_in` left-joined to the global `concept_ancestor` table
#         (id_origin matched against ancestor_concept_id) and then to the
#         global `concept` table for the descendant's metadata; one row per
#         (input row, descendant) pair.
find_descendant = function(mapping_in){
    mapping_desc = mapping_in %>% left_join(concept_ancestor, by=c("id_origin"="ancestor_concept_id"))
    mapping_desc_md = mapping_desc %>% left_join(concept, by=c("descendant_concept_id"="concept_id"))
    # return the descendant table built above (the previous code returned
    # `mapping_anc_md`, a name that does not exist inside this function)
    return(mapping_desc_md)
    }

##### Drugs to ingredients

In [None]:
concept_ids_in = concept %>% filter(vocabulary_id %in% c("dm+d", "SNOMED"))
concept_ids_out = concept %>% filter(vocabulary_id %in% c("RxNorm", "RxNorm Extension", "SNOMED"))  # & standard_concept %in% c("S", "C")) 

mapping_dmd_drugs = map_codes(concept_ids_in, concept_ids_out) %>% filter(domain=="Drug") 
mapping_drugs_ingredient = find_ancestors(mapping_dmd_drugs) %>% filter(concept_class_id=="Ingredient", vocabulary_id=="RxNorm", standard_concept %in% c("S", "C")) %>% arrange(code_origin)

In [None]:
mapping_drugs_ingredient_clean = mapping_drugs_ingredient %>% 
    mutate(code = factor(concept_code), vocabulary=factor(vocabulary_id), name=factor(concept_name)) %>%
    select(code_origin, vocabulary, code, name) %>% 
    distinct()

print(length(unique(mapping_drugs_ingredient_clean$code_origin)))
mapping_drugs_ingredient_clean %>% head()

##### Drugs to vaccines

In [None]:
concept_ids_in = concept %>% filter(vocabulary_id %in% c("SNOMED", "dm+d"))
concept_ids_out = concept %>% filter(vocabulary_id %in% c("RxNorm", "RxNorm Extension", "SNOMED", "CVX"))# & standard_concept %in% c("S", "C")) 

mapping_dmd_vaccine = map_codes(concept_ids_in, concept_ids_out) %>% filter(domain=="Drug") 
mapping_drugs_vaccine = find_ancestors(mapping_dmd_vaccine) %>% filter(vocabulary_id=="CVX", standard_concept %in% c("S", "C")) %>% arrange(code_origin)

In [None]:
mapping_drugs_vaccine_clean = mapping_drugs_vaccine %>% 
    mutate(code = factor(concept_code), vocabulary=factor(vocabulary_id), name=factor(concept_name)) %>%
    select(code_origin, vocabulary, code, name) %>% 
    distinct()

print(length(unique(mapping_drugs_vaccine_clean$code_origin)))
mapping_drugs_vaccine_clean %>% head()

In [None]:
mapping_drugs_vaccine_clean %>% head()

##### Drugs to devices

In [None]:
## prepare map

In [None]:
dmd_devices_vocab = concept %>% filter(vocabulary_id %in% c("dm+d")) %>% filter(domain_id=="Device")  %>% select(concept_code, concept_class_id, concept_name) %>%
    rename(dmd_concept_class_id = concept_class_id, dmd_concept_name = concept_name) %>%
    left_join(concept %>% filter(vocabulary_id=="SNOMED") , by="concept_code")

In [None]:
dmd_devices_vocab %>% head()

In [None]:
vmpps = (dmd_devices_vocab %>% filter(dmd_concept_class_id=='VMPP'))$concept_id
vmps =  (dmd_devices_vocab %>% filter(dmd_concept_class_id=='VMP'))$concept_id
ampps = (dmd_devices_vocab %>% filter(dmd_concept_class_id=='AMPP'))$concept_id
amps =  (dmd_devices_vocab %>% filter(dmd_concept_class_id=='AMP'))$concept_id

In [None]:
from_vmpp_to_vmp = concept_relationship %>% filter((concept_id_1 %in% vmpps) & (relationship_id == 'Has VMP')) 
from_vmpp_to_vmp %>% head()

In [None]:
from_amp_to_vmp = concept_relationship %>% filter((concept_id_1 %in% amps) & (relationship_id == 'Is a') & (concept_id_2 %in% vmps))
from_amp_to_vmp %>% head()

In [None]:
from_ampp_to_amp = concept_relationship %>% filter((concept_id_1 %in% ampps) & (relationship_id == 'Has AMP')) 
from_ampp_to_amp %>% head()

In [None]:
# prepare prescriptions for map

In [None]:
gp_scripts_devices = gp_scripts %>% group_by(code_origin) %>% summarise(count = n()) %>% arrange(desc(count))

In [None]:
gp_scripts_devices_md = gp_scripts_devices %>% left_join(dmd_devices_vocab %>% select(concept_code, dmd_concept_class_id, concept_id), by=c("code_origin"="concept_code")) %>% filter(!is.na(dmd_concept_class_id))

In [None]:
# only AMPs and VMPs exist
gp_scripts_devices_md %>% group_by(dmd_concept_class_id) %>% summarise(n=n(), n_sum=sum(count))

In [None]:
# map VMP directly to VMP
gp_scripts_vmp_to_vmp = gp_scripts_devices_md %>% filter(dmd_concept_class_id=="VMP") %>% left_join(concept, by="concept_id")
gp_scripts_vmp_to_vmp %>% head()

In [None]:
# map AMP to VMP
gp_scripts_amp_to_vmp = gp_scripts_devices_md %>% filter(dmd_concept_class_id=="AMP") %>% 
    rename(amp_concept_id = concept_id) %>% 
    left_join(from_amp_to_vmp, by=c("amp_concept_id" = "concept_id_1")) %>%
    rename(concept_id = concept_id_2) %>%
    select(code_origin, count, dmd_concept_class_id, amp_concept_id, concept_id) %>%
    left_join(concept, by="concept_id")
gp_scripts_amp_to_vmp %>% head()

In [None]:
mapping_devices_vmps_clean = bind_rows(gp_scripts_vmp_to_vmp, gp_scripts_amp_to_vmp) %>% #standard_concept %in% c("S", "C") %>%
    select(code_origin, count, concept_code, concept_name, vocabulary_id) %>% distinct() %>% arrange(desc(count)) %>%
    mutate(code = factor(concept_code), vocabulary=factor(vocabulary_id), name=factor(concept_name)) %>%
    select(code_origin, vocabulary, code, name) %>% 
    distinct()
    
mapping_devices_vmps_clean %>% head()

In [None]:
# NOTE(review): `mapping_dmd_devices` is not assigned anywhere in the visible notebook;
# this cell presumably relies on an object from an earlier session - confirm or remove.
length(unique(mapping_dmd_devices$id_origin))

In [None]:
# Attach ancestor concept ids to the device mapping, then the ancestors' metadata from `concept`.
# NOTE(review): `mapping_dmd_devices` is not assigned anywhere in the visible notebook - confirm where it comes from.
mapping_devices_anc = mapping_dmd_devices %>% left_join(concept_ancestor, by=c("id_origin"="descendant_concept_id"))
mapping_devices_anc_md = mapping_devices_anc %>% left_join(concept, by=c("ancestor_concept_id"="concept_id"))

In [None]:
mapping_devices_anc_md_clean = mapping_devices_anc_md %>% 
    filter(!concept_name %in% c("Device", 
                                "Physical object", 
                                "NHS dm+d virtual appliance pack", 
                                "NHS dm+d actual appliance pack", 
                                "UK device", 
                                "NHS dm+d virtual appliance", 
                                "NHS dm+d actual appliance", 
                                "Biomedical device",
                               "Clinical equipment and/or device"))

In [None]:
mapping_devices_anc_md_clean %>% group_by(concept_name) %>% tally() %>% arrange(desc(n))

In [None]:
# NOTE(review): `mapping_drugs_anc_md` is not assigned anywhere in the visible notebook.
# If this cell does run, it overwrites the `mapping_drugs_ingredient` built in the
# "Drugs to ingredients" section, this time without the standard_concept filter - confirm or remove.
mapping_drugs_ingredient = mapping_drugs_anc_md %>% filter(concept_class_id=="Ingredient", vocabulary_id=="RxNorm") %>% arrange(code_origin)

In [None]:
# NOTE(review): `mapping_dmd_device_md` is not assigned anywhere in the visible notebook - confirm or remove.
mapping_dmd_device_md

In [None]:
# Row count per domain_id.
# NOTE(review): `mapping_dmd_nondrugs_md` is not assigned anywhere in the visible notebook - confirm or remove.
mapping_dmd_nondrugs_md %>% group_by(domain_id) %>% tally()

In [None]:
# Row count per vocabulary_id.
# NOTE(review): `mapping_dmd_nondrugs_md` is not assigned anywhere in the visible notebook - confirm or remove.
mapping_dmd_nondrugs_md %>% group_by(vocabulary_id) %>% tally()

##### Rest to Standards

In [None]:
# Pool the three code_origin -> target-code maps (vaccines, ingredients, device VMPs).
mapping_successful = bind_rows(mapping_drugs_vaccine_clean, mapping_drugs_ingredient_clean, mapping_devices_vmps_clean) 
# Distinct prescription codes whose `code` is still NA after the left join, i.e. codes
# that none of the maps could translate. The join key is spelled out so it cannot
# silently change if either table gains another column with a shared name.
mapping_unsuccessful = gp_scripts %>% select(code_origin) %>% distinct() %>% left_join(mapping_successful, by="code_origin") %>% filter(is.na(code))

In [None]:
gp_unmapped = gp_scripts %>% 
    left_join(mapping_unsuccessful %>% select(code_origin) %>% mutate(unmapped=1), by='code_origin') %>%
    select(eid, origin, vocabulary_origin, code_origin, date, unmapped) %>% filter(unmapped==1)

In [None]:
missing_md = gp_unmapped  %>% group_by(code_origin) %>% tally() %>% arrange(desc(n)) %>% 
    left_join(concept %>% mutate(concept_code = as.character(concept_code)), by=c("code_origin"="concept_code")) %>% 
    filter(vocabulary_id=="dm+d")# %>% 

##### Execute mapping to prescriptions

In [None]:
gp_mapped = gp_scripts %>% 
    left_join(mapping_successful, by='code_origin') %>%
    select(eid, origin, vocabulary_origin, code_origin, vocabulary, code, name, date) %>%
    mutate(origin = factor(origin),
           vocabulary_origin = factor(vocabulary_origin),
           code_origin = factor(code_origin),
           vocabulary = factor(vocabulary),
           code = factor(code),
            name = factor(name)
          ) %>%
    filter(date < "2022-03-28")

In [None]:
gp_mapped %>% head()

In [None]:
mapped_md = gp_mapped  %>% group_by(code) %>% tally() %>% arrange(desc(n)) %>% 
    left_join(concept %>% mutate(concept_code = as.character(concept_code)), by=c("code"="concept_code")) %>% 
    filter(vocabulary_id %in% c("SNOMED", "RxNorm", "CVX"))# %>% 

In [None]:
# Persist the mapped GP prescriptions.
# NOTE(review): "presciptions" in the file name looks like a typo of "prescriptions";
# left unchanged because other code may read this exact path - confirm before renaming.
arrow::write_feather(gp_mapped, glue('{out_path}/codes_gp_presciptions_220407.feather'))