# Benchmarks

## Initialize

In [None]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

In [None]:
# Pick the storage root from the host name: node names containing "sc" use
# the /sc-projects mount, any other machine uses the shared analysis volume.
base_path = if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
    "/sc-projects/sc-proj-ukb-cvd"
} else {
    "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project-level locations derived from the storage root.
project_label = "22_medical_records"
project_path = glue("{base_path}/results/projects/{project_label}")
figure_path = glue("{project_path}/figures")
output_path = glue("{project_path}/data")

# Experiment identifier and its directory below the project data folder.
experiment = 220627
experiment_path = glue("{output_path}/{experiment}")

In [None]:
# Load the Athena CONCEPT table from its fixed location on the /sc-projects
# mount (the path is absolute and does not go through `base_path`).
concept = fread(
    "/sc-projects/sc-proj-ukb-cvd/data/mapping/athena/CONCEPT.csv"
)

## Generate Data

In [None]:
# Read the pre-processed records table from its feather artifact
# (absolute path on the /sc-projects mount).
records = arrow::read_feather(
    "/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/final_records_omop_220531.feather"
)

In [None]:
# Keep only rows whose vocabulary is "OMOP", coerce `date` to Date, restrict
# to the closed window 1903-03-04 .. 2022-07-19 and sort chronologically.
#
# The window bounds are passed as Date objects rather than strings. With
# character bounds the filter only works while data.table::between masks
# dplyr::between (data.table is attached after tidyverse in this notebook);
# dplyr::between either turns the strings into NA or refuses to combine
# <date> with <character>, depending on the dplyr version.
records_omop = records %>%
    filter(vocabulary == "OMOP") %>%
    mutate(date = as.Date(date)) %>%
    filter(between(date, as.Date("1903-03-04"), as.Date("2022-07-19"))) %>%
    arrange(date)

In [None]:
# Collapse the detailed `origin` string into a three-level source factor.
# The first matching pattern wins; rows matching none of the patterns get NA.
temp = records_omop %>%
    mutate(
        origin_simple = factor(
            case_when(
                stringr::str_detect(origin, "gp_") ~ "GP",
                stringr::str_detect(origin, "hes_") ~ "HES",
                stringr::str_detect(origin, "death_") ~ "Death"
            ),
            levels = c("GP", "HES", "Death")
        )
    )

In [None]:
# Echo the number of rows in `temp`.
dim(temp)[1L]

In [None]:
# NOTE(review): bare literal; running the cell only echoes the value.
# Presumably a scratch note about the record count — confirm or remove.
10000000

In [None]:
# Draw a random 1% subsample of `temp` for plotting; the plotting cells scale
# the binned counts by 100 to compensate. No seed is set in this cell, so the
# sampled rows differ between runs.
temp_plot = sample_n(temp, size = round(nrow(temp) / 100))

In [None]:
# Size constants for the figures in this notebook.
base_size = 8
title_size = 10
facet_size = 8.5
geom_text_size = 3

# Make a classic theme with left-aligned titles, blank facet strips, a bottom
# legend and thin axis lines/ticks the session-wide default.
theme_set(
    theme_classic(base_size = base_size) +
        theme(
            strip.background = element_blank(),
            plot.title = element_text(size = title_size, hjust = 0),
            strip.text.x = element_text(size = facet_size),
            axis.title = element_text(size = 10),
            axis.text = element_text(size = 8, color = "black"),
            legend.position = "bottom",
            axis.line = element_line(size = 0.2),
            axis.ticks = element_line(size = 0.2)
        )
)

In [None]:
library(wesanderson)

In [None]:
# Stacked area chart of record counts over time, split by record source.
options(repr.plot.width = 4, repr.plot.height = 3, repr.plot.dpi = 320)
origin = ggplot(
    # Date bounds are explicit Date objects so the filter does not depend on
    # which package's between() happens to be on top of the search path.
    temp_plot %>%
        filter(between(date, as.Date("1980-01-01"), as.Date("2022-07-19"))),
    aes(x = date, fill = origin_simple, color = origin_simple)
) +
    labs(y = "Number of Records", x = NULL) +
    # `temp_plot` is a 1% subsample, hence the factor 100 on the bin counts.
    # NOTE(review): `..count..` is deprecated since ggplot2 3.4.0 in favour of
    # after_stat(count); kept so older ggplot2 installations keep working.
    geom_area(aes(y = ..count..*100), stat = "bin", alpha = 0.7, binwidth = 200) +
    #geom_histogram(bins=200, alpha=0.3) +
    scale_color_manual(values = wes_palette("Darjeeling1", n = 3), name = "Source") +
    scale_fill_manual(values = wes_palette("Darjeeling1", n = 3), name = "Source") +
    scale_x_date(expand = c(0, 0)) +
    # Axis labels are shown in millions.
    scale_y_continuous(
        expand = c(0, 0),
        labels = scales::unit_format(unit = "M", scale = 1e-6)
    ) +
    theme(legend.position = c(0.2, 0.7))
origin

In [None]:
# Stacked area chart of record counts over time, split by OMOP domain.
options(repr.plot.width = 4, repr.plot.height = 3, repr.plot.dpi = 320)
domain = ggplot(
    temp_plot %>%
        # Date bounds are explicit Date objects so the filter does not depend
        # on which package's between() is on top of the search path.
        filter(between(date, as.Date("1980-01-01"), as.Date("2022-07-19"))) %>%
        # Fix the stacking/legend order; domain_id values outside these five
        # levels become NA.
        mutate(
            domain_id = factor(
                domain_id,
                levels = c("Condition", "Procedure", "Drug", "Observation", "Device")
            )
        ),
    aes(x = date, fill = domain_id, color = domain_id)
) +
    labs(y = "Number of Records", x = NULL) +
    # `temp_plot` is a 1% subsample, hence the factor 100 on the bin counts.
    # NOTE(review): `..count..` is deprecated since ggplot2 3.4.0 in favour of
    # after_stat(count); kept so older ggplot2 installations keep working.
    geom_area(aes(y = ..count..*100), stat = "bin", alpha = 0.7, binwidth = 200) +
    scale_color_manual(values = wes_palette("Zissou1", n = 5), name = "Domain") +
    scale_fill_manual(values = wes_palette("Zissou1", n = 5), name = "Domain") +
    scale_x_date(expand = c(0, 0)) +
    # Axis labels are shown in millions.
    scale_y_continuous(
        expand = c(0, 0),
        labels = scales::unit_format(unit = "M", scale = 1e-6)
    ) +
    theme(legend.position = c(0.2, 0.7))
domain

In [None]:
# Dimensions and resolution shared by the notebook display and the saved file.
plot_width = 8.25
plot_height = 3
plot_res = 320
options(
    repr.plot.width = plot_width,
    repr.plot.height = plot_height,
    repr.plot.dpi = plot_res
)

# Place the source panel and the domain panel side by side (patchwork `|`).
records_plot = origin | domain
records_plot

In [None]:
library(gt)
# Write the combined figure as a PDF into the relative `outputs/` directory.
plot_name = "SupplFigure1a_recordstime"

# Create the target directory first: ggsave() fails (or prompts, depending on
# the ggplot2 version) when the directory is missing. This is a no-op if it
# already exists.
dir.create("outputs", showWarnings = FALSE, recursive = TRUE)

# NOTE(review): `figure_path` is defined earlier in the notebook but the file
# is written relative to the working directory — confirm this is intended.
ggsave(
    filename = glue("outputs/{plot_name}.pdf"),
    plot = records_plot,
    device = cairo_pdf,
    width = plot_width,
    height = plot_height,
    dpi = plot_res
)