# Quality control comparison between LIBD and public RNA-seq data
- CMC
- GTEx brain regions

In [None]:
library(tidyverse)
library(synapser)
library(ggpubr)

## Load metrics

### Load Lieber RNA-seq metrics

In [None]:
# Read the merged LIBD phenotype table, drop every column whose name starts
# with "snp" together with the two antipsychotic columns, and rename the
# alignment, rRNA, region and sample-identifier columns.
libd <- "/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv" %>%
    data.table::fread() %>%
    select(-starts_with("snp"), -all_of(c("antipsychotics", "lifetime_antipsych"))) %>%
    rename(
        Percent_Aligned = overallMapRate,
        rRNA_Rate = rRNA_rate,
        Brain_Region = Region,
        SampleID = RNum
    )
# Preview the first two rows.
head(libd, 2)

In [None]:
# Keep a row when it is not a HIPPO sample, or when its protocol is
# RiboZeroHMR; i.e. drop HIPPO samples prepared with any other protocol.
# Rows for which the test evaluates to NA are dropped by filter().
# NOTE(review): `libd_filter` is not referenced again in the visible part of
# this notebook (later cells start from `libd`) - confirm that is intended.
libd_filter <- filter(libd, Brain_Region != "HIPPO" | Protocol == "RiboZeroHMR")
head(libd_filter, 2)

### Load CMC RNA-seq metrics

In [None]:
# Process functions
#' Fetch one version of a Synapse entity and read it as a table
#'
#' @param id Synapse entity ID, passed to `synGet()`.
#' @param version Entity version, passed to `synGet()` as `version`.
#' @return The file found at the fetched entity's `$path`, parsed with
#'   `data.table::fread()` and returned as a plain data.frame
#'   (`data.table = FALSE`).
downloadFile_version <- function(id , version){
  entity <- synGet(id, version = version)
  # Use the reserved word FALSE: `F` is an ordinary variable that can be
  # reassigned elsewhere in the session.
  data.table::fread(entity$path, data.table = FALSE)
}
# Log in to Synapse before requesting any entity.
synLogin()

# Synapse IDs of the clinical metadata and the DLPFC RNA-seq QC metadata.
CLINICAL_ID <- 'syn3354385'
METADATA_QC_DLPFC_ID <- 'syn18358379'

# Download pinned versions of both tables (clinical: 4, RNA-seq metadata: 3).
clinical <- downloadFile_version(CLINICAL_ID, version = 4)
metadata <- downloadFile_version(METADATA_QC_DLPFC_ID, version = 3)

# Right join: keep every RNA-seq metadata row and attach the clinical fields
# of the matching individual. Then relabel the diagnosis levels: "BP" and
# "AFF" are merged into "AFF_BP", "undetermined" becomes "Other", and
# "Control" / "SCZ" keep their names.
md <- clinical %>%
    right_join(metadata, by = c("Individual ID" = "Individual_ID")) %>%
    mutate(Dx = fct_recode(Dx, AFF_BP = "BP", AFF_BP = "AFF", Other = "undetermined",
                           Control = "Control", SCZ = "SCZ"))

# Keep only the phenotype and QC columns listed below, then replace the
# spaces in column names with underscores.
md <- select(
    md,
    all_of(c("SampleID", "Dx", "Reported Gender", "Sex", "Ethnicity", "pH", "Age of Death",
             "Study", "Brain_Region", "Brodmann_Area", "RIN", "rRNA_Rate", "Percent_Aligned"))
)
names(md) <- gsub(' ', '_', names(md))
print(dim(md))
head(md, 2)

### Load GTEx metrics

In [None]:
# Read the GTEx v8 sample-attribute table from its public URL, keep the six
# columns of interest under the notebook's column names (SMTS keeps its
# original name), and retain only rows whose SMTS value is "Brain".
gtex <- paste0("https://storage.googleapis.com/gtex_analysis_v8/annotations/",
               "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt") %>%
    data.table::fread() %>%
    select(
        SampleID = SAMPID,
        RIN = SMRIN,
        SMTS,
        Brain_Region = SMTSD,
        Percent_Aligned = SMMAPRT,
        rRNA_Rate = SMRRNART
    ) %>%
    filter(SMTS == "Brain")
head(gtex)

## Clean and organize data

In [None]:
# Harmonise the GTEx table with the other datasets:
#   * tag every row with Dataset = "GTEx" and Dx = "Control";
#   * clean Brain_Region step by step: drop everything from the first "(",
#     remove the "Brain - " prefix, rename "Frontal Cortex" to "DLPFC", and
#     strip a single trailing space left by the earlier steps;
#   * keep only the columns shared by the three datasets.
# Brain_Region is referenced as a pipeline column (not `gtex$Brain_Region`),
# so the clean-up always operates on the rows currently flowing through.
gtex <- gtex %>%
    mutate(
        Dataset = "GTEx",
        Dx = "Control",
        Brain_Region = str_replace(Brain_Region, "\\(.*", ""),
        Brain_Region = gsub("Brain - ", "", Brain_Region),
        Brain_Region = str_replace(Brain_Region, "Frontal Cortex", "DLPFC"),
        Brain_Region = gsub(" $", "", Brain_Region)
    ) %>%
    select("SampleID", "Dx", "Brain_Region", "RIN", "rRNA_Rate", "Percent_Aligned", "Dataset")
gtex %>% head(2)

In [None]:
# Reduce the CMC metadata to the columns shared across datasets and tag
# every row with Dataset = "CMC".
cmc <- mutate(
    select(md, all_of(c("SampleID", "Dx", "Brain_Region", "RIN", "rRNA_Rate", "Percent_Aligned"))),
    Dataset = "CMC"
)
head(cmc, 2)

In [None]:
# Reduce the LIBD table to the columns shared across datasets, tag it with
# Dataset = "LIBD", and relabel values: "CTL" -> "Control" in Dx and
# "HIPPO" -> "Hippocampus" in Brain_Region.
# Dx and Brain_Region are referenced as pipeline columns (not `libd$...`), so
# the relabelling always applies to the rows currently flowing through.
# NOTE(review): this starts from `libd`, not `libd_filter` - confirm that the
# unfiltered table is the intended input.
libd <- libd %>%
    select("SampleID", "Dx", "Brain_Region", "RIN", "rRNA_Rate", "Percent_Aligned") %>%
    mutate(
        Dataset = "LIBD",
        Dx = gsub("CTL", "Control", Dx),
        Brain_Region = gsub("HIPPO", "Hippocampus", Brain_Region)
    )
libd %>% head(2)

In [None]:
# Stack the LIBD, GTEx and CMC tables row-wise into one long table.
# Converting character columns to factors is left disabled:
#   %>% mutate_if(is.character, as.factor)
df <- bind_rows(list(libd, gtex, cmc))
head(df, 2)

In [None]:
# List the distinct brain-region labels present after combining the datasets.
unique(df$Brain_Region)

## Descriptive analysis

### Check if NA present

#### RIN

In [None]:
# Count present (`FALSE`) and missing (`TRUE`) RIN values per dataset and
# brain region, then compute the missing fraction.
# Both counts are computed directly, so the `TRUE` column exists even when no
# value is missing, and a group in which every value is missing yields 1
# rather than NA.
# Note: despite the "(%na)" label the value is a fraction between 0 and 1.
rin <- df %>%
    group_by(Dataset, Brain_Region) %>%
    summarise(`FALSE` = sum(!is.na(RIN)), `TRUE` = sum(is.na(RIN)), .groups = "keep") %>%
    mutate(`RIN (%na)` = `TRUE` / (`FALSE` + `TRUE`))
rin %>% head(2)

#### rRNA Rate

In [None]:
# Count present (`FALSE`) and missing (`TRUE`) rRNA_Rate values per dataset
# and brain region, then compute the missing fraction.
# Both counts are computed directly, so the `TRUE` column exists even when no
# value is missing, and a group in which every value is missing yields 1
# rather than NA.
# Note: despite the "(%na)" label the value is a fraction between 0 and 1.
rrna <- df %>%
    group_by(Dataset, Brain_Region) %>%
    summarise(`FALSE` = sum(!is.na(rRNA_Rate)), `TRUE` = sum(is.na(rRNA_Rate)),
              .groups = "keep") %>%
    mutate(`rRNA Rate (%na)` = `TRUE` / (`FALSE` + `TRUE`))
rrna %>% head(2)

#### Percent Alignment

In [None]:
# Count present (`FALSE`) and missing (`TRUE`) Percent_Aligned values per
# dataset and brain region, then compute the missing fraction.
# Both counts are computed directly, so the `TRUE` column exists even when no
# value is missing, and a group in which every value is missing yields 1
# rather than NA.
# Note: despite the "(%na)" label the value is a fraction between 0 and 1.
palign <- df %>%
    group_by(Dataset, Brain_Region) %>%
    summarise(`FALSE` = sum(!is.na(Percent_Aligned)), `TRUE` = sum(is.na(Percent_Aligned)),
              .groups = "keep") %>%
    mutate(`Percent Aligned (%na)` = `TRUE` / (`FALSE` + `TRUE`))
palign %>% head(2)

#### Combined summary

In [None]:
# Drop the raw `FALSE`/`TRUE` count columns from each of the three missingness
# tables, then inner-join them on dataset and brain region so each row holds
# the three missing-value fractions. The result is written to CSV and shown.
tbl <- Reduce(
    function(left, right) inner_join(left, right, by = c("Dataset", "Brain_Region")),
    lapply(list(rin, rrna, palign), function(tab) select(tab, -c(`FALSE`, `TRUE`)))
)
data.table::fwrite(as.data.frame(tbl), "qc_metric_NAs_libd_gtex_cmc.csv")
tbl

### Summarize data

In [None]:
# Mean and standard deviation of each QC metric per dataset and brain region,
# ignoring missing values. The result is written to CSV and shown.
tbl <- summarise(
    group_by(df, Dataset, Brain_Region),
    RIN_mean = mean(RIN, na.rm = TRUE),
    RIN_sd = sd(RIN, na.rm = TRUE),
    rRNA_mean = mean(rRNA_Rate, na.rm = TRUE),
    rRNA_sd = sd(rRNA_Rate, na.rm = TRUE),
    PercentAligned_mean = mean(Percent_Aligned, na.rm = TRUE),
    PercentAligned_sd = sd(Percent_Aligned, na.rm = TRUE)
)
data.table::fwrite(as.data.frame(tbl), "qc_metric_summary_libd_gtex_cmc.csv")
tbl

## Plot metrics

In [None]:
#' Save one plot as PDF, PNG and SVG
#'
#' @param fn Output path without extension; '.pdf', '.png' and '.svg' are
#'   appended in that order.
#' @param p Plot object handed to `ggsave()` as `plot`.
#' @param w Width handed to `ggsave()`.
#' @param h Height handed to `ggsave()`.
#' @return `NULL`, invisibly; called for its side effect of writing files.
save_ggplots <- function(fn, p, w, h){
    write_one <- function(ext) ggsave(paste0(fn, ext), plot=p, width=w, height=h)
    lapply(c('.pdf', '.png', '.svg'), write_one)
    invisible(NULL)
}

In [None]:
# Box plot of RIN per brain region, coloured by dataset, with the individual
# samples overlaid as semi-transparent jittered points.
# The seed is set first, presumably so the jitter is reproducible.
set.seed(20210723)
bxp <- ggboxplot(
    df,
    x="Brain_Region", y="RIN",
    xlab="Brain Region", ylab='RIN',
    color="Dataset", palette="npg",
    add='jitter', add.params=list(alpha=0.5),
    outlier.shape=NA,
    legend="bottom",
    panel.labs.font=list(face='bold', size = 14),
    ggtheme=theme_pubr(base_size=20)
) +
    font("xy.title", face="bold") +
    font("legend.title", face="bold") +
    rotate_x_text(45)
# Write the figure as PDF/PNG/SVG (12 x 8) and display it.
save_ggplots("boxplot_brain_region_comparison_rin", bxp, 12, 8)
bxp

In [None]:
# Box plot of rRNA rate per brain region, coloured by dataset, with the
# individual samples overlaid as semi-transparent jittered points.
# The seed is set first, presumably so the jitter is reproducible.
set.seed(20210723)
bxp <- ggboxplot(
    df,
    x="Brain_Region", y="rRNA_Rate",
    xlab="Brain Region", ylab='rRNA Rate',
    color="Dataset", palette="npg",
    add='jitter', add.params=list(alpha=0.5),
    outlier.shape=NA,
    legend="bottom",
    panel.labs.font=list(face='bold', size = 14),
    ggtheme=theme_pubr(base_size=20)
) +
    font("xy.title", face="bold") +
    font("legend.title", face="bold") +
    rotate_x_text(45)
# Write the figure as PDF/PNG/SVG (12 x 8) and display it.
save_ggplots("boxplot_brain_region_comparison_rRNA_rate", bxp, 12, 8)
bxp

In [None]:
# Box plot of percent aligned per brain region, coloured by dataset, with the
# individual samples overlaid as semi-transparent jittered points.
# The seed is set first, presumably so the jitter is reproducible.
set.seed(20210723)
bxp <- ggboxplot(
    df,
    x="Brain_Region", y="Percent_Aligned",
    xlab="Brain Region", ylab='Percent Aligned',
    color="Dataset", palette="npg",
    add='jitter', add.params=list(alpha=0.5),
    outlier.shape=NA,
    legend="bottom",
    panel.labs.font=list(face='bold', size = 14),
    ggtheme=theme_pubr(base_size=20)
) +
    font("xy.title", face="bold") +
    font("legend.title", face="bold") +
    rotate_x_text(45)
# Write the figure as PDF/PNG/SVG (12 x 8) and display it.
save_ggplots("boxplot_brain_region_comparison_Percent_Aligned", bxp, 12, 8)
bxp

## Session Info

In [None]:
# Show the current timestamp and the time used so far by this R process.
Sys.time()
proc.time()
# Widen printed output to 120 columns before listing the session details.
options(width = 120)
# Report the session details via the sessioninfo package.
sessioninfo::session_info()