# Examine tissue specific genes for correlation with gene expression or cell type proportion

In [None]:
library(dplyr)

## Functions

In [None]:
# Load the gene-level log2 TPM tables for caudate, DLPFC and hippocampus.
#
# Each CSV is read, its "names" column is moved to the row names, and the
# table is transposed so that the former column headers end up in a new
# "RNum" column (presumably one row per sample -- confirm against the files).
# The three tables are row-bound in the order caudate, hippocampus, dlpfc.
#
# Returns: a single data.frame with an "RNum" column plus one column per
# entry of the original "names" column.
get_tpm <- function(){
    tpm_dir <- "/ceph/projects/v4_phase3_paper/inputs/counts/text_files_counts/tpm/_m"
    load_region <- function(region){
        tpm <- data.table::fread(file.path(tpm_dir, region, "gene", "log2tpm.csv"))
        tpm <- tibble::column_to_rownames(tpm, "names")
        tpm <- as.data.frame(t(tpm))
        tibble::rownames_to_column(tpm, "RNum")
    }
    bind_rows(lapply(c("caudate", "hippocampus", "dlpfc"), load_region))
}
# Cached wrapper: repeated calls reuse the first result
memTPM <- memoise::memoise(get_tpm)

# Read the merged phenotype table and keep only rows with Age above 13,
# Race of "AA" or "EA", and Dx of "CTL" or "SZ".
#
# Returns: the filtered phenotype table.
get_pheno <- function(){
    pheno_file <- "/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv"
    pheno <- data.table::fread(pheno_file)
    pheno %>%
        filter(Age > 13) %>%
        filter(Race %in% c("AA", "EA")) %>%
        filter(Dx %in% c("CTL", "SZ"))
}
# Cached wrapper: repeated calls reuse the first result
memPHENO <- memoise::memoise(get_pheno)

# Extract tissue specific eGenes

In [None]:
# Keep only gene-SNP pairs flagged as shared by exactly one region
# (N_Regions_Shared == 1), then drop that now-constant column.
eFeature <- data.table::fread("../../_m/genes/significant_geneSNP_pairs_3tissues.tsv")
eFeature <- eFeature %>%
    filter(N_Regions_Shared == 1) %>%
    select(-N_Regions_Shared)
head(eFeature, 2)

## Prepare data

In [None]:
# Attach expression values to the filtered phenotypes; only RNum values
# present in both tables are kept.
df <- inner_join(memPHENO(), memTPM(), by="RNum")
dim(df)

## Linear model for expression and brain region

In [None]:
# For every region-specific eGene, fit
#   expression ~ Region*Sex + Dx + Age + technical covariates + snpPC1-3
# and record the p-value of the Region term from the ANOVA table.
#
# NOTE(review): anova() on an lm fit reports sequential tests, and Region is
# the first term, so its F-test is not adjusted for the later covariates --
# confirm this is the intended test.
rhs = paste("Region*Sex", "Dx + Age + mitoRate + rRNA_rate",
            "overallMapRate + RIN + ERCCsumLogErr + totalAssignedGene + snpPC1",
            "snpPC2 + snpPC3", sep=" + ")

# Each gene is fitted once: a gene_id repeated in eFeature would otherwise be
# refitted and produce duplicate rows in pval_df, which multiply rows when
# pval_df is later joined by gene_id.
genes = unique(eFeature$gene_id)

# vapply returns a numeric vector of known length instead of growing one
# with c() on every iteration.
pvals = vapply(genes, function(gene_id){
    # Backquote the response so an ID that is not a syntactic R name is
    # still read as a single column rather than parsed as an expression.
    model = as.formula(paste0("`", gene_id, "` ~ ", rhs))
    anova_tbl = anova(lm(model, data=df))
    anova_tbl["Region", "Pr(>F)"]
}, numeric(1), USE.NAMES=FALSE)

pval_df = data.frame("gene_id"=genes, "p_values"=pvals)
# Number of genes whose Region term is not significant at 0.05
print(sum(pvals > 0.05))
pval_df %>% head(2)

## Comparison of expression

In [None]:
# Mean of every region-specific eGene within each Region (one row per Region)
dt <- df %>% select(Region, all_of(eFeature$gene_id))
dt <- aggregate(. ~ Region, dt, mean)
dt$Region <- gsub("HIPPO", "Hippocampus", dt$Region)

# Transpose to one row per gene, then attach the eQTL flags and the Region
# p-values. Columns that clash between the expression means and eFeature are
# disambiguated with the "_Expression" / "_eQTL" suffixes.
dt <- dt %>%
    tibble::column_to_rownames("Region") %>%
    t() %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_id") %>%
    inner_join(eFeature, by="gene_id", suffix=c("_Expression", "_eQTL")) %>%
    select(-c("variant_id")) %>%
    inner_join(pval_df, by="gene_id")

# Per-region mean expression columns only
tt <- select(dt, ends_with("Expression"))

# Region with the highest / lowest mean, the mean across regions, and the
# pairwise ratios of regional means.
# The "Mean Expresion" spelling is kept as-is: it is the column name written
# to the output file and read by later cells.
# NOTE(review): the expression values appear to come from log2tpm.csv files,
# so these are ratios of log-scale values -- confirm that is intended.
dt <- mutate(dt,
             "Max Expression"=gsub("_Expression", "", colnames(tt))[apply(tt, 1, which.max)],
             "Min Expression"=gsub("_Expression", "", colnames(tt))[apply(tt, 1, which.min)],
             "Mean Expresion"=rowMeans(tt),
             "Ratio (DLPFC / Caudate)" = DLPFC_Expression/Caudate_Expression,
             "Ratio (Hippocampus / Caudate)" = Hippocampus_Expression/Caudate_Expression,
             "Ratio (Hippocampus / DLPFC)" = Hippocampus_Expression/DLPFC_Expression)
data.table::fwrite(dt, "eQTL_regionSpecific_summary.tsv", sep='\t')
head(dt, 2)

In [None]:
# Rows whose DLPFC / Caudate ratio of mean expression exceeds 0.9
sum(dt[["Ratio (DLPFC / Caudate)"]] > 0.9)

In [None]:
# Rows whose Hippocampus / Caudate ratio of mean expression exceeds 0.9
sum(dt[["Ratio (Hippocampus / Caudate)"]] > 0.9)

In [None]:
# Total of the two caudate-relative counts (ratio > 0.9), also expressed as a
# fraction of the number of rows in eFeature.
# NOTE(review): a gene above 0.9 for both ratios contributes twice to this
# total, so the fraction can exceed 1 -- confirm this is intended.
nochange <- sum(dt[["Ratio (DLPFC / Caudate)"]] > 0.9,
                dt[["Ratio (Hippocampus / Caudate)"]] > 0.9)
print(nochange)
nochange / nrow(eFeature)

In [None]:
# Rows whose Hippocampus / DLPFC ratio of mean expression exceeds 0.9
sum(dt[["Ratio (Hippocampus / DLPFC)"]] > 0.9)

In [None]:
## Low expression genes
# Count of rows with across-region mean expression below 1, then the same
# count as a fraction of the number of rows in eFeature
with(dt, sum(`Mean Expresion` < 1))
with(dt, sum(`Mean Expresion` < 1)) / nrow(eFeature)

In [None]:
# Caudate eQTL genes whose highest regional mean is in the caudate
with(dt, sum(Caudate_eQTL == 1 & `Max Expression` == "Caudate"))
# ... restricted to those where at least one other region falls below 0.9 of
# the caudate mean
with(dt, sum(Caudate_eQTL == 1 & `Max Expression` == "Caudate" &
             (`Ratio (DLPFC / Caudate)` < 0.9 | `Ratio (Hippocampus / Caudate)` < 0.9)))
# ... and that count as a fraction of the number of rows in eFeature
with(dt, sum(Caudate_eQTL == 1 & `Max Expression` == "Caudate" &
             (`Ratio (DLPFC / Caudate)` < 0.9 | `Ratio (Hippocampus / Caudate)` < 0.9))) / nrow(eFeature)

In [None]:
# eQTL genes of a region whose highest regional mean is in that same region
with(dt, sum(DLPFC_eQTL == 1 & `Max Expression` == "DLPFC"))
with(dt, sum(Hippocampus_eQTL == 1 & `Max Expression` == "Hippocampus"))

In [None]:
# eQTL genes of a region whose lowest regional mean is in that same region
with(dt, sum(Caudate_eQTL == 1 & `Min Expression` == "Caudate"))
with(dt, sum(DLPFC_eQTL == 1 & `Min Expression` == "DLPFC"))
with(dt, sum(Hippocampus_eQTL == 1 & `Min Expression` == "Hippocampus"))

In [None]:
# Number of eFeature rows flagged (== 1) for each region
sum(eFeature[["Caudate"]] == 1)
sum(eFeature[["DLPFC"]] == 1)
sum(eFeature[["Hippocampus"]] == 1)

### Summary
- All region-specific eGenes are caudate-specific, and caudate has the highest expression!

## Reproducibility information

In [None]:
# Time of the run and the session's CPU / elapsed time so far
Sys.time()
proc.time()
# Widen printed output to 120 characters before the session report
options(width = 120)
# R version, platform and loaded package versions
sessioninfo::session_info()