# Examine tissue-specific genes for correlation with gene expression or cell-type proportion

In [1]:
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




## Functions

In [2]:
# Load the per-region gene-level log2 TPM tables and stack them into one
# data frame.
#
# Each file is read with data.table::fread, its "names" column is moved to the
# row names, and the table is transposed so that the former column headers end
# up in an "RNum" column.
#
# Returns: a data frame with the caudate rows first, then hippocampus, then
# DLPFC.
get_tpm <- function(){
    tpm_dir <- "/ceph/projects/v4_phase3_paper/inputs/counts/text_files_counts/tpm/_m/"
    # Read one region's table and flip it so the "RNum" column identifies rows
    load_region <- function(region){
        by_name <- tibble::column_to_rownames(
            data.table::fread(paste0(tpm_dir, region, "/gene/log2tpm.csv")),
            "names")
        tibble::rownames_to_column(as.data.frame(t(by_name)), "RNum")
    }
    # Stacking order: caudate, hippocampus, DLPFC
    bind_rows(lapply(c("caudate", "hippocampus", "dlpfc"), load_region))
}
# Memoised wrapper so repeated calls reuse the first result
memTPM <- memoise::memoise(get_tpm)

# Read the merged phenotype table and keep only rows with Age > 13,
# Race equal to "AA" or "EA", and Dx equal to "CTL" or "SZ".
#
# Returns: the filtered phenotype table.
get_pheno <- function(){
    pheno_file <- "/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv"
    pheno <- data.table::fread(pheno_file)
    filter(pheno, Age > 13, Race %in% c("AA", "EA"), Dx %in% c("CTL", "SZ"))
}
# Memoised wrapper so repeated calls reuse the first result
memPHENO <- memoise::memoise(get_pheno)

# Extract tissue-specific eGenes

In [3]:
# Keep the gene-SNP pairs whose N_Regions_Shared equals 1, then drop that
# column.
eFeature <- data.table::fread("../../_m/genes/significant_geneSNP_pairs_3tissues.tsv")
eFeature <- select(filter(eFeature, N_Regions_Shared == 1), -N_Regions_Shared)
head(eFeature, 2)

gene_id,variant_id,Caudate,DLPFC,Hippocampus
<chr>,<chr>,<int>,<int>,<int>
ENSG00000012660.13,chr6:53353585:T:G,1,0,0
ENSG00000033122.18,chr1:69737254:C:T,1,0,0


## Prepare data

In [4]:
# Attach the expression columns to the filtered phenotypes, matching on RNum
df <- inner_join(memPHENO(), memTPM(), by="RNum")
dim(df)

## Linear model for expression and brain region

In [5]:
# For every region-specific eGene, fit
#   expression ~ Region*Sex + covariates
# and keep the sequential-ANOVA p-value of the Region term (Region is the first
# term in the model, so its row is not adjusted for the later terms).
covariates <- c("Dx", "Age", "mitoRate", "rRNA_rate", "overallMapRate", "RIN",
                "ERCCsumLogErr", "totalAssignedGene", "snpPC1", "snpPC2",
                "snpPC3")
rhs <- paste(c("Region*Sex", covariates), collapse=" + ")

# eFeature holds gene-SNP pairs, so a gene may appear on several rows. Fit each
# gene once: this avoids redundant model fits and keeps pval_df at one row per
# gene, so joining it by gene_id later cannot multiply rows.
genes <- unique(eFeature$gene_id)

# vapply returns a fixed-type numeric vector (no growing with c() in a loop)
pvals <- vapply(genes, function(gene_id){
    # Back-quote the response so any column name is parsed as a single symbol
    model <- as.formula(paste0("`", gene_id, "` ~ ", rhs))
    fitted <- anova(lm(model, data=df))
    fitted["Region", "Pr(>F)"]
}, numeric(1), USE.NAMES=FALSE)

pval_df <- data.frame("gene_id"=genes, "p_values"=pvals)
# Number of genes with no nominal Region effect
print(sum(pvals > 0.05))
pval_df %>% head(2)

[1] 0


Unnamed: 0_level_0,gene_id,p_values
Unnamed: 0_level_1,<chr>,<dbl>
1,ENSG00000012660.13,0
2,ENSG00000033122.18,0


## Comparison of expression

In [6]:
# Mean expression of every region-specific eGene within each brain region,
# with "HIPPO" relabelled so the region names line up with the eFeature columns
region_means <- df %>% select(Region, all_of(eFeature$gene_id)) %>%
    aggregate(. ~ Region, ., mean) %>%
    mutate(Region = gsub("HIPPO", "Hippocampus", Region)) %>%
    tibble::column_to_rownames("Region")

# One row per gene, one column per region
gene_means <- tibble::rownames_to_column(as.data.frame(t(region_means)), "gene_id")

# Region columns exist on both sides of the join, so they are suffixed:
# *_Expression (means) versus *_eQTL (eFeature indicator columns)
dt <- inner_join(gene_means, eFeature, by="gene_id",
                 suffix=c("_Expression", "_eQTL"))
dt <- inner_join(select(dt, -c("variant_id")), pval_df, by="gene_id")

# Per-row summaries over the three *_Expression columns
tt <- select(dt, ends_with("Expression"))
region_labels <- gsub("_Expression", "", colnames(tt))
# NOTE: the "Mean Expresion" spelling is kept because later cells look the
# column up under exactly this name.
dt <- mutate(dt,
             "Max Expression"=region_labels[apply(tt, 1, which.max)],
             "Min Expression"=region_labels[apply(tt, 1, which.min)],
             "Mean Expresion"=rowMeans(tt),
             "Ratio (DLPFC / Caudate)" = DLPFC_Expression/Caudate_Expression,
             "Ratio (Hippocampus / Caudate)" = Hippocampus_Expression/Caudate_Expression,
             "Ratio (Hippocampus / DLPFC)" = Hippocampus_Expression/DLPFC_Expression)
data.table::fwrite(dt, "eQTL_regionSpecific_summary.tsv", sep='\t')
head(dt, 2)

Unnamed: 0_level_0,gene_id,Caudate_Expression,DLPFC_Expression,Hippocampus_Expression,Caudate_eQTL,DLPFC_eQTL,Hippocampus_eQTL,p_values,Max Expression,Min Expression,Mean Expresion,Ratio (DLPFC / Caudate),Ratio (Hippocampus / Caudate),Ratio (Hippocampus / DLPFC)
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,ENSG00000012660.13,6.499194,4.850471,4.085723,1,0,0,0,Caudate,Hippocampus,5.145129,0.7463188,0.6286508,0.8423354
2,ENSG00000033122.18,6.660252,4.930057,4.567698,1,0,0,0,Caudate,Hippocampus,5.386002,0.7402209,0.6858146,0.9265


In [7]:
# Rows where the DLPFC / caudate expression ratio exceeds 0.9
sum(dt[["Ratio (DLPFC / Caudate)"]] > 0.9)

In [8]:
# Rows where the hippocampus / caudate expression ratio exceeds 0.9
sum(dt[["Ratio (Hippocampus / Caudate)"]] > 0.9)

In [9]:
# Rows whose DLPFC / caudate or hippocampus / caudate expression ratio exceeds
# 0.9. A single combined mask is used so that a row passing both thresholds is
# counted once rather than twice, which keeps the fraction below a true
# proportion of rows.
nochange = sum(dt$`Ratio (DLPFC / Caudate)` > 0.9 |
               dt$`Ratio (Hippocampus / Caudate)` > 0.9)
print(nochange)
# Fraction relative to the number of region-specific gene-SNP pairs
nochange / nrow(eFeature)

[1] 1


In [10]:
# Rows where the hippocampus / DLPFC expression ratio exceeds 0.9
sum(dt[["Ratio (Hippocampus / DLPFC)"]] > 0.9)

In [11]:
## Low expression genes
# Rows whose mean expression across the three regions is below 1: first the
# count, then the count relative to the number of eFeature rows
low_expr <- dt[["Mean Expresion"]] < 1
sum(low_expr)
sum(low_expr) / nrow(eFeature)

In [12]:
# Caudate eQTL rows for which caudate is also the region of highest expression
caudate_top <- dt[["Caudate_eQTL"]] == 1 & dt[["Max Expression"]] == "Caudate"
# ...of which at least one other region has a ratio to caudate below 0.9
lower_elsewhere <- dt[["Ratio (DLPFC / Caudate)"]] < 0.9 |
    dt[["Ratio (Hippocampus / Caudate)"]] < 0.9
sum(caudate_top)
sum(caudate_top & lower_elsewhere)
sum(caudate_top & lower_elsewhere) / nrow(eFeature)

In [13]:
# Rows where the eQTL region is also the region of highest mean expression
with(dt, sum(DLPFC_eQTL == 1 & `Max Expression` == "DLPFC"))
with(dt, sum(Hippocampus_eQTL == 1 & `Max Expression` == "Hippocampus"))

In [14]:
# Rows where the eQTL region is the region of lowest mean expression
with(dt, sum(Caudate_eQTL == 1 & `Min Expression` == "Caudate"))
with(dt, sum(DLPFC_eQTL == 1 & `Min Expression` == "DLPFC"))
with(dt, sum(Hippocampus_eQTL == 1 & `Min Expression` == "Hippocampus"))

In [15]:
# Number of region-specific gene-SNP pairs flagged for each region
with(eFeature, sum(Caudate == 1))
with(eFeature, sum(DLPFC == 1))
with(eFeature, sum(Hippocampus == 1))

### Summary
- All region-specific eGenes are specific to the caudate, and the caudate is also the region with the highest expression of these genes.

## Reproducibility information

In [16]:
# Record when the notebook was run and how long the R session has been running
Sys.time()
proc.time()
# Widen the console before printing so the session table below wraps later
options(width=120)
# Print the R version, platform details, and package versions
sessioninfo::session_info()

[1] "2021-08-15 21:48:20 EDT"

   user  system elapsed 
 37.458  11.976  47.673 

─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
 setting  value                       
 version  R version 4.0.3 (2020-10-10)
 os       Arch Linux                  
 system   x86_64, linux-gnu           
 ui       X11                         
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       America/New_York            
 date     2021-08-15                  

─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
 package     * version date       lib source        
 assertthat    0.2.1   2019-03-21 [1] CRAN (R 4.0.2)
 base64enc     0.1-3   2015-07-28 [1] CRAN (R 4.0.2)
 cachem        1.0.5   2021-05-15 [1] CRAN (R 4.0.3)
 cli           3.0.0   2021-06-30 [1] CRAN (R 4.0.3)
 crayon        1.4.1   2021-02-08 [1] CRAN (R 4.0.3)
 data.table    1.14.0  2021-02-21 [1] CRAN (R 4.0.3)
