In [1]:
####

### libraries
library(data.table)
library(SummarizedExperiment)
library(jaffelab)
library(MatrixEQTL)
library(sva)

Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which, which.max, which.min

Loading required package: S4Vectors

Atta

In [2]:
# Read a tab-separated genotype table into a matrix.
#
# The first column is read as character and every other column as double;
# the column named "snp" supplies the row names of the result.
#
# Args:
#   filename: path to a tab-separated file with a header row.
#
# Returns:
#   A matrix built from the non-"snp" columns, with the "snp" values as
#   row names and the remaining header fields as column names.
read_snps <- function(filename)
{
    # Read the header only, to learn how many value columns follow the
    # identifier column.
    header <- fread(filename, sep = "\t", nrows = 0)
    n_value_cols <- ncol(header) - 1

    # "character" is the valid class name for the identifier column; the
    # previous "str" is not an R class, so fread could only warn and fall
    # back to character.
    snp0 <- fread(filename, sep = "\t",
                  colClasses = c("character", rep("double", n_value_cols)))
    r <- as.matrix(snp0, rownames = "snp")

    # Release the intermediate data.table before returning.
    rm(snp0)
    gc()
    return(r)
}

# Location of the genotype table, then load it into `snp` via read_snps().
filename <- "/ceph/projects/v3_phase3_paper/inputs/genotypes/to_brnum/merge/to_plink/mds/to_rnum_dlpfc_hippo/_m/rnum_snps.tsv"

snp <- read_snps(filename)


In [3]:
# Show the first rows of the genotype matrix as a quick sanity check.
head(snp)

Unnamed: 0,R13243,R12300,R13244,R3050,R13245,R12301,R13246,R12302,R13247,R3670,⋯,R12295,R12296,R3585,R13241,R12297,R12298,R12873,R4179,R13242,R12299
GA018352,0,0,1,1,,,0,0,0,0,⋯,0,1,0,0,0,0,,,0,0
rs3748592,0,0,1,1,,,0,0,0,0,⋯,0,1,0,0,0,0,,,0,0
rs2340582,0,0,1,1,,,0,0,0,0,⋯,0,1,0,0,0,0,,,0,0
rs4246503,0,0,1,1,,,0,0,0,0,⋯,0,1,0,0,0,0,,,0,0
rs4970376:885699:A:G,0,0,1,1,,,0,0,0,0,⋯,0,1,0,0,0,0,,,0,0
rs4970375:886006:T:C,0,0,1,1,,,0,0,0,0,⋯,0,1,0,0,0,0,,,0,0


In [4]:
## Helper functions for loading and combining data

# Load the gene-level object from an .rda file and trim its metadata.
#
# Args:
#   filename: path to an .rda file that contains an object named `rse_gene`.
#
# Returns:
#   The `rse_gene` object with colData reduced to BrNum, RNum, Region, Dx,
#   Age, Sex and Race, and with the `meanExprs` rowData column removed.
#
# Raises:
#   An error if the file does not contain an object named `rse_gene`.
load_gene <- function(filename)
{
    # Load into a private environment so that a file lacking `rse_gene`
    # fails loudly instead of resolving to a same-named object that already
    # exists in the workspace.
    env <- new.env()
    load(filename, envir = env)
    if (!exists("rse_gene", envir = env, inherits = FALSE)) {
        stop("'", filename, "' does not contain an object named 'rse_gene'",
             call. = FALSE)
    }
    rse <- get("rse_gene", envir = env, inherits = FALSE)

    keep_cols <- c('BrNum', 'RNum', 'Region', 'Dx', 'Age', 'Sex', 'Race')
    colData(rse) <- colData(rse)[, keep_cols]
    rowData(rse)$meanExprs <- NULL
    return(rse)
}


# Load the exon-level object from an .rda file and trim its metadata.
#
# Args:
#   filename: path to an .rda file that contains an object named `rse_exon`.
#
# Returns:
#   The `rse_exon` object with colData reduced to BrNum, RNum, Region, Dx,
#   Age, Sex and Race, and with the `meanExprs` rowData column removed.
#
# Raises:
#   An error if the file does not contain an object named `rse_exon`.
load_exon <- function(filename)
{
    # Load into a private environment so that a file lacking `rse_exon`
    # fails loudly instead of resolving to a same-named object that already
    # exists in the workspace.
    env <- new.env()
    load(filename, envir = env)
    if (!exists("rse_exon", envir = env, inherits = FALSE)) {
        stop("'", filename, "' does not contain an object named 'rse_exon'",
             call. = FALSE)
    }
    rse <- get("rse_exon", envir = env, inherits = FALSE)

    keep_cols <- c('BrNum', 'RNum', 'Region', 'Dx', 'Age', 'Sex', 'Race')
    colData(rse) <- colData(rse)[, keep_cols]
    rowData(rse)$meanExprs <- NULL
    return(rse)
}


# Load the transcript-level object from an .rda file and trim its metadata.
#
# Args:
#   filename: path to an .rda file that contains an object named `rse_tx`.
#
# Returns:
#   The `rse_tx` object with colData reduced to BrNum, RNum, Region, Dx,
#   Age, Sex and Race, and with the `meanExprs` rowData column removed.
#
# Raises:
#   An error if the file does not contain an object named `rse_tx`.
load_tx <- function(filename)
{
    # Load into a private environment so that a file lacking `rse_tx`
    # fails loudly instead of resolving to a same-named object that already
    # exists in the workspace.
    env <- new.env()
    load(filename, envir = env)
    if (!exists("rse_tx", envir = env, inherits = FALSE)) {
        stop("'", filename, "' does not contain an object named 'rse_tx'",
             call. = FALSE)
    }
    rse <- get("rse_tx", envir = env, inherits = FALSE)

    keep_cols <- c('BrNum', 'RNum', 'Region', 'Dx', 'Age', 'Sex', 'Race')
    colData(rse) <- colData(rse)[, keep_cols]
    rowData(rse)$meanExprs <- NULL
    return(rse)
}


# Load the junction-level object from an .rda file and trim its metadata.
#
# Args:
#   filename: path to an .rda file that contains an object named `rse_jxn`.
#
# Returns:
#   The `rse_jxn` object with colData reduced to BrNum, RNum, Region, Dx,
#   Age, Sex and Race, and with the `meanExprs` rowData column removed.
#
# Raises:
#   An error if the file does not contain an object named `rse_jxn`.
load_jxn <- function(filename)
{
    # Load into a private environment so that a file lacking `rse_jxn`
    # fails loudly instead of resolving to a same-named object that already
    # exists in the workspace.
    env <- new.env()
    load(filename, envir = env)
    if (!exists("rse_jxn", envir = env, inherits = FALSE)) {
        stop("'", filename, "' does not contain an object named 'rse_jxn'",
             call. = FALSE)
    }
    rse <- get("rse_jxn", envir = env, inherits = FALSE)

    keep_cols <- c('BrNum', 'RNum', 'Region', 'Dx', 'Age', 'Sex', 'Race')
    colData(rse) <- colData(rse)[, keep_cols]
    rowData(rse)$meanExprs <- NULL
    return(rse)
}


# Combine two SummarizedExperiment objects column-wise over their shared rows.
#
# Args:
#   x, y: objects supporting rownames(), rowRanges(), colData() and assays().
#
# Returns:
#   A SummarizedExperiment whose rows are the row names present in both
#   inputs (in the order they appear in `x`, row ranges taken from `x`),
#   whose columns are those of `x` followed by those of `y`, and which holds
#   every assay whose name occurs in both inputs.
common_rse <- function(x, y)
{
    common_rows <- intersect(rownames(x), rownames(y))

    new_row_ranges <- rowRanges(x)[common_rows,]
    new_colData <- rbind(colData(x), colData(y))

    common_assays <- intersect(names(assays(x)), names(assays(y)))

    # drop = FALSE keeps each subset two-dimensional even when only one row
    # is shared; otherwise it collapses to a vector and cbind() would build
    # a (samples x 2) matrix instead of a (1 x samples) one.
    merged <- lapply(common_assays, function(nn) {
        cbind(assays(x)[[nn]][common_rows, , drop = FALSE],
              assays(y)[[nn]][common_rows, , drop = FALSE])
    })
    names(merged) <- common_assays
    new_assays <- SimpleList(merged)

    SummarizedExperiment(rowRanges = new_row_ranges,
                         colData = new_colData,
                         assays = new_assays)
}

In [5]:
# Merge the DLPFC and hippocampus gene-level objects over their shared rows,
# then print a summary of the result.
rse_gene <- common_rse(
    load_gene("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/dlpfc_ribozero_brainseq_phase2_hg38_rseGene_merged_n453.rda"),
    load_gene("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/hippo_brainseq_phase2_hg38_rseGene_merged_n447.rda")
)
rse_gene

class: RangedSummarizedExperiment 
dim: 58037 917 
metadata(0):
assays(1): counts
rownames(58037): ENSG00000223972.5 ENSG00000227232.5 ...
  ENSG00000210195.2 ENSG00000210196.2
rowData names(9): Length gencodeID ... NumTx gencodeTx
colnames(917): R12864 R12865 ... R6578 R6579
colData names(7): BrNum RNum ... Sex Race

In [6]:
# Merge the DLPFC and hippocampus exon-level objects over their shared rows,
# then print a summary of the result.
rse_exon <- common_rse(
    load_exon("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/dlpfc_ribozero_brainseq_phase2_hg38_rseExon_merged_n453.rda"),
    load_exon("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/hippo_brainseq_phase2_hg38_rseExon_merged_n447.rda")
)
rse_exon

class: RangedSummarizedExperiment 
dim: 571623 917 
metadata(0):
assays(1): counts
rownames(571623): e1 e2 ... e1182764 e1182765
rowData names(9): Length gencodeID ... NumTx gencodeTx
colnames(917): R12864 R12865 ... R6578 R6579
colData names(7): BrNum RNum ... Sex Race

In [7]:
# Merge the DLPFC and hippocampus transcript-level objects over their shared
# rows, then print a summary of the result.
rse_tx <- common_rse(
    load_tx("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/dlpfc_ribozero_brainseq_phase2_hg38_rseTx_merged_n453.rda"),
    load_tx("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/hippo_brainseq_phase2_hg38_rseTx_merged_n447.rda")
)
rse_tx

class: RangedSummarizedExperiment 
dim: 198093 917 
metadata(0):
assays(1): tpm
rownames(198093): ENST00000456328.2 ENST00000450305.2 ...
  ENST00000387460.2 ENST00000387461.2
rowData names(22): source type ... protein_id ccdsid
colnames(917): R12864 R12865 ... R6578 R6579
colData names(7): BrNum RNum ... Sex Race

In [8]:
# Merge the DLPFC and hippocampus junction-level objects over their shared
# rows, then print a summary of the result.
rse_jxn <- common_rse(
    load_jxn("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/dlpfc_ribozero_brainseq_phase2_hg38_rseJxn_merged_n453.rda"),
    load_jxn("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/hippo_brainseq_phase2_hg38_rseJxn_merged_n447.rda")
)
rse_jxn

class: RangedSummarizedExperiment 
dim: 675265 917 
metadata(0):
assays(1): counts
rownames(675265): chr1:11672-12009(+) chr1:13115-183634(+) ...
  chrY:57202021-57203181(-) chrY:57202146-57203181(-)
rowData names(15): inGencode inGencodeStart ... newGeneSymbol isFusion
colnames(917): R12864 R12865 ... R6578 R6579
colData names(7): BrNum RNum ... Sex Race

In [9]:
## filter, based on brainseq phase2
## genes and exons: keep features whose mean log2(RPKM + 1) exceeds 0.2
tmp_gene_rpkm <- log2(recount::getRPKM(rse_gene, "Length") + 1)
rse_gene <- rse_gene[rowMeans(tmp_gene_rpkm) > 0.2,]
rm(tmp_gene_rpkm)

tmp_exon_rpkm <- log2(recount::getRPKM(rse_exon, "Length") + 1)
rse_exon <- rse_exon[rowMeans(tmp_exon_rpkm) > 0.2,]
rm(tmp_exon_rpkm)

## junctions: every junction is given a nominal Length of 100 before calling
## getRPKM (presumably so the value is reads per 10 million, cf. the
## "Rp10m" names -- TODO confirm); keep junctions with a mean above 0.4
rowRanges(rse_jxn)$Length <- 100
jRp10m <- recount::getRPKM(rse_jxn, 'Length')
rse_jxn <- rse_jxn[rowMeans(jRp10m) > 0.4,]

## transcripts: keep those with mean TPM above 0.4
rse_tx <- rse_tx[rowMeans(assays(rse_tx)$tpm) > 0.4,]

## The sample index below is computed from rse_gene and applied by position
## to the other three objects, so all four must hold the same samples in the
## same order. Fail here rather than silently keep the wrong samples.
stopifnot(
    identical(colnames(rse_gene), colnames(rse_exon)),
    identical(colnames(rse_gene), colnames(rse_jxn)),
    identical(colnames(rse_gene), colnames(rse_tx))
)

## keep samples older than 13 whose Dx is Schizo or Control
## (no filter on Region is applied: both brain regions are retained)
keepInd <- which((colData(rse_gene)$Age > 13) & (colData(rse_gene)$Dx %in% c('Schizo', 'Control')))
rse_gene <- rse_gene[,keepInd]
rse_exon <- rse_exon[,keepInd]
rse_jxn <- rse_jxn[,keepInd]
rse_tx <- rse_tx[,keepInd]

## extract pd and rpkms
pd <- colData(rse_gene)
geneRpkm <- recount::getRPKM(rse_gene, "Length")
exonRpkm <- recount::getRPKM(rse_exon, "Length")
jxnRp10m <- recount::getRPKM(rse_jxn, 'Length')
txTpm <- assays(rse_tx)$tpm

# save expression on the log2(x + 1) scale
geneExpression <- log2(geneRpkm+1)
exonExpression <- log2(exonRpkm+1)
jxnExpression <- log2(jxnRp10m+1)
txExpression <- log2(txTpm+1)

save(geneExpression, exonExpression, jxnExpression, txExpression, file="expression.rda")



Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang


# SNPs

In [10]:
# Read the MDS table as a plain data.frame, use its RNum column as row
# names, then remove that column.
mds <- as.data.frame(
    fread("/ceph/projects/v3_phase3_paper/inputs/genotypes/to_brnum/merge/to_plink/mds/to_rnum_dlpfc_hippo/_m/rnum_snp_mds.tsv",
          sep = "\t")
)
rownames(mds) <- mds$RNum
mds$RNum <- NULL

In [24]:
# Read the SNP map as a plain data.frame, use its rn column as row names,
# then remove that column.
snpMap <- as.data.frame(
    fread("/ceph/projects/v3_phase3_paper/inputs/genotypes/to_brnum/merge/to_plink/mds/to_rnum_dlpfc_hippo/_m/rnum_snp_map.tsv",
          sep = "\t")
)
rownames(snpMap) <- snpMap$rn
snpMap$rn <- NULL

In [25]:
# Run garbage collection and display current memory usage.
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,22676427,1211.1,47282130,2525.2,47282130,2525.2
Vcells,9476314071,72298.6,22092243809,168550.5,22079956820,168456.8


In [30]:
######################
### snp data ####
######################

## Every sample in pd needs a row in mds; indexing a data.frame by a missing
## row name would silently produce an all-NA row instead of an error.
stopifnot(all(as.character(pd$RNum) %in% rownames(mds)))

### make mds and snp dimensions equal to N
###(repeat rows or columns for BrNum replicates)
## as.character() guarantees lookup by name even if RNum were a factor
## (a factor index would be interpreted as integer codes).
mds <- mds[as.character(pd$RNum),]
#snp = snp[,pd$RNum]

## snp rows are selected below by position using an index computed on
## snpMap, so the two objects must have the same number of rows.
stopifnot(nrow(snp) == nrow(snpMap))

## drop SNPs not mapping to hg38
keepIndex <- which( (!is.na(snpMap$chr_hg38)) & (!is.na(snpMap$pos_hg38)) )
snpMap <- snpMap[keepIndex,]
snp <- snp[keepIndex, as.character(pd$RNum)]

In [33]:
# Run garbage collection and display current memory usage.
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,22687433,1211.7,47282130,2525.2,47282130,2525.2
Vcells,14845358370,113261.1,22092243809,168550.5,22079956820,168456.8


In [15]:
######################
# statistical model ##
######################

# Make Control the reference level of the diagnosis factor.
pd$Dx <- factor(pd$Dx, levels = c("Control", "Schizo"))

# Design matrix: sex, the first five columns of mds, diagnosis and region.
mod <- model.matrix(~ Sex + as.matrix(mds[,1:5]) + Dx + Region, data = pd)

# Columns 3 to 7 of the design matrix hold the five mds terms; give them the
# original mds column names.
colnames(mod)[3:7] <- colnames(mds)[1:5]

In [16]:
# Show the first rows of the design matrix to check its columns.
head(mod)

Unnamed: 0,(Intercept),SexM,snpPC1,snpPC2,snpPC3,snpPC4,snpPC5,DxSchizo,RegionDLPFC
R12864,1,0,-0.0595023,0.000751113,-0.00681555,-0.00632985,0.00357488,1,0
R12865,1,1,-0.0764475,0.00286208,-0.00427266,-0.000210235,-0.00213126,1,0
R12866,1,0,-0.0851877,0.000579168,-0.00506443,-0.000479038,-0.00271798,1,0
R12867,1,1,-0.0878766,0.00247181,-0.00236993,-0.000732675,0.000321071,1,0
R12868,1,1,-0.0580806,-0.000323263,-0.00401917,-0.0038735,-0.00698546,1,0
R12869,1,0,-0.0444091,0.0027448,-0.00763513,-0.00550436,-0.000797735,1,0


In [17]:
######################
# create SNP objects #
######################

# Wrap the genotype matrix in a MatrixEQTL SlicedData object and re-slice it
# with a slice size of 50000.
theSnps <- SlicedData$new(as.matrix(snp))
theSnps$ResliceCombined(sliceSize = 50000)

# SNP position table with the columns renamed to name / chr / pos.
snpspos <- setNames(
    snpMap[, c("SNP", "chr_hg38", "pos_hg38")],
    c("name", "chr", "pos")
)

In [23]:
# Run garbage collection and display current memory usage.
gc()

In [18]:
#######################
####### do PCA ########
#######################

# For each feature type: run PCA on the samples-by-features log2 expression
# matrix, let sva::num.sv() choose how many components to keep given the
# design matrix `mod`, and keep that many leading principal components.
#
# geneExpression / exonExpression / jxnExpression / txExpression already hold
# log2(x + 1) of the corresponding matrices, so they are reused here instead
# of recomputing the transform for every call.
#
# seq_len(k) with drop = FALSE keeps the result a matrix for any k: `1:k`
# would select columns 1 and 0 when k is 0, and a single column would
# otherwise collapse to a vector and lose its "PC1" name.

pcaGene <- prcomp(t(geneExpression))
kGene <- num.sv(geneExpression, mod)
genePCs <- pcaGene$x[, seq_len(kGene), drop = FALSE]

pcaExon <- prcomp(t(exonExpression))
kExon <- num.sv(exonExpression, mod, vfilter=50000)
exonPCs <- pcaExon$x[, seq_len(kExon), drop = FALSE]

pcaJxn <- prcomp(t(jxnExpression))
kJxn <- num.sv(jxnExpression, mod, vfilter=50000)
jxnPCs <- pcaJxn$x[, seq_len(kJxn), drop = FALSE]

pcaTx <- prcomp(t(txExpression))
kTx <- num.sv(txExpression, mod, vfilter=50000)
txPCs <- pcaTx$x[, seq_len(kTx), drop = FALSE]


# The directory may already exist when the cell is re-run; that is fine.
dir.create('pca_rdas', showWarnings = FALSE)
save(genePCs, exonPCs, jxnPCs, txPCs,
     file="pca_rdas/pcs_4features_filtered_over13.rda")

In [None]:
##################
### Covariates ###
##################

# Reload the expression PCs saved by the PCA step.
load("pca_rdas/pcs_4features_filtered_over13.rda")

# One covariate matrix per feature type: the expression PCs followed by the
# design-matrix columns without the intercept, transposed so that each row is
# a covariate and each column a sample.
covsGene0 <- t(cbind(genePCs, mod[, -1]))
covsExon0 <- t(cbind(exonPCs, mod[, -1]))
covsJxn0  <- t(cbind(jxnPCs,  mod[, -1]))
covsTx0   <- t(cbind(txPCs,   mod[, -1]))

save(covsGene0, covsExon0, covsJxn0, covsTx0, file = "covariates.rda")

# Wrap each covariate matrix for MatrixEQTL.
covsGene <- SlicedData$new(covsGene0)
covsExon <- SlicedData$new(covsExon0)
covsJxn  <- SlicedData$new(covsJxn0)
covsTx   <- SlicedData$new(covsTx0)


In [None]:
##########################
### feature annotation ###
##########################

# Each position table is built the same way: take the first three columns of
# the row ranges converted to a data frame, add the row names as a `name`
# column, and move `name` to the front.

###### gene level
posGene <- as.data.frame(rowRanges(rse_gene))[, 1:3]
posGene$name <- rownames(posGene)
posGene <- posGene[, c(4L, 1L, 2L, 3L)]

##### exon level
posExon <- as.data.frame(rowRanges(rse_exon))[, 1:3]
posExon$name <- rownames(posExon)
posExon <- posExon[, c(4L, 1L, 2L, 3L)]

##### junction level (position columns renamed to Chr / Start / End)
posJxn <- as.data.frame(rowRanges(rse_jxn))[, 1:3]
posJxn$name <- rownames(posJxn)
posJxn <- posJxn[, c(4L, 1L, 2L, 3L)]
colnames(posJxn)[2:4] <- c("Chr", "Start", "End")

##### transcript level (position columns renamed to Chr / Start / End)
posTx <- as.data.frame(rowRanges(rse_tx))[, 1:3]
posTx$name <- rownames(posTx)
posTx <- posTx[, c(4L, 1L, 2L, 3L)]
colnames(posTx)[2:4] <- c("Chr", "Start", "End")


#############################
### sliced expression data ##
# Wrap each log2 expression matrix for MatrixEQTL, then re-slice each object
# with a slice size of 5000.
geneSlice <- SlicedData$new(geneExpression)
geneSlice$ResliceCombined(sliceSize = 5000)

exonSlice <- SlicedData$new(exonExpression)
exonSlice$ResliceCombined(sliceSize = 5000)

jxnSlice <- SlicedData$new(jxnExpression)
jxnSlice$ResliceCombined(sliceSize = 5000)

txSlice <- SlicedData$new(txExpression)
txSlice$ResliceCombined(sliceSize = 5000)

In [None]:
# Run garbage collection and display current memory usage.
gc()

In [None]:
##########################
### Run EQTLs ############
##########################
print("Begin eQTL analysis")

# All four analyses use identical settings and differ only in the expression
# data, covariates, feature positions and output file, so they go through one
# wrapper around Matrix_eQTL_main().
#
# Settings: cis-only output (pvOutputThreshold = 0 disables the trans
# output), cis p-value threshold 0.01, cis window 5e5, modelLINEAR_CROSS.
# NOTE(review): per the MatrixEQTL documentation, modelLINEAR_CROSS tests the
# interaction between genotype and the LAST covariate; confirm that the last
# row of each covariate matrix is the intended interaction variable.
run_cis_eqtl <- function(expr_slices, covariates, feature_pos, out_file) {
    Matrix_eQTL_main(
        snps = theSnps,
        gene = expr_slices,
        cvrt = covariates,
        output_file_name.cis = out_file,
        pvOutputThreshold.cis = 0.01,
        pvOutputThreshold = 0,
        snpspos = snpspos,
        genepos = feature_pos,
        useModel = modelLINEAR_CROSS,
        cisDist = 5e5,
        pvalue.hist = 100,
        min.pv.by.genesnp = TRUE
    )
}

meGene <- run_cis_eqtl(geneSlice, covsGene, posGene, "cis_eqtls_genes.ctxt")

meExon <- run_cis_eqtl(exonSlice, covsExon, posExon, "cis_eqtl_exons.ctxt")

meJxn <- run_cis_eqtl(jxnSlice, covsJxn, posJxn, "cis_eqtl_junctions.ctxt")

# NOTE(review): "trasncripts" is misspelled in this output file name; it is
# kept unchanged because other code may read the file under this name.
meTx <- run_cis_eqtl(txSlice, covsTx, posTx, "cis_eqtl_trasncripts.ctxt")

# Store the four result objects together.
dir.create("eqtl_tables")
save(meGene, meExon, meJxn, meTx,
     file = "eqtl_tables/matrixEqtl_output_4features_p01.rda")


In [None]:
######################
###### annotate ######

# Reload the Matrix eQTL results saved by the previous step.
load("eqtl_tables/matrixEqtl_output_4features_p01.rda")

# Pull out the cis tables and store feature and SNP identifiers as character.
geneEqtl <- meGene$cis$eqtls
geneEqtl$gene <- as.character(geneEqtl$gene)
geneEqtl$snps <- as.character(geneEqtl$snps)

exonEqtl <- meExon$cis$eqtls
exonEqtl$gene <- as.character(exonEqtl$gene)
exonEqtl$snps <- as.character(exonEqtl$snps)

jxnEqtl <- meJxn$cis$eqtls
jxnEqtl$gene <- as.character(jxnEqtl$gene)
jxnEqtl$snps <- as.character(jxnEqtl$snps)

txEqtl <- meTx$cis$eqtls
txEqtl$gene <- as.character(txEqtl$gene)
txEqtl$snps <- as.character(txEqtl$snps)

################################
# add gene annotation info #####
################################

# For each feature type, locate every eQTL's feature among the rows of the
# matching rse object once, then copy the annotation columns from its row
# ranges using that index.

geneRanges <- rowRanges(rse_gene)
geneHit <- match(geneEqtl$gene, rownames(rse_gene))
geneEqtl$Symbol <- geneRanges$Symbol[geneHit]
geneEqtl$EnsemblGeneID <- geneRanges$ensemblID[geneHit]
geneEqtl$Type <- "Gene"
geneEqtl$Class <- "InGen"
geneEqtl <- DataFrame(geneEqtl)
# geneEqtl$gene_type = rowRanges(rse_gene)$gene_type[match(geneEqtl$gene, rownames(rse_gene))]

exonRanges <- rowRanges(rse_exon)
exonHit <- match(exonEqtl$gene, rownames(rse_exon))
exonEqtl$Symbol <- exonRanges$Symbol[exonHit]
exonEqtl$EnsemblGeneID <- exonRanges$ensemblID[exonHit]
exonEqtl$Type <- "Exon"
exonEqtl$Class <- "InGen"
exonEqtl <- DataFrame(exonEqtl)
# exonEqtl$gene_type = rowRanges(rse_exon)$gene_type[match(exonEqtl$gene, rownames(rse_exon))]

# Junctions take symbol and gene ID from the newGene* columns and carry their
# own Class value instead of the constant "InGen".
jxnRanges <- rowRanges(rse_jxn)
jxnHit <- match(jxnEqtl$gene, rownames(rse_jxn))
jxnEqtl$Symbol <- jxnRanges$newGeneSymbol[jxnHit]
jxnEqtl$EnsemblGeneID <- jxnRanges$newGeneID[jxnHit]
jxnEqtl$Type <- "Jxn"
jxnEqtl$Class <- jxnRanges$Class[jxnHit]
jxnEqtl <- DataFrame(jxnEqtl)
# jxnEqtl$gene_type = rowRanges(rse_jxn)$gene_type[match(jxnEqtl$gene, rownames(rse_jxn))]

# Transcripts: the gene ID is the part of gene_id before the first "."
txRanges <- rowRanges(rse_tx)
txHit <- match(txEqtl$gene, rownames(rse_tx))
txEqtl$Symbol <- txRanges$gene_name[txHit]
txEqtl$EnsemblGeneID <- ss(txRanges$gene_id[txHit], "\\.", 1)
txEqtl$Type <- "Tx"
txEqtl$Class <- "InGen"
txEqtl <- DataFrame(txEqtl)
# txEqtl$gene_type = rowRanges(rse_tx)$gene_type[match(txEqtl$gene, rownames(rse_tx))]


# merge: stack the four tables, then attach the transcript IDs per row (the
# gencodeTx entry of the feature; for transcript rows, the transcript itself)
allEqtl <- rbind(geneEqtl, exonEqtl, jxnEqtl, txEqtl)
txPerRow <- c(
    as.list(geneRanges$gencodeTx[geneHit]),
    as.list(exonRanges$gencodeTx[exonHit]),
    as.list(jxnRanges$gencodeTx[jxnHit]),
    as.list(txEqtl$gene)
)
allEqtl$gencodeTx <- CharacterList(txPerRow)

## add snp rs number
allEqtl$snpRsNum <- snpMap$name[match(allEqtl$snps, snpMap$SNP)]

##geneEqtl$snpRsNum = snpMap$name[match(geneEqtl$snps, snpMap$SNP)]


## save
save(allEqtl, file = "eqtl_tables/mergedEqtl_output_4features.rda", compress = TRUE)

## significance filter: FDR below 0.05, then FDR below 0.01
## (allEqtl_signif is overwritten by the second, stricter subset)
allEqtl_signif <- allEqtl[allEqtl$FDR < 0.05,]
save(allEqtl_signif, file = "eqtl_tables/mergedEqtl_output_4features_FDR05.rda", compress = TRUE)

allEqtl_signif <- allEqtl[allEqtl$FDR < 0.01,]
save(allEqtl_signif, file = "eqtl_tables/mergedEqtl_output_4features_FDR01.rda", compress = TRUE)

##save(geneEqtl, file="eqtl_tables/gene_eqtl.rda")
##geneEqtl_signif = geneEqtl[geneEqtl$FDR < 0.05,]
##save(geneEqtl_signif, file="eqtl_tables/gene_eqtl_fdr05.rda")
