In [1]:
##################
### libraries ####
##################

suppressMessages({library(data.table)
                  library(MatrixEQTL)
                  library(reticulate)
                  library(dplyr)})

In [2]:
## Parquet files are read through Python: bind reticulate to the system
## interpreter and keep a handle on the pandas module for read_parquet().
reticulate::use_python("/usr/bin/python")
pandas <- reticulate::import("pandas")

## Read a parquet file through pandas (fastparquet engine) and return a tibble.
##   path    : file path; `~` is expanded and the path is normalised first.
##   columns : optional vector of column names to load; NULL loads everything.
## The row names of the converted data frame are moved into a leading
## column called 'Name'.
read_parquet <- function(path, columns = NULL) {
    resolved <- normalizePath(path.expand(path))
    # pandas expects a list of column names (or None), not an R vector
    col_arg <- if (is.null(columns)) NULL else as.list(columns)
    frame <- pandas$read_parquet(resolved, engine='fastparquet', columns=col_arg)
    tibble::as_tibble(tibble::rownames_to_column(frame, var='Name'))
}

## RPKM helper modelled on recount::getRPKM (https://github.com/leekgroup/recount).
## recount's version takes a summarized-experiment object, e.g.
##   jRp10m = recount::getRPKM(rse_jxn, 'Length')
## whereas this one works on a plain count table.
##   counts     : features x samples table of counts.
##   length_var : one length (in bases) per feature, in the row order of `counts`.
## Returns counts scaled per kilobase of feature length and per million of
## each sample's column total.
getRPKM <- function(counts, length_var) {
    n_feat <- nrow(counts)
    n_samp <- ncol(counts)
    # per-sample totals, repeated down every row
    lib_size <- matrix(colSums(counts), nrow = n_feat, ncol = n_samp, byrow = TRUE)
    # per-feature lengths, repeated across every column
    feat_len <- matrix(length_var, nrow = n_feat, ncol = n_samp, byrow = FALSE)
    per_kb <- counts / (feat_len / 1000)
    per_kb / (lib_size / 1e6)
}

In [3]:
######################
# statistical model ##
######################

## Load the GTEx v8 caudate covariate table. The first column holds the
## covariate IDs; after transposing, every row is one sample and every
## column one covariate.
cov_file = paste0("/ceph/projects/v3_phase3_paper/inputs/gtex_v8/covariates/",
                  "_m/GTEx_Analysis_v8_eQTL_covariates/",
                  "Brain_Caudate_basal_ganglia.v8.covariates.txt")
cov <- fread(cov_file, data.table=FALSE)
rownames(cov) <- cov[, 1]
cov <- cov[, -1]
cov <- as.data.frame(t(cov))

## Samples excluded from the analysis (the reason is not recorded in this file)
dropList = c('GTEX-11UD1', 'GTEX-12ZZW', 'GTEX-139TS', 'GTEX-13PLJ', 'GTEX-13PVQ',
             'GTEX-1477Z', 'GTEX-17MF6', 'GTEX-1EH9U', 'GTEX-1F75B', 'GTEX-1RNSC',
             'GTEX-QDT8', 'GTEX-X261', 'GTEX-ZV68')
cov = cov[!(rownames(cov) %in% dropList), ]

## Design matrix of the known covariates (intercept is dropped further down)
mod = model.matrix(~pcr + platform + sex + PC1 + PC2 + PC3 + PC4 + PC5, data=cov) 

#######################
#### extract PCA ######
#######################

## Anchored regex: the previous pattern "InferredCov*" was a glob written as a
## regex (it matched any name containing "InferredCo"). drop = FALSE keeps a
## data frame with its column name even when only one column matches.
PCs = cov[, grep("^InferredCov", colnames(cov)), drop = FALSE]

##################
### Covariates ###
##################

## MatrixEQTL expects covariates in rows and samples in columns
covs = t(cbind(mod[,-1],PCs))
save(covs, file="covariates.rda")

covs = SlicedData$new(covs)

In [4]:
##################
### load pheno ###
##################

## Sample-level phenotype table; later cells read its SAMPID and SUBJID columns.
pheno_file <- file.path("/ceph/projects/v3_phase3_paper/inputs",
                        "gtex_v8", "phenotypes", "_m",
                        "gtex_v8_sample_data.txt")
pd <- fread(file = pheno_file, data.table = FALSE)

In [5]:
######################
### load expression ##
######################

## Input locations: the gene, transcript and junction tables share one
## directory; the exon read table is a separate parquet file.
basename <- "/ceph/projects/v3_phase3_paper/inputs/gtex_v8/expression"
gene_file <- file.path(basename, "_m", "genes_gtex_v8_counts.txt.gz")
tx_file <- file.path(basename, "_m", "transcripts_gtex_v8_tpm.txt.gz")
jxn_file <- file.path(basename, "_m", "junctions_gtex_v8_count.txt.gz")
exon_file <- "/ceph/tmp/tmp_gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads.parquet"

## Read each table as a plain data frame (the exon table goes through pandas)
rse_gene <- fread(file = gene_file, data.table = FALSE)
rse_jxn <- fread(file = jxn_file, data.table = FALSE)
rse_tx <- fread(file = tx_file, data.table = FALSE)
rse_exon <- read_parquet(exon_file)

## The first two columns of every table are annotation, not samples;
## keep a copy before the tables are subset to sample columns.
gene_annot <- rse_gene[, c(1, 2)]
exon_annot <- rse_exon[, c(1, 2)]
jxn_annot <- rse_jxn[, c(1, 2)]
tx_annot <- rse_tx[, c(1, 2)]

## Per feature type, in order:
##   1. keep the ID column plus the samples listed in pd$SAMPID,
##   2. drop rows whose ID is duplicated (first occurrence wins),
##   3. move the ID column into the row names (clearing any row names
##      left over from the row subsetting first),
##   4. keep features whose mean across samples exceeds the threshold
##      (thresholds follow BrainSeq phase 2: 0.2 gene/exon, 0.4 junction/tx).

## genes
rse_gene <- rse_gene[, colnames(rse_gene) %in% c('Name', pd$SAMPID)]
rse_gene <- rse_gene[!duplicated(rse_gene$Name), ]
if (tibble::has_rownames(rse_gene)) {
    rse_gene <- tibble::remove_rownames(rse_gene)
}
rse_gene <- tibble::column_to_rownames(rse_gene, var = "Name")
rse_gene <- rse_gene[rowMeans(x = rse_gene, na.rm = TRUE) > 0.2, ]

## exons
rse_exon <- rse_exon[, colnames(rse_exon) %in% c('Name', pd$SAMPID)]
rse_exon <- rse_exon[!duplicated(rse_exon$Name), ]
if (tibble::has_rownames(rse_exon)) {
    rse_exon <- tibble::remove_rownames(rse_exon)
}
rse_exon <- tibble::column_to_rownames(rse_exon, var = "Name")
rse_exon <- rse_exon[rowMeans(x = rse_exon, na.rm = TRUE) > 0.2, ]

## junctions
rse_jxn <- rse_jxn[, colnames(rse_jxn) %in% c('Name', pd$SAMPID)]
rse_jxn <- rse_jxn[!duplicated(rse_jxn$Name), ]
if (tibble::has_rownames(rse_jxn)) {
    rse_jxn <- tibble::remove_rownames(rse_jxn)
}
rse_jxn <- tibble::column_to_rownames(rse_jxn, var = "Name")
rse_jxn <- rse_jxn[rowMeans(x = rse_jxn, na.rm = TRUE) > 0.4, ]

## transcripts (ID column is 'transcript_id' rather than 'Name')
rse_tx <- rse_tx[, colnames(rse_tx) %in% c('transcript_id', pd$SAMPID)]
rse_tx <- rse_tx[!duplicated(rse_tx$transcript_id), ]
if (tibble::has_rownames(rse_tx)) {
    rse_tx <- tibble::remove_rownames(rse_tx)
}
rse_tx <- tibble::column_to_rownames(rse_tx, var = "transcript_id")
rse_tx <- rse_tx[rowMeans(x = rse_tx, na.rm = TRUE) > 0.4, ]

## Feature Length
gene_len = fread(paste0('/ceph/projects/v3_phase3_paper/inputs/gtex_v8/',
                        'annotation/feature_lengths/_m/gene_annotation.txt'), 
                 data.table=FALSE)
exon_len = fread(paste0("/ceph/projects/v3_phase3_paper/inputs/gtex_v8/", 
                        "annotation/feature_lengths/_m/exon_annotation.txt"), 
                 data.table=FALSE)

## Align the annotation to the count tables row by row. getRPKM() pairs
## lengths with features purely by position, so filtering with %in% is not
## enough: the annotation keeps its own order (and any duplicated IDs),
## which would attach lengths to the wrong features or recycle them.
gene_idx = match(rownames(rse_gene), gene_len$gene_id)
if (anyNA(gene_idx)) {
    stop(sum(is.na(gene_idx)), " genes in rse_gene have no entry in the gene ",
         "length annotation", call. = FALSE)
}
gene_len = gene_len[gene_idx, ]

exon_idx = match(rownames(rse_exon), exon_len$exon_id)
if (anyNA(exon_idx)) {
    stop(sum(is.na(exon_idx)), " exons in rse_exon have no entry in the exon ",
         "length annotation", call. = FALSE)
}
exon_len = exon_len[exon_idx, ]

## extract rpkms
geneRpkm = getRPKM(rse_gene, gene_len$length)
exonRpkm = getRPKM(rse_exon, exon_len$length)
## junctions have no length of their own: a fixed length of 100 is used
jxnRp10m = getRPKM(rse_jxn, rep(100, nrow(rse_jxn)))
txTpm = rse_tx #transcripts are in TPM

## For each feature type: log2-transform with a pseudocount of 1, then
## relabel the columns from sample ID (pd$SAMPID) to subject ID (pd$SUBJID).

## genes
geneExpression <- log2(1 + geneRpkm)
colnames(geneExpression) <- pd$SUBJID[match(colnames(geneExpression), pd$SAMPID)]

## exons
exonExpression <- log2(1 + exonRpkm)
colnames(exonExpression) <- pd$SUBJID[match(colnames(exonExpression), pd$SAMPID)]

## junctions
jxnExpression <- log2(1 + jxnRp10m)
colnames(jxnExpression) <- pd$SUBJID[match(colnames(jxnExpression), pd$SAMPID)]

## transcripts
txExpression <- log2(1 + txTpm)
colnames(txExpression) <- pd$SUBJID[match(colnames(txExpression), pd$SAMPID)]

## Match sample ids with covs
## MatrixEQTL pairs samples across genotype, expression and covariates by
## column position, so the expression tables must have exactly the columns
## of covs, in the same order. Filtering with %in% keeps the original column
## order, so select by name instead, which both subsets and reorders.
align_to_covs <- function(expr, label) {
    ids <- colnames(covs)
    absent <- setdiff(ids, colnames(expr))
    if (length(absent) > 0) {
        stop(label, ": ", length(absent),
             " covariate samples have no expression column", call. = FALSE)
    }
    expr[, ids, drop = FALSE]
}
geneExpression = align_to_covs(geneExpression, "geneExpression")
exonExpression = align_to_covs(exonExpression, "exonExpression")
jxnExpression = align_to_covs(jxnExpression, "jxnExpression")
txExpression = align_to_covs(txExpression, "txExpression")

## Check colnames match: fail loudly instead of just printing TRUE/FALSE
stopifnot(identical(colnames(geneExpression), colnames(covs)),
          identical(colnames(exonExpression), colnames(covs)),
          identical(colnames(jxnExpression), colnames(covs)),
          identical(colnames(txExpression), colnames(covs)))

## save expression
save(geneExpression, exonExpression, jxnExpression, txExpression, file="expression.rda")
#load("expression.rda")

In [6]:
######################
### snp data ####
######################

## Transposed genotype table: one row per SNP, the first six columns are
## annotation and the remaining columns are samples.
snp_file <- paste0("/ceph/projects/v3_phase3_paper/inputs/gtex_v8/",
                   "genotypes_chr11/subset_caudate/a_transpose/",
                   "_m/gtex_v8_chr11.traw")
snp <- fread(snp_file, data.table=FALSE)
rownames(snp) <- snp$SNP
## Keep only the part of each column name before the first underscore
colnames(snp) <- gsub('_.*', '', colnames(snp))

snpMap <- snp[,c("SNP", "CHR", "POS")]
snp_annot <- snp[, 1:6]
snp <- snp[, -1:-6]

## Match samples
## MatrixEQTL pairs samples by column position, so the genotype columns must
## be exactly the columns of covs, in the same order. Selecting by name both
## subsets and reorders; the check is enforced rather than just printed.
sample_ids = colnames(covs)
missing_ids = setdiff(sample_ids, colnames(snp))
if (length(missing_ids) > 0) {
    stop(length(missing_ids), " covariate samples have no genotype column",
         call. = FALSE)
}
snp = snp[, sample_ids, drop = FALSE]
stopifnot(identical(colnames(snp), colnames(covs)))

######################
# create SNP objects #
######################

theSnps = SlicedData$new(as.matrix(snp))
theSnps$ResliceCombined(sliceSize = 50000)

## SNP positions with the column names used for the snpspos argument
snpspos = snpMap
colnames(snpspos) = c("name","chr","pos")

In [7]:
## Re-read the unfiltered input tables: the expression cell above overwrote
## rse_gene / rse_exon / rse_jxn / rse_tx with subset and filtered versions.
basename <- "/ceph/projects/v3_phase3_paper/inputs/gtex_v8/expression"
gene_file <- file.path(basename, "_m", "genes_gtex_v8_counts.txt.gz")
tx_file <- file.path(basename, "_m", "transcripts_gtex_v8_tpm.txt.gz")
jxn_file <- file.path(basename, "_m", "junctions_gtex_v8_count.txt.gz")
exon_file <- "/ceph/tmp/tmp_gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads.parquet"

## read files
rse_gene <- fread(file = gene_file, data.table = FALSE)
rse_jxn <- fread(file = jxn_file, data.table = FALSE)
rse_tx <- fread(file = tx_file, data.table = FALSE)
rse_exon <- read_parquet(exon_file)

## Annotation = the first two columns of each table
gene_annot <- rse_gene[, c(1, 2)]
exon_annot <- rse_exon[, c(1, 2)]
jxn_annot <- rse_jxn[, c(1, 2)]
tx_annot <- rse_tx[, c(1, 2)]

In [8]:
###################
### Subset DRD2 ###
###################

## Annotation rows belonging to DRD2. The gene and exon tables are matched on
## the gene symbol, the junction and transcript tables on the Ensembl gene ID.
drd2_gene = subset(gene_annot, Description=='DRD2')
drd2_exon = subset(exon_annot, Description=='DRD2')
drd2_jxn = subset(jxn_annot, Description=='ENSG00000149295.13')
drd2_tx = subset(tx_annot, gene_id=='ENSG00000149295.13')

##### DRD2 region
## Exact match on the gene symbol (grepl would also pick up any other gene
## whose name merely contains "DRD2"), looked up once. The window bounds
## must be scalars, so exactly one annotation row is required.
drd2_len = gene_len[gene_len$gene_name %in% 'DRD2', ]
stopifnot(nrow(drd2_len) == 1)
## 500 kb either side of the gene, by coordinate (strand is not considered)
drd2_upstream = drd2_len$start - .5e6
drd2_downstream = drd2_len$end + .5e6

In [9]:
##########################
### feature annotation ###
##########################

## Position tables (id, chr, start, end) for the chr11 features of each
## type, with the chromosome recoded from "chr11" to the integer 11.

##### gene level
posGene <- gene_len[gene_len$chr == 'chr11', c('gene_id', 'chr', 'start', 'end')]
posGene[["chr"]] <- as.integer(gsub("chr", "", posGene[["chr"]]))

##### transcript level
tx_len <- fread(paste0('/ceph/projects/v3_phase3_paper/inputs/gtex_v8/',
                       'annotation/feature_lengths/_m/transcript_annotation.txt'),
                data.table = FALSE)
posTx <- tx_len[tx_len$chr == 'chr11', c('transcript_id', 'chr', 'start', 'end')]
posTx[["chr"]] <- as.integer(gsub("chr", "", posTx[["chr"]]))

##### exon level
posExon <- exon_len[exon_len$chr == 'chr11', c('exon_id', 'chr', 'start', 'end')]
posExon[["chr"]] <- as.integer(gsub("chr", "", posExon[["chr"]]))

##### junction level
## Junction IDs are split on "_" into chr / start / end
jxnPos <- setNames(
    as.data.frame(matrix(unlist(strsplit(rownames(jxnExpression), "_")),
                         ncol = 3, byrow = TRUE)),
    c("chr", "start", "end"))
jxnPos$name <- rownames(jxnExpression)
jxnPos$start <- as.integer(as.character(jxnPos$start))
jxnPos$end <- as.integer(as.character(jxnPos$end))
## chr11 junctions only, with the ID column moved to the front
posJxn <- jxnPos[jxnPos$chr == 'chr11', c("name", "chr", "start", "end")]
posJxn <- posJxn[!(posJxn$chr %in% c('chrX', 'chrY', 'chrM')), ]
posJxn$chr <- as.integer(as.character(gsub("chr", "", posJxn$chr)))

In [10]:
#############################
### sliced expression data ##
#############################
## Wrap each expression table in a SlicedData object and re-slice it into
## blocks of 5000 rows.
geneSlice <- SlicedData$new(as.matrix(geneExpression))
geneSlice$ResliceCombined(sliceSize = 5000)

exonSlice <- SlicedData$new(as.matrix(exonExpression))
exonSlice$ResliceCombined(sliceSize = 5000)

jxnSlice <- SlicedData$new(as.matrix(jxnExpression))
jxnSlice$ResliceCombined(sliceSize = 5000)

txSlice <- SlicedData$new(as.matrix(txExpression))
txSlice$ResliceCombined(sliceSize = 5000)

In [11]:
##########################
### Run EQTLs ############
##########################
print("Begin eQTL analysis")

## Create the results directory up front, so a problem with it surfaces
## before the long-running analyses rather than after them.
if (!dir.exists("eqtl_tables")) {
    dir.create("eqtl_tables")
}

## Run one cis-only MatrixEQTL scan (linear model, 500 kb window) against
## the shared genotypes (theSnps), covariates (covs) and SNP positions.
##   expr_slice  : SlicedData object with the expression of one feature type
##   feature_pos : position table (id, chr, start, end) for those features
##   out_file    : file that receives the cis associations
## pvOutputThreshold.cis = 1 writes every cis pair; pvOutputThreshold = 0
## switches the trans analysis off.
run_cis_eqtl <- function(expr_slice, feature_pos, out_file) {
    Matrix_eQTL_main(snps = theSnps, gene = expr_slice,
        cvrt = covs, output_file_name.cis = out_file,
        pvOutputThreshold.cis = 1, pvOutputThreshold = 0,
        snpspos = snpspos, genepos = feature_pos,
        useModel = modelLINEAR, cisDist = 5e5,
        pvalue.hist = 100, min.pv.by.genesnp = TRUE)
}

meGene = run_cis_eqtl(geneSlice, posGene, "cis_eqtls_genes.ctxt")
meExon = run_cis_eqtl(exonSlice, posExon, "cis_eqtl_exons.ctxt")
meJxn = run_cis_eqtl(jxnSlice, posJxn, "cis_eqtl_junctions.ctxt")
## NOTE(review): "trasncripts" is misspelled; the name is kept unchanged here
## because anything reading this file would have to change with it.
meTx = run_cis_eqtl(txSlice, posTx, "cis_eqtl_trasncripts.ctxt")

save(meGene, meExon, meJxn, meTx,
     file="eqtl_tables/matrixEqtl_output_gtex_caudate_4features.rda")

[1] "Begin eQTL analysis"


Matching data files and location files

1 of 37987 genes matched

3128903 of 3128903 SNPs matched


Task finished in 1.52 seconds

Reordering genes

Task finished in 3.672 seconds

Processing covariates

Task finished in 0.025 seconds

Processing gene expression data (imputation, residualization)

Task finished in 0.435 seconds

Creating output file(s)

Task finished in 0.039 seconds

Performing eQTL analysis

82.73% done, 36 cis-eQTLs

84.32% done, 95 cis-eQTLs

85.91% done, 95 cis-eQTLs

87.50% done, 95 cis-eQTLs

89.08% done, 95 cis-eQTLs

90.67% done, 95 cis-eQTLs

92.26% done, 95 cis-eQTLs

93.84% done, 95 cis-eQTLs

95.43% done, 95 cis-eQTLs

97.02% done, 95 cis-eQTLs

98.61% done, 95 cis-eQTLs

Task finished in 257.908 seconds



Matching data files and location files

10 of 245418 genes matched

3128903 of 3128903 SNPs matched


Task finished in 1.151 seconds

Reordering genes

Task finished in 7.105 seconds

Processing covariates

Task finished in 0.008 seconds

Processing gen