-
Notifications
You must be signed in to change notification settings - Fork 1
/
02_coloc_pheWAS.R
314 lines (245 loc) · 12 KB
/
02_coloc_pheWAS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
rm(list=ls())
## get the arguments from the command line
args <- commandArgs(trailingOnly=T)
##options
options(stringsAsFactors = F)
## --> import parameters <-- ##
#olink = name of the Olink protein
#chr.s = chromosome of the cis-region
#pos.s = start position of the cis-region
#pos.e = end position of the cis-region
olink <- args[1]
chr.s <- as.numeric(args[2])
pos.s <- as.numeric(args[3])
pos.e <- as.numeric(args[4])
#------------------------------------------------#
##-- import protein GWAS data --##
#------------------------------------------------#
## read the relevant data
res.olink <- paste0("zcat ~/insert_path/",
olink,"/", olink,"GWAS.txt.gz",
" | awk -v chr=", chr.s, " -v low=", pos.s, " -v upp=", pos.e,
" '{if(($2 == chr && $3 >= low && $3 <= upp) || NR==1) print $0}' -")
## import
res.olink <- data.table::fread(cmd = res.olink, sep = "\t", header = T, data.table = F)
## create MAF variable
res.olink$MAF <- ifelse(res.olink$Freq1 > .5, 1 - res.olink$Freq1, res.olink$Freq1)
## create new MarkerName
res.olink$snp.id <- apply(res.olink, 1, function(x){
paste0("chr", as.numeric(x[2]), ":", as.numeric(x[3]), "_", paste(toupper(sort(x[4:5])), collapse = "_"))
})
## define break criteria here (min p-value!)
p.min <- min(as.numeric(res.olink$Pvalue), na.rm = T)
#-----------------------------------------#
##-- only do if suggestive evidence --##
#-----------------------------------------#
if(p.min < 1e-6){
#-------------------------------------------------------------#
##-- import the indivudal level genotype data --##
#-------------------------------------------------------------#
## write file to obtain SNP dosages
write.table(res.olink$MarkerName, paste("tmpdir/tmp", olink, chr.s, pos.s, pos.e, "lst", sep="."), row.names = F, col.names = F, quote = F)
## import function to do so
source("scripts/import_snp_dosages.R")
## import
ld <- get.ld.info(olink, chr.s, pos.s, pos.e)
## ease downstream coding
snp.dat <- ld[[1]]
snp.info <- ld[[2]]
## delete and clean
rm(ld); gc(); gc()
## delete file no longer needed
system(paste("rm tmpdir/tmp", olink, chr.s, pos.s, pos.e, "dosage", sep="."))
## add rsid to Olink file
res.olink <- merge(res.olink, snp.info[, c("snp.id", "rsid")])
#-----------------------------------------#
##-- define top SNP and get proxies --##
#-----------------------------------------#
## identify top SNP by Z-score
top.snp <- res.olink$snp.id[which.max(abs(res.olink$Effect/res.olink$StdErr))]
## get proxies (use snp.id)
proxy.snps <- cor(snp.dat[, snp.info$id[which(snp.info$snp.id == top.snp)]], snp.dat[, snp.info$id])^2
proxy.snps <- data.frame(id=colnames(proxy.snps), R2=t(proxy.snps))
proxy.snps <- subset(proxy.snps, R2 >= .8)
## ease merging
proxy.snps <- merge(proxy.snps, snp.info)
## ensure overlap with summary stats
gc()
#-----------------------------------------#
##-- perform PheWAS for all proxies --##
#-----------------------------------------#
## load package to for API to https://gwas.mrcieu.ac.uk/
require(ieugwasr)
## query everything (omit FinnGen and BBJ as sample sizes are missing)
tmp.phewas <- phewas(variants=proxy.snps$rsid, pval=1e-6, batch=c("ebi-a", "ieu-a", "ieu-b", "met-a", "ubm-a", "ukb-b"))
## drop phenotypes with missing information on sample size
tmp.phewas$n <- as.numeric(tmp.phewas$n)
tmp.phewas <- subset(tmp.phewas, !is.na(n))
## define break criteria here as well
if(nrow(tmp.phewas)>0){
## to ease some downstream analysis
require(data.table)
## take the strongest association for each phenotype
tmp <- as.data.table(tmp.phewas[order(tmp.phewas$trait, abs(tmp.phewas$beta/tmp.phewas$se), decreasing = T),])
tmp[, ind:=1:.N, by="trait"]
tmp <- subset(tmp, ind == 1)
tmp$ind <- NULL
## test whether any remaining phenotypes may exist
if(nrow(tmp)>0){
tmp2 <- subset(tmp.phewas, rsid %in% proxy.snps$rsid & !(trait %in% tmp$trait))
}else{
tmp2 <- subset(tmp.phewas, rsid %in% proxy.snps$rsid)
}
## do iterative subsetting to keep unqie entries only
if(nrow(tmp2)>1){
## take the strongest association for each phenotype
tmp2 <- as.data.table(tmp2[order(tmp2$trait, abs(tmp2$beta/tmp2$se), decreasing = T),])
tmp2[, ind:=1:.N, by="trait"]
tmp2 <- subset(tmp2, ind == 1)
tmp2$ind <- NULL
}
## combine both
tmp <- rbind(tmp, tmp2)
## add SNP information
tmp <- merge(proxy.snps[, c("rsid", "id", "snp.id", "R2")], tmp, by="rsid", suffixes = c(".snp", ".ieu"))
## rename and add lead SNP
tmp$snp.id.lead <- top.snp
tmp$rsid.lead <- snp.info$rsid[which(snp.info$snp.id == top.snp)]
## add protein stats
phewas.results <- merge(res.olink, tmp, by=c("snp.id", "chr"), suffix=c(".pQTL", ".trait"))
#-----------------------------------------#
##-- run coloc for all results --##
#-----------------------------------------#
require(coloc)
res.coloc <- lapply(1:nrow(phewas.results), function(x){
## obtain summary stats and trait info
reg <- paste0(chr.s, ":", pos.s,"-", pos.e)
res.trait <- associations(reg, phewas.results$id.ieu[x])
## just to make sure everything will work fine
res.trait <- subset(res.trait, !is.na(beta))
## make unique, since some data processing error
res.trait <- unique(res.trait)
## get meta information (careful some might miss those)
tr.info <- tryCatch(
{
gwasinfo(phewas.results$id.ieu[x])
}, error=function(e){
return(NA)
})
## merge (this might ignore INDELS)
res.all <- merge(res.olink, res.trait, by="rsid")
## remove non-biallelelic variants
ii <- table(res.all$rsid)
res.all <- subset(res.all, rsid %in% names(ii[ii==1]))
## account for possible INDELs
res.all[, c("ea", "nea")] <- t(apply(res.all[, c("ea", "nea")], 1, function(x){
if(nchar(x[1]) > 1 | nchar(x[2]) > 1){
## replace
if(nchar(x[1]) > nchar(x[2])){
return(c("I", "D"))
}else{
return(c("D", "I"))
}
}else{
return(x)
}
}))
## align effect estimates
res.all$beta.aligned <- ifelse(toupper(res.all$Allele1) == res.all$ea, res.all$beta, -res.all$beta)
#-----------------------------------------#
##-- sanity check --##
#-----------------------------------------#
## top signal for Olink in overlap
it <- res.all$snp.id[which.max(abs(res.all$Effect/res.all$StdErr))]
## get the top SNP for the outcome in overlap only
io <- res.all$snp.id[which.max(abs(res.all$beta/res.all$se))]
## keep names
isnps <- sapply(c(top.snp, it, io), function(x) snp.info$id[which(snp.info$snp.id == x)])
## LD for check
it <- cor(snp.dat[, isnps[1]], snp.dat[, isnps[2]])^2
## LD within
is <- cor(snp.dat[, isnps[2]], snp.dat[, isnps[3]])^2
## LD top
io <- cor(snp.dat[, isnps[1]], snp.dat[, isnps[3]])^2
#-----------------------------------------#
##-- run coloc --##
#-----------------------------------------#
## add to ease mapping of LD matrix
res.all <- merge(res.all, snp.info[, c("snp.id", "id")], by="snp.id", suffix=c(".trait", ".snp"))
## order by position
res.all <- res.all[order(res.all$pos),]
## prepare input
D1 <- list(beta=res.all$Effect, varbeta=res.all$StdErr^2,
type="quant",
N=max(res.all$TotalSampleSize, na.rm=T),
sdY=1,
MAF=res.all$MAF,
snp=res.all$id.snp,
position=1:nrow(res.all))
## try out
if(!("ncase" %in% names(tr.info))){
print("use quantitative")
D2 <- list(beta=res.all$beta.aligned, varbeta=res.all$se^2,
type="quant",
# N=tr.info$sample_size,
N=max(res.all$n, na.rm=T),
sdY=1,
MAF=res.all$MAF,
snp=res.all$id.snp,
position=1:nrow(res.all))
}else{
## binary outcome
D2 <- list(beta=res.all$beta.aligned, varbeta=res.all$se^2,
type="cc",
s=tr.info$ncase/(tr.info$ncontrol+tr.info$ncase),
N=tr.info$sample_size,
MAF=res.all$MAF,
snp=res.all$id.snp,
position=1:nrow(res.all))
}
## do naive coloc as well
naive.coloc <- coloc.signals(D1, D2, method="single", p12=5e-6)
## add checks to the data
naive.coloc$summary$ld.preserved.overlap <- it
naive.coloc$summary$ld.check.top.overlap <- io
naive.coloc$summary$ld.check.top.general <- is
## add the trait id
naive.coloc$summary$id.ieu <- phewas.results$id.ieu[x]
#-----------------------------------------#
##-- draw selected --##
#-----------------------------------------#
if(naive.coloc$summary$PP.H4.abf > .7 | naive.coloc$summary$ld.check.top.overlap > .8){
source("scripts/plot_coloc_results.R")
png(paste0("graphics/", olink, ".", phewas.results$id.ieu[x], ".", chr.s, ".", pos.s, ".", pos.e, ".png"), width=16, height=8, units="cm", res=150)
par(mar=c(1.5,1.5,1,.5), mgp=c(.6,0,0), cex.axis=.5, cex.lab=.5, tck=.01, cex.main=.6, font.main=2)
## more complex layout for gene assignment
layout(matrix(c(1,1,1,2,3,4),3,2), heights = c(.43,.37,.2))
plot.regional.coloc(res.all, naive.coloc$summary, snp.dat, phewas.results$id.ieu[x], olink, snp.info)
dev.off()
}
## write results to file
return(naive.coloc$summary)
})
res.coloc <- do.call(rbind, res.coloc)
#-----------------------------------------#
##-- combine and store the data --##
#-----------------------------------------#
## combine both
phewas.results <- merge(phewas.results, res.coloc[, c("id.ieu", "nsnps", "PP.H0.abf", "PP.H1.abf", "PP.H2.abf", "PP.H3.abf", "PP.H4.abf",
"ld.preserved.overlap", "ld.check.top.overlap", "ld.check.top.general")])
## add phenotype (be precise in region tested to account for proteins encoded by multiple genes)
phewas.results$pheno <- paste(olink, chr.s, pos.s, pos.e, sep=".")
## write to file
write.table(phewas.results, paste("output/results.3k.phewas", olink, chr.s, pos.s, pos.e, "txt", sep="."), sep="\t", row.names=F)
}else{
cat("found no evidence for", olink, "on chr", chr.s, "between", pos.s, "and", pos.e, "\n")
## write to file
write.table(data.frame(pheno=paste(olink, chr.s, pos.s, pos.e, sep="."), id.ieu=NA),
paste("output/results.3k.phewas", olink, chr.s, pos.s, pos.e, "txt", sep="."), sep="\t", row.names=F)
}
}else{
cat("found no evidence for", olink, "on chr", chr.s, "between", pos.s, "and", pos.e, "\n")
## write to file
write.table(data.frame(pheno=paste(olink, chr.s, pos.s, pos.e, sep="."), id.ieu=NA),
paste("output/results.3k.phewas", olink, chr.s, pos.s, pos.e, "txt", sep="."), sep="\t", row.names=F)
}