/
locate_url.R
239 lines (225 loc) · 7.75 KB
/
locate_url.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#' Construct the URL to access a particular `recount3` file
#'
#' Given an organism of interest, this function constructs the URL for accessing
#' one of the output files from the `recount3` project. You can then download
#' the file using `file_retrieve()`.
#'
#' @param project A `character(1)` with the ID for a given study.
#' @param project_home A `character(1)` with the home directory for the
#' `project`. You can find these using `project_homes()`.
#' @param type A `character(1)` specifying whether you want to access the
#' metadata files (`"metadata"`, the default), gene counts (`"gene"`), exon
#' counts (`"exon"`), exon-exon junctions (`"jxn"`) or base-pair BigWig
#' coverage files (`"bw"`, one per `sample`).
#' @param organism A `character(1)` specifying which organism you want to
#' download data from. Supported options are `"human"` or `"mouse"`.
#' @param sample A `character()` vector with the sample ID(s) you want to
#' download. Required when `type = "bw"`; not used to build the URL for the
#' other types.
#' @param annotation A `character(1)` specifying which annotation you want to
#' download. Only used when `type` is either `gene` or `exon`.
#' @param jxn_format A `character(1)` specifying whether the exon-exon junction
#' files are derived from all the reads (`ALL`) or only the uniquely mapping
#' read counts (`UNIQUE`). Note that `UNIQUE` is only available for some
#' projects: GTEx and TCGA for human. Only used when `type = "jxn"`.
#' @param recount3_url A `character(1)` specifying the home URL for `recount3`
#' or a local directory where you have mirrored `recount3`. Defaults to the
#' load balancer <http://duffel.rail.bio/recount3>, but can also be
#' <https://recount-opendata.s3.amazonaws.com/recount3/release> from
#' <https://registry.opendata.aws/recount/> or SciServer datascope from
#' IDIES at JHU <https://sciserver.org/public-data/recount3/data>. You can
#' set the R option `recount3_url` (for example in your `.Rprofile`) if
#' you have a favorite mirror.
#'
#' @details When `project_home` is a collection (`"collections/<name>"`), the
#' collection's `recount_project` metadata file is retrieved with
#' `file_retrieve()` and read with `read_metadata()` in order to find the
#' `file_source` that hosts the `project`. The URLs are then built for that
#' file source. For `type = "metadata"` the URL of the collection's custom
#' metadata file is appended to the result.
#'
#' @return A `character()` with the URL(s) for the file(s) of interest, named
#' with the base name of each file.
#' @export
#'
#' @family internal functions for accessing the recount3 data
#' @examples
#'
#' ## Example for metadata files from a project from SRA
#' locate_url(
#'     "SRP009615",
#'     "data_sources/sra"
#' )
#'
#' ## Example for metadata files from a project that is part of a collection
#' locate_url(
#'     "ERP110066",
#'     "collections/geuvadis_smartseq",
#'     recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3"
#' )
#'
#' ## Example for a BigWig file
#' locate_url(
#'     "SRP009615",
#'     "data_sources/sra",
#'     "bw",
#'     "human",
#'     "SRR387777"
#' )
#'
#' ## Locate example gene count files
#' locate_url(
#'     "SRP009615",
#'     "data_sources/sra",
#'     "gene"
#' )
#' locate_url(
#'     "SRP009615",
#'     "data_sources/sra",
#'     "gene",
#'     annotation = "refseq"
#' )
#'
#' ## Example for a gene count file from a project that is part of a collection
#' locate_url(
#'     "ERP110066",
#'     "collections/geuvadis_smartseq",
#'     "gene",
#'     recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3"
#' )
#'
#' ## Locate example junction files
#' locate_url(
#'     "SRP009615",
#'     "data_sources/sra",
#'     "jxn"
#' )
#'
#' ## Example for metadata files from a project from SRA
#' locate_url(
#'     "ERP001942",
#'     "data_sources/sra"
#' )
locate_url <-
    function(project,
    project_home = project_homes(
        organism = organism,
        recount3_url = recount3_url
    ),
    type = c("metadata", "gene", "exon", "jxn", "bw"),
    organism = c("human", "mouse"),
    sample = NULL,
    annotation = annotation_options(organism),
    jxn_format = c("ALL", "UNIQUE"),
    recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3")) {
        ## Reduce every enumerated argument to a single valid choice. The
        ## choices for 'project_home' and 'annotation' come from evaluating
        ## their default expressions, so they depend on 'organism' and
        ## 'recount3_url'.
        project_home <- match.arg(project_home)
        type <- match.arg(type)
        organism <- match.arg(organism)
        annotation <- match.arg(annotation)
        jxn_format <- match.arg(jxn_format)

        ## Define the base directories
        base_dir <- switch(type,
            metadata = "metadata",
            gene = "gene_sums",
            exon = "exon_sums",
            jxn = "junctions",
            bw = "base_sums"
        )

        ## Define the annotation to work with
        ann_ext <-
            annotation_ext(organism = organism, annotation = annotation)

        ## Define the file extensions. Junction data is split across three
        ## files (MM, RR and ID), so 'file_ext' has length 3 for type "jxn".
        file_ext <- paste0(".", switch(type,
            metadata = "MD.gz",
            gene = paste0(ann_ext, ".gz"),
            exon = paste0(ann_ext, ".gz"),
            jxn = paste0(jxn_format, ".", c("MM.gz", "RR.gz", "ID.gz")),
            bw = "ALL.bw"
        ))

        ## Check that sample exists when type == 'bw'
        if (type == "bw" && is.null(sample)) {
            stop("You need to specify the 'sample' when type = 'bw'.",
                call. = FALSE
            )
        }

        ## Base URL: files are sharded by the last two characters of the
        ## project ID.
        base_url <- file.path(
            recount3_url,
            organism,
            project_home,
            base_dir,
            substr(project, nchar(project) - 1, nchar(project)),
            project
        )

        ## Metadata is spread over several files, each with its own tag; the
        ## first tag is the name of the project home itself.
        if (type == "metadata") {
            file_tag <- c(
                basename(project_home),
                "recount_project",
                "recount_qc",
                "recount_seq_qc",
                "recount_pred"
            )
        } else {
            file_tag <- base_dir
        }

        ## Define the base file path
        base_file <-
            paste0(basename(project_home), ".", file_tag, ".", project)

        ## Handle the BigWig file case: there is one file per sample, stored in
        ## an upper-cased two-character subdirectory taken from the sample ID.
        if (type == "bw") {
            ## For project homes matching "gtex" the two characters are the
            ## ones at positions n-3 and n-2 of the sample ID; otherwise they
            ## are the last two. 'project_home' has length 1 after match.arg(),
            ## so a plain if/else is enough here.
            if (grepl("gtex", project_home)) {
                start_offset <- 3
                stop_offset <- 2
            } else {
                start_offset <- 1
                stop_offset <- 0
            }
            sample_dir <- toupper(substr(
                sample,
                nchar(sample) - start_offset,
                nchar(sample) - stop_offset
            ))
            base_url <- file.path(base_url, sample_dir)
            base_file <- paste0(base_file, "_", sample)
        }

        ## Construct the final url(s)
        if (dirname(project_home) == "collections") {
            ## Collections do not host the data files themselves.
            ## Locate the file source from the collection metadata file.
            url_collection_meta <- file.path(
                recount3_url,
                organism,
                project_home,
                "metadata",
                paste0(basename(project_home), ".recount_project.gz")
            )
            names(url_collection_meta) <-
                basename(url_collection_meta)
            metadata <- read_metadata(file_retrieve(url = url_collection_meta))
            i <- which(metadata$recount_project.project == project)
            stopifnot("The 'project' is not part of this collection." = length(i) > 0)
            file_source <-
                metadata$recount_project.file_source[i[1]]

            ## Find the files from the file source. Every user-facing option
            ## is forwarded, including 'jxn_format', so that a request for the
            ## "UNIQUE" junction files is not silently turned into "ALL".
            url <- locate_url(
                project = project,
                project_home = file_source,
                type = type,
                organism = organism,
                sample = sample,
                annotation = annotation,
                jxn_format = jxn_format,
                recount3_url = recount3_url
            )

            ## Deal with metadata collection case
            if (type == "metadata") {
                ## Add the custom collection metadata
                url <- c(
                    url,
                    file.path(
                        recount3_url,
                        organism,
                        project_home,
                        "metadata",
                        paste0(basename(project_home), ".custom.gz")
                    )
                )
            }
        } else {
            url <- file.path(base_url, paste0(base_file, file_ext))
        }

        ## Name each URL with its file name
        names(url) <- basename(url)

        ## Done
        url
    }