-
Notifications
You must be signed in to change notification settings - Fork 4
/
misc.R
567 lines (511 loc) · 20.9 KB
/
misc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
# Add package by alphabetical order
#' @importFrom data.table rbindlist as.data.table fread
#' @import dplyr
#' @import forcats
#' @import ggplot2
#' @importFrom grDevices dev.off pdf
#' @importFrom gridExtra grid.arrange arrangeGrob
#' @importFrom httr status_code GET
#' @importFrom inspectdf inspect_na
#' @importFrom jsonlite fromJSON
#' @import knitr
#' @importFrom lubridate parse_date_time
#' @import naniar
#' @import progress
#' @import purrr
#' @importFrom readr read_lines read_delim
#' @importFrom scales percent
#' @importFrom stats median reorder
#' @import stringr
#' @import tidyr
#' @importFrom utils URLencode read.csv read.delim write.table
#' @import viridis
#____________________________________________________________________________
#' @title Create folder
#'
#' @description Create a directory if it doesn't exist. If no argument is provided,
#' it returns the current working directory
#' @param folder_name (chr) folder name
#' @param verbose (logical) `TRUE` shows messages (default `FALSE`)
#' @examples {
#' create_folder(folder_name = NULL)
#' # Or use this one for a real folder:
#' # create_folder(folder_name = "testing")
#' }
#' @export
create_folder <- function(folder_name = NULL,
verbose = FALSE){
if(!is.null(folder_name)){
if(!dir.exists(file.path(folder_name))){
dir.create(file.path(folder_name), recursive = TRUE)
if(verbose) message("+ Folder `", folder_name,"`created")
return(folder_name)
}else{
return(folder_name)
}
}else{
folder_name <- getwd()
return(folder_name)
}
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' Download and Read File from Google Cloud Storage
#'
#' This function downloads a file from Google Cloud Storage (GCS) to a local
#' directory and reads it into R as a data frame. It uses the `gsutil`
#' command-line tool to handle the file download.
#'
#' @param path Character. The path to the file in GCS, e.g., `gs://bucket-name/file-name.csv`.
#' @param sep Character. The field separator character. Default is `\t`.
#' @param header Logical. Whether the file contains the names of the variables
#' as its first line. Default is TRUE.
#' @param tmpdir Character. The local directory to which the file will be
#' downloaded.
#' @param gsutil_path Character. The path to the `gsutil` command-line tool.
#' Default is "gsutil".
#' @param check_first Logical. Whether to check if the file already exists
#' locally before downloading. Default is TRUE.
#' @param verbose Logical. If TRUE, prints messages about the download process.
#' Default is FALSE.
#' @param ... Additional arguments passed to `readr::read_delim`.
#'
#' @details
#' This function first checks if the specified file exists in GCS. If the file
#' exists, it downloads the file to the specified local directory (`tmpdir`). If
#' the local directory does not exist, it will be created. The function handles
#' spaces in directory paths by quoting them appropriately. If the file is
#' successfully downloaded, it is read into R using `readr::read_delim`.
#'
#' If the `check_first` argument is set to TRUE, the function will first check
#' if the file already exists locally to avoid redundant downloads. If the file
#' is already present locally, it will not be downloaded again.
#'
#' @return A data frame containing the contents of the downloaded file.
#'
#' @examples
#' \dontrun{
#' df <- dl_read_gcp(
#' path = "gs://bucket-name/file-name.csv",
#' sep = ",",
#' header = TRUE,
#' tmpdir = "/local/path",
#' gsutil_path = "gsutil",
#' check_first = TRUE,
#' verbose = TRUE
#' )
#' }
#'
#' @export
dl_read_gcp <- function(path,
sep = "\t",
header = TRUE,
tmpdir,
gsutil_path = "gsutil",
check_first = TRUE,
verbose = FALSE,
...){
# Detect the operating system
os_name <- Sys.info()["sysname"]
# Default arguments for Mac
ignore_std_err <- TRUE
ignore_std_out <- TRUE
# Change default arguments if the OS is Windows
if (os_name == "Windows") {
ignore_std_err <- FALSE
ignore_std_out <- FALSE
}
# Validate gsutil path first
validate_cmd <- sprintf('%s version', gsutil_path)
if(verbose) message(paste0("- Validating `gsutil_path` on your system: ", gsutil_path))
gsutil_valid <- tryCatch({
system(validate_cmd, ignore.stdout = ignore_std_err, ignore.stderr = ignore_std_out) == 0
}, warning = function(w) {
FALSE
}, error = function(e) {
FALSE
})
if(!gsutil_valid){
stop("The gsutil path is incorrect or gsutil is not installed. Please ensure that gsutil is installed and the `gsutil_path` is correct.")
}
# Check if the file exists in GCP
check_cmd <- sprintf('%s ls %s', gsutil_path, path)
file_exists <- system(check_cmd,
ignore.stdout = ignore_std_out,
ignore.stderr = ignore_std_err) == 0
if(!file_exists){
stop(paste0("\nThe file `", path, "` does not exist in GCP"))
}
# Create directory
if(!dir.exists(tmpdir)){
dir.create(tmpdir)
if(verbose) message(paste0("- New folder `", tmpdir, "` created successfully"))
}else{
if(verbose) message(paste0("- Folder `", tmpdir, "` already exists"))
}
# create the normalized version of the destination path
tmpdir_norm <- normalizePath(tmpdir)
# if the normalized path name contains spaces,
# add shell quotes before it is saved to tmpdir,
# which ultimately goes to system()
if(grepl("\\s", tmpdir_norm)){
tmpdir <- shQuote(tmpdir_norm)
if(verbose) message("- The temp folder has spaces")
} else{
# Otherwise, tmpdir_norm and tmpdir can remain the same
tmpdir <- tmpdir_norm
}
# Check path
if(!grepl("gs:\\/\\/", path)){
stop("The path to the bucket is wrong. Valid example: gs://bucket-name/file-name.csv")
}else{
new_path <- file.path(tmpdir_norm, basename(path))
}
# only download if it doesn't exist to avoid conflicts when running this
# script in parallel; clear scratch space when you're done
if(check_first){
if( !file.exists(new_path) ){
# cp file from GCP
cmd <- sprintf('%s cp %s %s', gsutil_path, path, tmpdir)
if(verbose) message(paste0("- Running command ", cmd))
system(cmd,
ignore.stdout = ignore_std_out,
ignore.stderr = ignore_std_err)
if(verbose) message("- Downloaded file: ", new_path)
}else{
if(verbose) message(paste0("- The file `", new_path, "` already exists. LOADING EXISTING VERSION"))
}
}else{
if(verbose) message(paste0("- Downloading file (from GCP): `", basename(path), "`"))
cmd <- sprintf('%s cp %s %s', gsutil_path, path, tmpdir)
system(cmd,
ignore.stdout = ignore_std_out,
ignore.stderr = ignore_std_err)
if(verbose) message("- Downloaded file: ", new_path)
}
# read in the data using readr instead of data.table
if(file.exists(new_path)){
df <- readr::read_delim(new_path,
delim = sep,
col_names = header,
skip_empty_rows = TRUE,
show_col_types = FALSE, ...)
df <- as.data.frame(df)
return(df)
}else{
stop("Problems loading the file. Two possible reasons:
- Something might have gone wrong with the download.
- This is not a tab-delimited file (default): if you are trying to download a csv file instead, then use `sep = \",\"` instead.
Re-run the command again with `verbose = TRUE`)")
}
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Generate the phase detail for submissions
#'
#' @description The phase details is as simple as creating a lower case version
#' of the phase. However, in case of PASS1A/1C a new version has to be generated:
#' pass1ac-06
#' This function detects whether there are two phases, and if so,
#' generate the expected version: either pass1ac-06 or pass1ac-18
#' @param phase_metadata (char) expected output of `set_phase`
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (char) the expected phase_details function
#' @export
generate_phase_details <- function(phase_metadata,
verbose = TRUE){
if( grepl("\\|", phase_metadata) ){
pass1st <- gsub("(.*)(\\|.*)", "\\1", phase_metadata)
animalage <- gsub("(PASS1A\\-)(\\d+)", "\\2", pass1st)
phase_details <- paste0("pass1ac-", animalage)
}else{
phase_details <- tolower(phase_metadata)
}
return(phase_details)
}
#' @title Get full path to the batch folder
#'
#' @description Get the full path to the batch folder
#' @param input_results_folder (char) path to the PROCESSED/RESULTS folder to check
#' @return (char) Full path to the `BATCH#_YYYYMMDD` folder
#' @export
get_full_path2batch <- function(input_results_folder){
batch <- NULL
if( grepl("(BIC){0,1}RESULTS", input_results_folder) ){
batch <- gsub("(.*/)((BIC){0,1}RESULTS.*)", "\\1", input_results_folder)
}else if( grepl("PROCESSED", input_results_folder)){
batch <- gsub("(.*)(PROCESSED.*)", "\\1", input_results_folder)
}else{
stop(" - (-) ERROR: the input results folder missed the PROCESSED or RESULTS folder!")
}
return(batch)
}
#' @title filter required columns only
#'
#' @description it returns a data frame with only the required columns for metabolomics and proteomics
#' @param df (data.frame) metadata_metabolites
#' @param type (char) Type of file to filter columns:
#' - `m_m`: metadata metabolites
#' - `m_s`: metadata samples
#' - `v_m`: proteomics vial_metadata
#' - `olproteins`: olink metadata proteins
#' - `olsamples`: olink metadata samples
#' @param name_id (char) specify whether `named` or `unnamed` files
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (data.frame) filtered data frame with only the required columns
#' @examples {
#' df_filtered <- filter_required_columns(df = metadata_metabolites_named, name_id = "named")
#' }
#' @export
filter_required_columns <- function(df,
type = c("m_m",
"m_s",
"v_m",
"olproteins",
"olsamples"),
name_id = NULL,
verbose = TRUE){
type <- match.arg(type)
if (type == "m_m"){
# Define required columns
if(name_id == "named"){
emeta_metabo_coln_named <- c("metabolite_name", "refmet_name", "rt", "mz", "neutral_mass", "formula")
}else if(name_id == "unnamed"){
if("neutral_mass" %in% colnames(df)){
emeta_metabo_coln_named <- c("metabolite_name", "rt", "mz", "neutral_mass")
}else{
emeta_metabo_coln_named <- c("metabolite_name", "rt", "mz")
}
}else{
stop("{`name_id`} option not valid. Options: named/unnamed")
}
# Now check if present
colnames(df) <- tolower(colnames(df))
missing_cols <- setdiff(emeta_metabo_coln_named, colnames(df))
if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_metabolite`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
} else {
if(verbose) message(" + (+) All required columns present")
df <- subset(df, select = emeta_metabo_coln_named)
}
return(df)
} else if (type == "m_s"){
emeta_sample_coln <- c("sample_id", "sample_type", "sample_order", "raw_file", "extraction_date", "acquisition_date", "lc_column_id")
missing_cols <- setdiff(emeta_sample_coln, colnames(df))
if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_sample`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
} else {
if(verbose) message(" + (+) All required columns present")
df <- subset(df, select = emeta_sample_coln)
}
return(df)
} else if (type == "v_m"){
emeta_sample_coln <- c("vial_label", "tmt_plex")
if( all(emeta_sample_coln %in% colnames(df)) ){
# deal with tmt11 or tmt16
if("tmt11_channel" %in% colnames(df)){
emeta_sample_coln <- append(emeta_sample_coln, "tmt11_channel")
if(verbose) message(" + (+) All required columns present (tmt11 experiment)")
df <- subset(df, select = emeta_sample_coln)
}else if("tmt16_channel" %in% colnames(df)){
emeta_sample_coln <- append(emeta_sample_coln, "tmt16_channel")
if(verbose) message(" + (+) All required columns present (tmt16 experiment)")
df <- subset(df, select = emeta_sample_coln)
}else if("tmt18_channel" %in% colnames(df)){
emeta_sample_coln <- append(emeta_sample_coln, "tmt18_channel")
if(verbose) message(" + (+) All required columns present (tmt18 experiment)")
df <- subset(df, select = emeta_sample_coln)
}else{
if(verbose) message(" - (-) Expected COLUMN NAMES are missed: FAIL")
}
}else{
if(verbose) message(" - (-) Expected COLUMN NAMES are missed: FAIL")
}
return(df)
} else if (type == "olproteins"){
emeta_sample_coln <- c("olink_id", "uniprot_entry", "assay", "missing_freq", "panel_name", "panel_lot_nr", "normalization")
missing_cols <- setdiff(emeta_sample_coln, colnames(df))
if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_proteins`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
} else {
if(verbose) message(" + (+) All required columns present")
df <- subset(df, select = emeta_sample_coln)
}
return(df)
}else if (type == "olsamples"){
emeta_sample_coln <- c("sample_id", "sample_type", "sample_order", "plate_id")
missing_cols <- setdiff(emeta_sample_coln, colnames(df))
if (length(missing_cols) > 0) {
if(verbose) message(" - (-) `metadata_samples`: Expected COLUMN NAMES are missed: FAIL")
message(paste0("\t The following required columns are not present: `", paste(missing_cols, collapse = ", "), "`"))
} else {
if(verbose) message(" + (+) All required columns present")
df <- subset(df, select = emeta_sample_coln)
}
return(df)
}
}
#' @title open files
#'
#' @description open files and check that they are right
#' @param input_results_folder (char) input path folder
#' @param filepattern (char) regular expression to find a file in the file system
#' provided
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (list) list with data frame and flag about the status
#' @export
open_file <- function(input_results_folder,
filepattern,
verbose = TRUE){
if( !dir.exists(input_results_folder) ){
flag <- FALSE
ofile <- NULL
filename <- NULL
if(verbose) message(" - (-) The folder doesn't exist: FAIL")
list_back <- list("flag" = flag, "df" = ofile, "filename" = filename)
return(list_back)
}
# Get file matching pattern
file_metametabolites <- list.files(normalizePath(input_results_folder),
pattern=filepattern,
ignore.case = TRUE,
full.names=TRUE,
recursive = TRUE)
# Check if file is found and deal with many files
if(length(file_metametabolites) != 1){
if(length(file_metametabolites) >= 1){
if(verbose) message(" - (-) More than one file detected: FAIL")
if(verbose) message("\t\t - ", paste(file_metametabolites, collapse = "\n\t\t - "))
}else{
if(verbose) message(" - (-) File [`", filepattern, "`] not found: FAIL")
}
flag <- FALSE
ofile <- NULL
filename <- NULL
}else{
filename <- file_metametabolites[1]
file_ext <- sub(".*\\.(.*)$", "\\1", filename)
if (!file_ext %in% c("txt", "tsv")) {
if(verbose) message(" - (-) File extension must be .txt or .tsv (only tab delimited files accepted): FAIL")
}else{
ofile <- read.delim(filename, stringsAsFactors = FALSE, check.names = FALSE)
ofile <- remove_empty_columns(ofile, verbose = verbose)
ofile <- remove_empty_rows(ofile, verbose = verbose)
if(verbose) message(" + (+) File successfully opened")
flag <- TRUE
}
}
if(flag){
if(nrow(ofile) == 0){
if(verbose) message(" - (-) File is empty: FAIL")
flag <- FALSE
ofile <- NULL
}else{
flag <- TRUE
}
}
list_back <- list("flag" = flag, "df" = ofile, "filename" = filename)
return(list_back)
}
#' @title remove empty columns
#'
#' @description remove empty columns
#' @param df (char) data frame
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (df) df without empty columns
#' @export
remove_empty_columns <- function(df,
verbose = TRUE){
df[df == ""] <- NA
before <- dim(df)[2]
emptycols <- sapply(df, function (x) all(is.na(x)))
df <- df[!emptycols]
after <- dim(df)[2]
if(before != after){
n_removed <- before - after
if(verbose) message(" - (-) ", n_removed, " empty columns found and removed")
if(verbose) message("\t\t+ Before: ", before, " -> After: ", after)
}
return(df)
}
#' @title remove empty rows in data frame
#'
#' @description remove empty rows in data frame
#' @param df (char) data frame
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (df) df without empty columns
#' @export
remove_empty_rows <- function(df,
verbose = TRUE){
# Remove all rows with NAs or white spaces
# 1. Check empty spaces and make them NAs
before <- dim(df)[1]
df[df == ""] <- NA
# 2. Remove rows that are all NAs
df <- df[apply(df, 1, function(x) !all(is.na(x))),]
after <- dim(df)[1]
if(before != after){
n_removed <- before - after
if(verbose) message(" - (-) ", n_removed, " empty ROWS found and remove")
if(verbose) message("\t\t+ Before: ", before, " -> After: ", after)
}
return(df)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Set the phase to be validated.
#'
#' @description A group might choose to combine two different phases, due to
#' the complications associated with PASS1A/1C. If they choose to combine
#' two phases, the CAS must provide a new file `metadata_phase.txt` with a single
#' line, as for example: `PASS1A-06|PASS1C-06`. This function checks if the
#' file is available, and set that phase as the phases to validate. In summary,
#' the order of preference is:
#' 1. function's argument: dmaqc_phase2validate (if provided in the validation functions)
#' 2. `metadata_phase.txt` file if available in the batch folder.
#' 3. Phase in folder structure
#' @param input_results_folder (char) path to the PROCESSED/RESULTS folder to check
#' @param dmaqc_phase2validate (data.frame) dmaqc shipping information
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (int) the phase to be validated.
#' @export
set_phase <- function(input_results_folder,
dmaqc_phase2validate,
verbose = TRUE){
phase <- validate_phase(input_results_folder)
# Check metadata_phase.txt file
batch <- get_full_path2batch(input_results_folder)
file_phase <- list.files(normalizePath(batch),
pattern="metadata_phase.txt",
ignore.case = TRUE,
full.names=TRUE,
recursive = TRUE)
if(length(file_phase) > 1){
if(verbose) message("- (-) `More than one `metadata_phase.txt` file available. Only one is valid (place the valid one in the BATCH folder): FAIL")
}
# To be adjusted if two different batches are provided:
if ( !(purrr::is_empty(file_phase)) ){
phase_details <- readr::read_lines(file_phase[1], n_max = 1)
if ( !(is.na(phase_details) || phase_details == '') ){
if(verbose) message("+ Motrpac phase reported: ", phase_details, " (info from metadata_phase.txt available): OK")
if( grepl("\\|", phase_details) ){
validate_two_phases(phase_details = phase_details, verbose = FALSE)
}
# And once is checked, proceed...
if( isFALSE(dmaqc_phase2validate) ){
dmaqc_phase2validate <- phase_details
}
}else{
if(verbose) message("+ Motrpac phase: ", phase, " (metadata_phase.txt available but EMPTY): FAIL")
if( isFALSE(dmaqc_phase2validate) ){
dmaqc_phase2validate <- phase
}
}
}else{
if(verbose) message("+ Motrpac phase: ", phase, " (metadata_phase.txt file NOT available): FAIL")
if( isFALSE(dmaqc_phase2validate) ){
dmaqc_phase2validate <- phase
}
}
return(dmaqc_phase2validate)
}