Merge branch 'master' into dev

IQSS · Feb 2, 2020 · ffd786f · ffd786f
2 parents 9d22dad + 0b3d67c
commit ffd786f
Show file tree

Hide file tree

Showing 7 changed files with 106 additions and 35 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,21 +5,33 @@ Authors@R: c(
   person(
     "Will", "Beasley",
     role=c("aut", "cre"),
-    email="wibeasley@hotmail.com", 
+    email="wibeasley@hotmail.com",
     comment = c(ORCID = "0000-0002-5613-5006")
   ),
   person(
     "Thomas J.", "Leeper",
-    role = c("aut"), 
+    role = c("aut"),
     email = "thosjleeper@gmail.com",
     comment = c(ORCID = "0000-0003-4097-6326")
   ),
   person(
     "Philip", "Durbin",
-    role = c("aut"), 
+    role = c("aut"),
     email = "philipdurbin@gmail.com",
     comment = c(ORCID = "0000-0002-9528-9470")
   ),
+  person(
+    "Shiro", "Kuriwaki",
+    role = c("aut"), 
+    email = "shirokuriwaki@gmail.com",
+    comment = c(ORCID = "0000-0002-5687-2647")
+  ),
+  person(
+    "Sebastian", "Karcher",
+    role=c("aut"),
+    email="karcher@u.northwestern.edu",
+    comment = c(ORCID = "0000-0001-8249-7388")
+  ),
   person(
     "Jan", "Kanis", role = "ctb"
   ))
@@ -39,8 +51,8 @@ Suggests:
     testthat,
     UNF,
     yaml
-Description: Provides access to Dataverse version 4 APIs <https://dataverse.org/>, 
-    enabling data search, retrieval, and deposit. For Dataverse versions <= 4.0, 
+Description: Provides access to Dataverse version 4 APIs <https://dataverse.org/>,
+    enabling data search, retrieval, and deposit. For Dataverse versions <= 4.0,
     use the deprecated 'dvn' package <https://cran.r-project.org/package=dvn>.
 License: GPL-2
 URL: https://github.com/iqss/dataverse-client-r

diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,11 @@
 # CHANGES TO dataverse 0.2.2 (upcoming)
 
+* Make filter queries (fq) work in `dataverse_search` (#36 @adam3smith)
 * Update maintainer to Will Beasley (wibeasley@hotmail.com) (#38)
 * More robust file retrieval (#39 @kuriwaki)
 * Tests use https://demo.dataverse.org/dataverse/dataverse-client-r/. (#40)
+* Fix most `get_file()` errors by removing the query argument (#33 @kuriwaki)
+* Fix getting multiple files by id in `get_file()` (#47 @adam3smith)
 
 # CHANGES TO dataverse 0.2.1
 

diff --git a/R/dataverse_search.R b/R/dataverse_search.R
@@ -93,9 +93,10 @@ function(...,
     query[["show_relevance"]] <- show_relevance
     ## show_facets
     query[["show_facets"]] <- show_facets
-    ## fq
-    if (!is.null(start)) {
-        query[["fq"]] <- match.arg(fq)
+    ## fq 
+    # Pass the fq string unencoded (wrapped in I()) because the API does not handle encoded fq strings properly.
+    if (!is.null(fq)) {
+      query[["fq"]] <- I(fq)
     }
 
     # setup URL

diff --git a/R/get_file.R b/R/get_file.R
@@ -2,7 +2,7 @@
 #' @title Download File(s)
 #' @description Download Dataverse File(s)
 #' @details This function provides access to data files from a Dataverse entry.
-#' @param file An integer specifying a file identifier; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.
+#' @param file An integer specifying a file identifier; or a vector of integers specifying file identifiers; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.
 #' @template ds
 #' @param format A character string specifying a file format. For \code{get_file}: by default, this is \dQuote{original} (the original file format). If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a compressed directory containing a bundle of file formats is returned. For \code{get_file_metadata}, this is \dQuote{ddi}.
 #' @param vars A character vector specifying one or more variable names, used to extract a subset of the data.
@@ -30,6 +30,9 @@
 #' flist <- dataset_files(2692151)
 #' get_file(flist[[2]])
 #'
+#' # retrieve all files in a dataset in their original format (returns a list of raw vectors)
+#' file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K")[['files']]$id
+#' f3 <- get_file(file_ids, format = "original")
 #' # read file as data.frame
 #' if (require("rio")) {
 #'   tmp <- tempfile(fileext = ".dta")
@@ -54,6 +57,10 @@ get_file <-
            server = Sys.getenv("DATAVERSE_SERVER"),
            ...) {
     format <- match.arg(format)
+
+    # numeric file ID(s): use them directly
+    if (is.numeric(file))
+      fileid <- file
 
     # get file ID from 'dataset'
     if (!is.numeric(file)) {
@@ -67,35 +74,37 @@ get_file <-
     } else {
       fileid <- file
     }
+
 
-    # request multiple files -----
-    if (length(fileid) > 1) {
-        fileid <- paste0(fileid, collapse = ",")
-        u <- paste0(api_url(server), "access/datafiles/", file)
-        r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
-        httr::stop_for_status(r)
-        tempf <- tempfile(fileext = ".zip")
-        tempd <- tempfile()
-        dir.create(tempd)
-        on.exit(unlink(tempf), add = TRUE)
-        on.exit(unlink(tempd), add = TRUE)
-        writeBin(httr::content(r, as = "raw"), tempf)
-        to_extract <- utils::unzip(tempf, list = TRUE)
-        out <- lapply(to_extract$Name[to_extract$Name != "MANIFEST.TXT"], function(zipf) {
-            utils::unzip(zipfile = tempf, files = zipf, exdir = tempd)
-            readBin(file.path(tempd, zipf), "raw", n = 1e8)
-      })
-      return(out)
-    }
+    # # request multiple files -----
+    # if (length(fileid) > 1) {
+    #     fileid <- paste0(fileid, collapse = ",")
+    #     u <- paste0(api_url(server), "access/datafiles/", fileid)
+    #     r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
+    #     httr::stop_for_status(r)
+    #     tempf <- tempfile(fileext = ".zip")
+    #     tempd <- tempfile()
+    #     dir.create(tempd)
+    #     on.exit(unlink(tempf), add = TRUE)
+    #     on.exit(unlink(tempd), add = TRUE)
+    #     writeBin(httr::content(r, as = "raw"), tempf)
+    #     to_extract <- utils::unzip(tempf, list = TRUE)
+    #     out <- lapply(to_extract$Name[to_extract$Name != "MANIFEST.TXT"], function(zipf) {
+    #       utils::unzip(zipfile = tempf, files = zipf, exdir = tempd)
+    #       readBin(file.path(tempd, zipf), "raw", n = 1e8)
+    #     })
+    #     return(out)
+    # }
 
-    # request single file -----
-    if (length(fileid) == 1) {
+    # download the files sequentially, adding each raw vector to a list
+    out <- vector("list", length(fileid))
+    for (i in 1:length(fileid)) {
         if (format == "bundle") {
-            u <- paste0(api_url(server), "access/datafile/bundle/", fileid)
+            u <- paste0(api_url(server), "access/datafile/bundle/", fileid[i])
             r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
         }
         if (format != "bundle") {
-            u <- paste0(api_url(server), "access/datafile/", fileid)
+            u <- paste0(api_url(server), "access/datafile/", fileid[i])
             query <- list()
             if (!is.null(vars)) {
                 query$vars <- paste0(vars, collapse = ",")
@@ -106,15 +115,23 @@ get_file <-
 
             # request single file in non-bundle format ----
             # add query if ingesting a tab (detect from original file name)
-            if (length(query) == 1 & grepl("\\.tab$", file)) {
+            if (length(query) == 1 & grepl("\\.tab$", file[i])) {
                 r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...)
             } else {
                 # do not add query if not an ingestion file
                 r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
             }
         }
         httr::stop_for_status(r)
-        return(httr::content(r, as = "raw"))
+        out[[i]] <-  httr::content(r, as = "raw")
+    }
+    # return the raw vector if there's a single file
+    if (length(out) == 1) {
+      return (out[[1]])
+    }
+    else {
+      # return a list of raw vectors otherwise
+      return (out)
     }
   }
 

diff --git a/man/files.Rd b/man/files.Rd
diff --git a/tests/testthat/tests-get_file.R b/tests/testthat/tests-get_file.R
@@ -19,3 +19,22 @@ test_that("download file from file id", {
   expect_true(is.raw(actual))
   expect_true(1000 < object.size(actual)) # Should be 1+ KB
 })
+
+test_that("download multiple files with file id - no folder", {
+  file_ids <- get_dataset("doi:10.70122/FK2/LZAJEQ", server = "demo.dataverse.org")[['files']]$id
+  actual <- get_file(
+    file_ids,
+    format="original",
+    server = "demo.dataverse.org")
+  expect_true(length(actual) == 2) # two files in the dataset
+  expect_true(is.raw(actual[[2]]))
+  expect_true(object.size(actual[[2]]) > 300) # Should be >300 B
+})
+
+test_that("download multiple files with file id - with folders", {
+  file_ids <- get_dataset("doi:10.70122/FK2/V54HGA", server = "demo.dataverse.org")[['files']]$id
+  actual <- get_file(file_ids, format="original", server = "demo.dataverse.org")
+  expect_true(length(actual) == 2) # two files in the dataset
+  expect_true(is.raw(actual[[2]]))
+  expect_true(object.size(actual[[2]]) > 70) # Should be >70 B
+})
diff --git a/tests/testthat/tests-search.R b/tests/testthat/tests-search.R
@@ -11,3 +11,19 @@ test_that("named argument search", {
 test_that("simple search w/type argument", {
     expect_true(is.data.frame(dataverse_search(author = "Gary King", type = "dataset", key = "", server = "dataverse.harvard.edu")))
 })
+
+test_that("date range search using fq", {
+    expect_true(is.data.frame(dataverse_search("*", fq = "dateSort:[2018-01-01T00:00:00Z+TO+2019-01-01T00:00:00Z]", type = "dataset", key = "", server = "dataverse.harvard.edu")))
+})
+
+test_that("publication year using fq", {
+    expect_true(is.data.frame(dataverse_search("*", fq = "publicationDate:2018", type = "dataset", key = "", server = "dataverse.harvard.edu")))
+})
+
+test_that("filter dataverses by subject using fq", {
+    expect_true(is.data.frame(dataverse_search("*", fq = "subject_ss:Social+Sciences", type = "dataverse", key = "", server = "dataverse.harvard.edu")))
+})
+
+test_that("empty fq search", {
+    expect_length(dataverse_search("*", fq = "dateSort:[2019-02-01T00:00:00Z+TO+2019-01-01T00:00:00Z]", type = "dataset", key = "", server = "dataverse.harvard.edu"), 0)
+})