Merge pull request #330 from JanMarvin/gh_issue_327

wb_to_df gets startCol
JanMarvin · Sep 22, 2022 · 81107fe · 81107fe
2 parents 3c1637a + 7171afb
commit 81107fe
Show file tree

Hide file tree

Showing 7 changed files with 90 additions and 32 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 ## New features
 
+* New argument `startCol` in the functions that read into a data frame: `wb_to_df()`, `wb_read()` and `read_xlsx()`. [330](https://github.com/JanMarvin/openxlsx2/issues/330)
+
 * New function `wb_colour()` to ease working with colour vectors used in `openxlsx2` styles. [292](https://github.com/JanMarvin/openxlsx2/issues/292)
 
 * Deprecated `get_cell_style()` and `set_cell_style()` in favor of newly introduced wrapper functions `wb_get_cell_style()` and `wb_set_cell_style()`. [306](https://github.com/JanMarvin/openxlsx2/issues/306)
@@ -40,6 +42,14 @@
 
 * Various (mostly internal) changes to `conditional_formatting`. Created `style_mgr` integration for `dxf` (cf-styles) and cleaned up internal code. The syntax has changed slightly, see [conditional formatting vignette](https://janmarvin.github.io/openxlsx2/articles/conditional-formatting.html) for reference. Add `whitespace` argument to `read_xml()`. [268](https://github.com/JanMarvin/openxlsx2/issues/268)
 
+## Breaking changes
+
+* Order of arguments in reading functions `wb_to_df()`, `wb_read()` and `read_xlsx()` has changed.
+
+
+***************************************************************************
+
+
 # openxlsx2 0.2.1
 
 ## New features
@@ -142,6 +152,9 @@
   * `$append_sheet_rels()` for `self$worksheet_rels[[sheet]]`
   * `$get_worksheet()` to replace `$ws()`
 
+
+***************************************************************************
+
 # openxlsx2 0.2.0
 
 * Added a `NEWS.md` file to track changes to the package.

diff --git a/R/readWorkbook.R b/R/readWorkbook.R
@@ -3,8 +3,8 @@
 #' @description Read data from an Excel file or Workbook object into a data.frame
 #' @param xlsxFile An xlsx file, Workbook object or URL to xlsx file.
 #' @param sheet The name or index of the sheet to read data from.
-#' @param startRow first row to begin looking for data.  Empty rows at the top of a file are always skipped,
-#' regardless of the value of startRow.
+#' @param startRow first row to begin looking for data.
+#' @param startCol first column to begin looking for data.
 #' @param colNames If `TRUE`, the first row of data will be used as column names.
 #' @param skipEmptyRows If `TRUE`, empty rows are skipped else empty rows after the first row containing data
 #' will return a row of NAs.
@@ -69,13 +69,14 @@ read_xlsx <- function(
   xlsxFile,
   sheet,
   startRow        = 1,
-  colNames        = TRUE,
+  startCol        = NULL,
   rowNames        = FALSE,
-  detectDates     = TRUE,
+  colNames        = TRUE,
   skipEmptyRows   = FALSE,
   skipEmptyCols   = FALSE,
   rows            = NULL,
   cols            = NULL,
+  detectDates     = TRUE,
   namedRegion,
   na.strings      = "#N/A",
   na.numbers      = NA,
@@ -93,13 +94,14 @@ read_xlsx <- function(
     xlsxFile,
     sheet           = sheet,
     startRow        = startRow,
-    colNames        = colNames,
+    startCol        = startCol,
     rowNames        = rowNames,
-    detectDates     = detectDates,
+    colNames        = colNames,
     skipEmptyRows   = skipEmptyRows,
     skipEmptyCols   = skipEmptyCols,
     rows            = rows,
     cols            = cols,
+    detectDates     = detectDates,
     named_region    = namedRegion,
     na.strings      = na.strings,
     na.numbers      = na.numbers,
@@ -127,13 +129,14 @@ wb_read <- function(
   xlsxFile,
   sheet         = 1,
   startRow      = 1,
-  colNames      = TRUE,
+  startCol      = NULL,
   rowNames      = FALSE,
-  detectDates   = TRUE,
+  colNames      = TRUE,
   skipEmptyRows = FALSE,
   skipEmptyCols = FALSE,
   rows          = NULL,
   cols          = NULL,
+  detectDates   = TRUE,
   namedRegion,
   na.strings    = "NA",
   na.numbers    = NA
@@ -148,13 +151,14 @@ wb_read <- function(
     xlsxFile      = xlsxFile,
     sheet         = sheet,
     startRow      = startRow,
-    colNames      = colNames,
+    startCol      = startCol,
     rowNames      = rowNames,
-    detectDates   = detectDates,
+    colNames      = colNames,
     skipEmptyRows = skipEmptyRows,
     skipEmptyCols = skipEmptyCols,
     rows          = rows,
     cols          = cols,
+    detectDates   = detectDates,
     named_region  = namedRegion,
     na.strings    = na.strings,
     na.numbers    = na.numbers

diff --git a/R/wb_functions.R b/R/wb_functions.R
@@ -173,6 +173,7 @@ style_is_posix <- function(cellXfs, numfmt_date) {
 #' @param skipEmptyCols If TRUE, empty columns are skipped.
 #' @param skipEmptyRows If TRUE, empty rows are skipped.
 #' @param startRow first row to begin looking for data.
+#' @param startCol first column to begin looking for data.
 #' @param rows A numeric vector specifying which rows in the Excel file to read. If NULL, all rows are read.
 #' @param cols A numeric vector specifying which columns in the Excel file to read. If NULL, all columns are read.
 #' @param definedName (deprecated) Character string with a definedName. If no sheet is selected, the first appearance will be selected.
@@ -257,13 +258,14 @@ wb_to_df <- function(
   xlsxFile,
   sheet,
   startRow        = 1,
-  colNames        = TRUE,
+  startCol        = NULL,
   rowNames        = FALSE,
-  detectDates     = TRUE,
-  skipEmptyCols   = FALSE,
+  colNames        = TRUE,
   skipEmptyRows   = FALSE,
+  skipEmptyCols   = FALSE,
   rows            = NULL,
   cols            = NULL,
+  detectDates     = TRUE,
   na.strings      = "#N/A",
   na.numbers      = NA,
   fillMergedCells = FALSE,
@@ -376,6 +378,7 @@ wb_to_df <- function(
   keep_rows <- rownames(z)
 
   maxRow <- max(as.numeric(keep_rows))
+  maxCol <- max(col2int(keep_cols))
 
   if (startRow > 1) {
     keep_rows <- as.character(seq(startRow, maxRow))
@@ -407,6 +410,25 @@ wb_to_df <- function(
     }
   }
 
+  if (!is.null(startCol)) {
+    keep_cols <- int2col(seq(col2int(startCol), maxCol))
+
+    if (!all(keep_cols %in% colnames(z))) {
+      keep_col <- keep_cols[!keep_cols %in% colnames(z)]
+
+      z[keep_col] <- NA_character_
+      tt[keep_col] <- NA_character_
+
+      # return expected order of columns
+      z <- z[keep_cols]
+      tt <- tt[keep_cols]
+    }
+
+
+      z  <- z[, colnames(z) %in% keep_cols, drop = FALSE]
+      tt <- tt[, colnames(tt) %in% keep_cols, drop = FALSE]
+  }
+
   if (!is.null(cols)) {
     keep_cols <- int2col(cols)
 

diff --git a/man/read_xlsx.Rd b/man/read_xlsx.Rd
diff --git a/man/wb_read.Rd b/man/wb_read.Rd
diff --git a/man/wb_to_df.Rd b/man/wb_to_df.Rd
diff --git a/tests/testthat/test-read_from_created_wb.R b/tests/testthat/test-read_from_created_wb.R
@@ -111,3 +111,15 @@ test_that("dims != rows & cols", {
   expect_equal("4", rownames(got6))
 
 })
+
+test_that("read startCol", {
+
+  wb <- wb_workbook()$add_worksheet()$add_data(x = cars, startCol = "E")
+
+  got <- wb_to_df(wb, startCol = 1, colNames = FALSE)
+  expect_equal(LETTERS[1:6], names(got))
+
+  got <- wb_to_df(wb, startCol = "F", colNames = FALSE)
+  expect_equal(LETTERS[6], names(got))
+
+})