Init #36, todo: support specific columns

Nelson-Gon · Feb 27, 2021 · 4e69193 · 4e69193
1 parent 3e2633a
commit 4e69193
Show file tree

Hide file tree

Showing 14 changed files with 212 additions and 62 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -7,6 +7,9 @@ output: html_document
 
 # mde 0.3.1
 
+* `drop_na_if` now supports dropping groups that meet a set missing data
+threshold. 
+
 * `recode_as_na_str` updated to convert factors to character.
 
 # mde 0.3.0

diff --git a/R/drop_na_if.R b/R/drop_na_if.R
@@ -1,50 +1,95 @@
 #' Condition based dropping of columns with missing values
-#' @description "drop_na_if" provides a simple way to drop columns with missing values if
+#' @importFrom dplyr droup_by, filter, ungroup, across, everything, mutate
+#' @description "drop_na_if" provides a simple way to drop columns with missing 
+#' values if
 #' they meet certain criteria/conditions.
 #' @param df A data.frame object
-#' @param percent_na The percentage to use when dropping columns with missing values
+#' @param percent_na The percentage to use when dropping columns with missing 
+#' values
+#' @param grouping_cols For dropping groups that meet a target criterion of 
+#' percent missingness. 
+#' @param target_columns If working on grouped data, drop all columns that meet 
+#' target or only a specific column. 
 #' @param ... Other arguments to "percent_missing"
-#' @param keep_columns Columns that should be kept despite meeting the target percent_na criterion(criteria)
-#' @param sign Character. One of gteq,lteq,lt,gt or eq which refer to greater than(gt) or equal(eq) or
+#' @param keep_columns Columns that should be kept despite meeting the target 
+#' percent_na 
+#' criterion(criteria)
+#' @param sign Character. One of gteq,lteq,lt,gt or eq which refer to greater 
+#' than(gt) or equal(eq) or
 #' less than(lt) or equal to(eq) respectively.
-#' @return A data.frame object with columns  that meet the target criteria dropped
+#' @return A data.frame object with columns  that meet the target criteria 
+#' dropped. 
 #' @seealso \code{\link{percent_missing}}
 #' @examples
 #' head(drop_na_if(airquality, percent_na = 24))
 #' #drop columns that have less tan or equal to 4%
 #' head(drop_na_if(airquality,sign="lteq", percent_na = 4))
-#' # Drop all except with greater than oe equal to 4% missing but keep Ozone
-#' head(drop_na_if(airquality, sign="gteq",percent_na = 4, keep_columns = "Ozone"))
+#' # Drop all except with greater than ie equal to 4% missing but keep Ozone
+#' head(drop_na_if(airquality, sign="gteq",percent_na = 4, 
+#' keep_columns = "Ozone"))
+#' # Drop groups that meet a given criterion
+#' grouped_drop <- structure(list(ID = c("A", "A", "B", "A", "B"), Vals = c(4, NA, 
+#' NA, NA, NA), Values = c(5, 6, 7, 8, NA)), row.names = c(NA, -5L),
+#'  class = "data.frame")
+#'  drop_na_if(grouped_drop,percent_na = 67,grouping_cols = "ID")
 #' @export
 
-drop_na_if <- function(df, sign="gteq",percent_na= 50,keep_columns=NULL,...){
+drop_na_if <- function(df, sign="gteq",percent_na= 50,
+                       keep_columns=NULL,
+                       grouping_cols=NULL, 
+                       target_columns=NULL,...){
   UseMethod("drop_na_if")
 }
 
 #' @export
 
-drop_na_if.data.frame <- function(df,sign="gteq",percent_na=50,keep_columns  = NULL, ...){
+drop_na_if.data.frame <- function(df, sign="gteq",percent_na= 50,
+                                  keep_columns=NULL,
+                                  grouping_cols=NULL, 
+                                  target_columns=NULL,...){
+
+
 
 
-# get percent missing
-missing_percents <- percent_missing(df,...)
-# Drop as required
 
-to_drop <- switches(target_value=missing_percents,sign,percent_na)
+# If we have grouping columns, use those instead
+if(!is.null(grouping_cols)){
+check_column_existence(df, grouping_cols,"to group by")
 
-if(!is.null(keep_columns)){
+# Drop everything for now.
+#use_column <- if(is.null(target_column)) everything() else 
+#!!!dplyr::syms(target_column)
+
+df %>% 
+    dplyr::group_by(!!!dplyr::syms(grouping_cols)) %>% 
+    dplyr::filter(across(everything(), ~!switches(mean(is.na(.)) * 100,
+                                    sign=sign,
+                                    percent_na=percent_na))) %>% 
+    dplyr::ungroup()
+}
 
-check_column_existence(df,target_columns = keep_columns, unique_name = "to keep")
 
-keep_columns <- which(names(df) %in% keep_columns)
 
+else{
+
+# get percent missing
+missing_percents <- percent_missing(df,...)
+#Drop as required
+to_drop <- switches(target_value=missing_percents,sign,percent_na)
+if(!is.null(keep_columns)){
+
+check_column_existence(df,target_columns = keep_columns, 
+                       unique_name = "to keep")
+
+keep_columns <- which(names(df) %in% keep_columns)
+
 }
 to_drop <- setdiff(to_drop, keep_columns)
-
+  
 if(length(to_drop) ==0) return(df)
-
+  
 df[-to_drop]
 }
-
+}
 
 
diff --git a/R/helpers.R b/R/helpers.R
@@ -182,7 +182,7 @@ res<- switch(sign,
 res
 }
 
-
+switches.double <- switches.numeric
 unexpected_argument <- function(arg, acceptable_values){
 
   if(!arg %in% acceptable_values){

diff --git a/R/percent_missing.R b/R/percent_missing.R
@@ -1,22 +1,27 @@
-#' Columnwise missingness percentages
-#' @param df A valid R `object` for which the percentage of missing values is required.
-#' @param exclude_cols A character vector indicating columns to exclude when returning results.
-#' @description A convenient way to obtain percent missingness columnwise.
+#' Column-wise missingness percentages
+#' @param df A valid R `object` for which the percentage of missing values is 
+#' required.
+#' @param exclude_cols A character vector indicating columns to exclude when 
+#' returning results.
+#' @description A convenient way to obtain percent missingness column-wise.
 #' @inheritParams get_na_counts
-#' @return An object of the same class as x showing the percentage of missing values.
+#' @return An object of the same class as x showing the percentage of missing 
+#' values.
 #' @examples
-#' test <- data.frame(ID= c("A","B","A","B","A","B","A"), Vals = c(NA,25,34,NA,67,NA,45))
+#' test <- data.frame(ID= c("A","B","A","B","A","B","A"), 
+#' Vals = c(NA,25,34,NA,67,NA,45))
 #' percent_missing(test,grouping_cols = "ID")
 #' percent_missing(airquality)
 #' percent_missing(airquality,exclude_cols = c("Day","Temp"))
 #' @export
 
-percent_missing <- function(df,  grouping_cols = NULL,exclude_cols = NULL){
+percent_missing <- function(df, grouping_cols = NULL,exclude_cols = NULL){
   UseMethod("percent_missing")
 }
 
 #' @export
-percent_missing.data.frame <- function(df,  grouping_cols = NULL,exclude_cols = NULL){
+percent_missing.data.frame <- function(df,  grouping_cols = NULL,
+                                       exclude_cols = NULL){
 
 
 if(!is.null(grouping_cols)){
@@ -37,7 +42,8 @@ check_column_existence(df, exclude_cols, "to exclude")
 
 }
 
-  df %>% dplyr::summarise(across(everything(),~ get_na_means(.))) %>% dplyr::ungroup()
+  df %>% dplyr::summarise(across(everything(),
+                                 ~ get_na_means(.))) %>% dplyr::ungroup()
 
 }
 

diff --git a/R/recode_na_if.R b/R/recode_na_if.R
@@ -1,13 +1,15 @@
 # Grouped data recoding
-# Currently less flexible and somewhat repititive
+# Currently less flexible and somewhat repetitive
 # Need to use functions we already have to do the replacements
 #' Recode NA as another value with some conditions
 #' @param df A data.frame object with missing values
 #' @param grouping_cols Character columns to use for grouping the data
-#' @param target_groups Character Recode NA as if and only if the grouping column is
+#' @param target_groups Character Recode NA as if and only if the grouping 
+#' column is
 #' in this vector of values
 #' @importFrom dplyr .data
-#' @param replacement Values to use to replace NAs for IDs that meet the requirements.
+#' @param replacement Values to use to replace NAs for IDs that meet the 
+#' requirements.
 #' Defaults to 0.
 #' @examples
 #' some_data <- data.frame(ID=c("A1","A2","A3", "A4"),

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 mde: Missing Data Explorer
 ================
-2021-02-26
+2021-02-27
 
 <!-- badges: start -->
 
@@ -646,8 +646,24 @@ head(drop_na_if(airquality, percent_na = 24))
 #> 6      NA 14.9   66     5   6
 ```
 
-For more information, please see the documentation for `drop_na_if`
-especially for grouping support.
+To drop groups that meet a set missingness criterion, we proceed as
+follows.
+
+``` r
+grouped_drop <- structure(list(ID = c("A", "A", "B", "A", "B"), 
+          Vals = c(4, NA,  NA, NA, NA), Values = c(5, 6, 7, 8, NA)), 
+          row.names = c(NA, -5L), class = "data.frame")
+# Drop all columns for groups that meet a percent missingness of reater than or
+# equal to 67
+drop_na_if(grouped_drop,percent_na = 67,sign="gteq",
+                                    grouping_cols = "ID")
+#> # A tibble: 3 x 3
+#>   ID     Vals Values
+#>   <chr> <dbl>  <dbl>
+#> 1 A         4      5
+#> 2 A        NA      6
+#> 3 A        NA      8
+```
 
   - `drop_row_if`
 

diff --git a/README.rmd b/README.rmd
@@ -480,7 +480,18 @@ head(drop_na_if(airquality, percent_na = 24))
 
 ```
 
-For more information, please see the documentation for `drop_na_if` especially for grouping support.
+To drop groups that meet a set missingness criterion, we proceed as follows.
+
+```{r}
+grouped_drop <- structure(list(ID = c("A", "A", "B", "A", "B"), 
+          Vals = c(4, NA,  NA, NA, NA), Values = c(5, 6, 7, 8, NA)), 
+          row.names = c(NA, -5L), class = "data.frame")
+# Drop all columns for groups that meet a percent missingness of reater than or
+# equal to 67
+drop_na_if(grouped_drop,percent_na = 67,sign="gteq",
+                                    grouping_cols = "ID")
+```
+
 
 
 * `drop_row_if`

diff --git a/docs/articles/mde_vignette.html b/docs/articles/mde_vignette.html