Skip to content

Commit

Permalink
Init #36, todo: support specific columns
Browse files Browse the repository at this point in the history
  • Loading branch information
Nelson-Gon committed Feb 27, 2021
1 parent 3e2633a commit 4e69193
Show file tree
Hide file tree
Showing 14 changed files with 212 additions and 62 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ output: html_document

# mde 0.3.1

* `drop_na_if` now supports dropping groups that meet a set missing data
threshold.

* `recode_as_na_str` updated to convert factors to character.

# mde 0.3.0
Expand Down
83 changes: 64 additions & 19 deletions R/drop_na_if.R
Original file line number Diff line number Diff line change
@@ -1,50 +1,95 @@
#' Condition based dropping of columns with missing values
#' @description "drop_na_if" provides a simple way to drop columns with missing values if
#' @importFrom dplyr droup_by, filter, ungroup, across, everything, mutate
#' @description "drop_na_if" provides a simple way to drop columns with missing
#' values if
#' they meet certain criteria/conditions.
#' @param df A data.frame object
#' @param percent_na The percentage to use when dropping columns with missing values
#' @param percent_na The percentage to use when dropping columns with missing
#' values
#' @param grouping_cols For dropping groups that meet a target criterion of
#' percent missingness.
#' @param target_columns If working on grouped data, drop all columns that meet
#' target or only a specific column.
#' @param ... Other arguments to "percent_missing"
#' @param keep_columns Columns that should be kept despite meeting the target percent_na criterion(criteria)
#' @param sign Character. One of gteq,lteq,lt,gt or eq which refer to greater than(gt) or equal(eq) or
#' @param keep_columns Columns that should be kept despite meeting the target
#' percent_na
#' criterion(criteria)
#' @param sign Character. One of gteq,lteq,lt,gt or eq which refer to greater
#' than(gt) or equal(eq) or
#' less than(lt) or equal to(eq) respectively.
#' @return A data.frame object with columns that meet the target criteria dropped
#' @return A data.frame object with columns that meet the target criteria
#' dropped.
#' @seealso \code{\link{percent_missing}}
#' @examples
#' head(drop_na_if(airquality, percent_na = 24))
#' #drop columns that have less tan or equal to 4%
#' head(drop_na_if(airquality,sign="lteq", percent_na = 4))
#' # Drop all except with greater than oe equal to 4% missing but keep Ozone
#' head(drop_na_if(airquality, sign="gteq",percent_na = 4, keep_columns = "Ozone"))
#' # Drop all except with greater than ie equal to 4% missing but keep Ozone
#' head(drop_na_if(airquality, sign="gteq",percent_na = 4,
#' keep_columns = "Ozone"))
#' # Drop groups that meet a given criterion
#' grouped_drop <- structure(list(ID = c("A", "A", "B", "A", "B"), Vals = c(4, NA,
#' NA, NA, NA), Values = c(5, 6, 7, 8, NA)), row.names = c(NA, -5L),
#' class = "data.frame")
#' drop_na_if(grouped_drop,percent_na = 67,grouping_cols = "ID")
#' @export

drop_na_if <- function(df, sign="gteq",percent_na= 50,keep_columns=NULL,...){
drop_na_if <- function(df, sign="gteq",percent_na= 50,
keep_columns=NULL,
grouping_cols=NULL,
target_columns=NULL,...){
UseMethod("drop_na_if")
}

#' @export

drop_na_if.data.frame <- function(df,sign="gteq",percent_na=50,keep_columns = NULL, ...){
drop_na_if.data.frame <- function(df, sign="gteq",percent_na= 50,
keep_columns=NULL,
grouping_cols=NULL,
target_columns=NULL,...){




# get percent missing
missing_percents <- percent_missing(df,...)
# Drop as required

to_drop <- switches(target_value=missing_percents,sign,percent_na)
# If we have grouping columns, use those instead
if(!is.null(grouping_cols)){
check_column_existence(df, grouping_cols,"to group by")

if(!is.null(keep_columns)){
# Drop everything for now.
#use_column <- if(is.null(target_column)) everything() else
#!!!dplyr::syms(target_column)

df %>%
dplyr::group_by(!!!dplyr::syms(grouping_cols)) %>%
dplyr::filter(across(everything(), ~!switches(mean(is.na(.)) * 100,
sign=sign,
percent_na=percent_na))) %>%
dplyr::ungroup()
}

check_column_existence(df,target_columns = keep_columns, unique_name = "to keep")

keep_columns <- which(names(df) %in% keep_columns)

else{

# get percent missing
missing_percents <- percent_missing(df,...)
#Drop as required
to_drop <- switches(target_value=missing_percents,sign,percent_na)
if(!is.null(keep_columns)){

check_column_existence(df,target_columns = keep_columns,
unique_name = "to keep")

keep_columns <- which(names(df) %in% keep_columns)

}
to_drop <- setdiff(to_drop, keep_columns)

if(length(to_drop) ==0) return(df)

df[-to_drop]
}

}


2 changes: 1 addition & 1 deletion R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ res<- switch(sign,
res
}


switches.double <- switches.numeric
unexpected_argument <- function(arg, acceptable_values){

if(!arg %in% acceptable_values){
Expand Down
24 changes: 15 additions & 9 deletions R/percent_missing.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
#' Columnwise missingness percentages
#' @param df A valid R `object` for which the percentage of missing values is required.
#' @param exclude_cols A character vector indicating columns to exclude when returning results.
#' @description A convenient way to obtain percent missingness columnwise.
#' Column-wise missingness percentages
#' @param df A valid R `object` for which the percentage of missing values is
#' required.
#' @param exclude_cols A character vector indicating columns to exclude when
#' returning results.
#' @description A convenient way to obtain percent missingness column-wise.
#' @inheritParams get_na_counts
#' @return An object of the same class as x showing the percentage of missing values.
#' @return An object of the same class as x showing the percentage of missing
#' values.
#' @examples
#' test <- data.frame(ID= c("A","B","A","B","A","B","A"), Vals = c(NA,25,34,NA,67,NA,45))
#' test <- data.frame(ID= c("A","B","A","B","A","B","A"),
#' Vals = c(NA,25,34,NA,67,NA,45))
#' percent_missing(test,grouping_cols = "ID")
#' percent_missing(airquality)
#' percent_missing(airquality,exclude_cols = c("Day","Temp"))
#' @export

percent_missing <- function(df, grouping_cols = NULL,exclude_cols = NULL){
percent_missing <- function(df, grouping_cols = NULL,exclude_cols = NULL){
UseMethod("percent_missing")
}

#' @export
percent_missing.data.frame <- function(df, grouping_cols = NULL,exclude_cols = NULL){
percent_missing.data.frame <- function(df, grouping_cols = NULL,
exclude_cols = NULL){


if(!is.null(grouping_cols)){
Expand All @@ -37,7 +42,8 @@ check_column_existence(df, exclude_cols, "to exclude")

}

df %>% dplyr::summarise(across(everything(),~ get_na_means(.))) %>% dplyr::ungroup()
df %>% dplyr::summarise(across(everything(),
~ get_na_means(.))) %>% dplyr::ungroup()

}

Expand Down
8 changes: 5 additions & 3 deletions R/recode_na_if.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# Grouped data recoding
# Currently less flexible and somewhat repititive
# Currently less flexible and somewhat repetitive
# Need to use functions we already have to do the replacements
#' Recode NA as another value with some conditions
#' @param df A data.frame object with missing values
#' @param grouping_cols Character columns to use for grouping the data
#' @param target_groups Character Recode NA as if and only if the grouping column is
#' @param target_groups Character Recode NA as if and only if the grouping
#' column is
#' in this vector of values
#' @importFrom dplyr .data
#' @param replacement Values to use to replace NAs for IDs that meet the requirements.
#' @param replacement Values to use to replace NAs for IDs that meet the
#' requirements.
#' Defaults to 0.
#' @examples
#' some_data <- data.frame(ID=c("A1","A2","A3", "A4"),
Expand Down
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
mde: Missing Data Explorer
================
2021-02-26
2021-02-27

<!-- badges: start -->

Expand Down Expand Up @@ -646,8 +646,24 @@ head(drop_na_if(airquality, percent_na = 24))
#> 6 NA 14.9 66 5 6
```

For more information, please see the documentation for `drop_na_if`
especially for grouping support.
To drop groups that meet a set missingness criterion, we proceed as
follows.

``` r
grouped_drop <- structure(list(ID = c("A", "A", "B", "A", "B"),
Vals = c(4, NA, NA, NA, NA), Values = c(5, 6, 7, 8, NA)),
row.names = c(NA, -5L), class = "data.frame")
# Drop all columns for groups that meet a percent missingness of reater than or
# equal to 67
drop_na_if(grouped_drop,percent_na = 67,sign="gteq",
grouping_cols = "ID")
#> # A tibble: 3 x 3
#> ID Vals Values
#> <chr> <dbl> <dbl>
#> 1 A 4 5
#> 2 A NA 6
#> 3 A NA 8
```

- `drop_row_if`

Expand Down
13 changes: 12 additions & 1 deletion README.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,18 @@ head(drop_na_if(airquality, percent_na = 24))
```

For more information, please see the documentation for `drop_na_if` especially for grouping support.
To drop groups that meet a set missingness criterion, we proceed as follows.

```{r}
grouped_drop <- structure(list(ID = c("A", "A", "B", "A", "B"),
Vals = c(4, NA, NA, NA, NA), Values = c(5, 6, 7, 8, NA)),
row.names = c(NA, -5L), class = "data.frame")
# Drop all columns for groups that meet a percent missingness of reater than or
# equal to 67
drop_na_if(grouped_drop,percent_na = 67,sign="gteq",
grouping_cols = "ID")
```



* `drop_row_if`
Expand Down
32 changes: 23 additions & 9 deletions docs/articles/mde_vignette.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 4e69193

Please sign in to comment.