/
summarize_bins.R
86 lines (67 loc) · 2.33 KB
/
summarize_bins.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#' summarize bins
#'
#' Returns a summary of all bins created by `bin_cols` in a data frame. Takes no arguments other than the data frame
#' but relies on regular expressions based of the `bin_cols` output in order to identify the corresponding columns.
#'
#'
#' @param mdb dataframe output from bin_cols
#' @param ... optional tidyselect specification for specific cols
#'
#' @return a tibble
#' @export
#'
#' @examples
#'
#' iris %>%
#' bin_cols(Sepal.Width) %>%
#' bin_summary()
bin_summary <- function(mdb, ...){
column <- .rank <- .label <- NULL
mdb %>%
framecleaner::select_otherwise(...,
otherwise = tidyselect::everything(),
return_type = "names") %>%
enc2utf8() %>%
stringr::str_subset("_[a-z][a-z][0-9]*$") -> cols
if(rlang::is_empty(cols)){
rlang::abort("you only supplied columns that weren't created by bin_cols")
}
bucket_rgx <- stringr::str_c(cols, collapse = "|")
cols %>% rlang::syms() -> bucks
blist <- list()
for(buck in bucks){
mdb %>% dplyr::select(!!buck) %>% names() %>% enc2utf8() -> bnames
bnames %>% stringr::str_extract("[a-z][a-z][0-9]*$")-> suffix
suffix %>% stringr::str_remove("[0-9]*$") -> suffix_letter
suffix %>% stringr::str_remove("^[a-z][a-z]") -> suffix_number
bnames %>% stringr::str_remove("_[a-z][a-z][0-9]*$") %>% rlang::sym() -> org_col
switch(suffix_letter,
"wi" = "equal width",
"fr" = "equal freq",
"va" = "equal value",
"km" = "kmeans",
"xg" = "xgboost",
"ca" = "cart",
"wo" = "woe",
"lr" = "logistic regression",
"ci" = "caim",
"cc" = "cacc",
"am" = "ameva",
"ch" = "chi2",
"cm" = "chimerge",
"ec" = "extendedchi2",
"mh" = "modchi2",
"md" = "mdlp"
) -> method
mdb %>%
numeric_summary(original_col = !!org_col, bucket_col = !!buck) %>%
dplyr::mutate(column := rlang::as_name(org_col),
method = method,
n_bins = as.integer(suffix_number),
.before = 1) %>%
dplyr::rename_with(function(x)c(".rank"), c(4)) %>%
dplyr::mutate(.rank = as.integer(.rank)) -> mdb1
blist %>% rlist::list.append(mdb1) -> blist
}
blist %>% purrr::reduce(dplyr::bind_rows)
}