add fix for #145 (#147)

ModelOriented · Jan 13, 2023 · 6c16680 · 6c16680
1 parent 38a8826
commit 6c16680
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 4 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ingredients
 Title: Effects and Importances of Model Ingredients
-Version: 2.2.1
+Version: 2.3.0
 Authors@R: c(person("Przemyslaw", "Biecek", email = "przemyslaw.biecek@gmail.com",
                   role = c("aut", "cre"),
                   comment = c(ORCID = "0000-0001-8423-1823")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+ingredients 2.3.0
+--------------------------------------------------------------
+* breaking change: `calculate_variable_split()` now treats `integer` variables as `categorical`. This change is propagated to `ceteris_paribus()`, `partial_dependence()`, `accumulated_dependence()`, `conditional_dependence()`, `aggregate_profiles()`, `DALEX::predict_profile()`, `DALEX::model_profile()`
+* fix an error in `ceteris_paribus()` / `calculate_variable_split()` when `tidymodels` uses `integer` variables [#145](https://github.com/ModelOriented/ingredients/issues/145)
+
 ingredients 2.2.1
 --------------------------------------------------------------
 * added `facet_scales` parameter to `plot.aggregated_profiles_explainer` (`'free_x'` by default) [#138](https://github.com/ModelOriented/ingredients/issues/138) and `plot.ceteris_paribus_explainer` (`'free_x'` or `'free_y'` by default, depending on plot type) [#136](https://github.com/ModelOriented/ingredients/issues/136)

diff --git a/R/calculate_variable_profile.R b/R/calculate_variable_profile.R
@@ -81,8 +81,8 @@ calculate_variable_split <- function(data, variables = colnames(data), grid_poin
 calculate_variable_split.default <- function(data, variables = colnames(data), grid_points = 101, variable_splits_type = "quantiles", new_observation = NA) {
   variable_splits <- lapply(variables, function(var) {
     selected_column <- na.omit(data[,var])
-    # numeric?
-    if (is.numeric(selected_column)) {
+    # as per ?is.numeric :  `is.numeric(x)` equals `is.double(x) || is.integer(x)`
+    if (is.double(selected_column)) {
       probs <- seq(0, 1, length.out = grid_points)
       if (variable_splits_type == "quantiles") {
         # variable quantiles
@@ -93,7 +93,14 @@ calculate_variable_split.default <- function(data, variables = colnames(data), g
       # fixing https://github.com/ModelOriented/ingredients/issues/124
       if (!any(is.na(new_observation)))
         selected_splits <- sort(unique(c(selected_splits, na.omit(new_observation[,var]))))
-    } else {
+    } else { # categorical OR integer fix for https://github.com/ModelOriented/ingredients/issues/145
+
+      if (length(unique(selected_column)) > 201) warning(
+        paste0("Variable: < ", var, " > has more than 201 unique values and all of them will be used as variable splits in calculating variable profiles.",
+               " Use the `variable_splits` parameter to mannualy change this behaviour.",
+               " If you believe this warning to be a false positive, raise issue at <https://github.com/ModelOriented/ingredients/issues>.")
+      )
+
       # sort will change order of factors in a good way
       if (any(is.na(new_observation))) {
         selected_splits <- sort(unique(selected_column))