From 6c16680f993e931d75c97f39944695f2b3b6f14d Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Fri, 13 Jan 2023 21:55:07 +0100 Subject: [PATCH] add fix for #145 (#147) --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/calculate_variable_profile.R | 13 ++++++++++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8397924..265332e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ingredients Title: Effects and Importances of Model Ingredients -Version: 2.2.1 +Version: 2.3.0 Authors@R: c(person("Przemyslaw", "Biecek", email = "przemyslaw.biecek@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-8423-1823")), diff --git a/NEWS.md b/NEWS.md index 5edbc42..0bb5345 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +ingredients 2.3.0 +-------------------------------------------------------------- +* breaking change: `calculate_variable_splits()` now treats `integer` variables as `categorical`. This change is propagated to `ceteris_paribus()`, `partial_dependence()`, `accumulated_dependence()`, `conditional_dependence()`, `aggregate_profiles()`, `DALEX::predict_profile()`, `DALEX::model_profile()` +* fix an error in `ceteris_paribus` / `calculate_variable_splits` when `tidymodels` uses `integer` variables [#145](https://github.com/ModelOriented/ingredients/issues/145) + ingredients 2.2.1 -------------------------------------------------------------- * added `facet_scales` parameter to `plot.aggregated_profiles_explainer` (`'free_x'` by default) [#138](https://github.com/ModelOriented/ingredients/issues/138) and `plot.ceteris_paribus_explainer` (`'free_x'` or `'free_y'` by default, depending on plot type) [#136](https://github.com/ModelOriented/ingredients/issues/136) diff --git a/R/calculate_variable_profile.R b/R/calculate_variable_profile.R index e60c97e..beee22d 100644 --- a/R/calculate_variable_profile.R +++ b/R/calculate_variable_profile.R @@ -81,8 +81,8 @@ calculate_variable_split <- function(data, variables = colnames(data), grid_poin calculate_variable_split.default <- function(data, variables = colnames(data), grid_points = 101, variable_splits_type = "quantiles", new_observation = NA) { variable_splits <- lapply(variables, function(var) { selected_column <- na.omit(data[,var]) - # numeric? - if (is.numeric(selected_column)) { + # as per ?is.numeric : `is.numeric(x)` equals `is.double(x) || is.integer(x)` + if (is.double(selected_column)) { probs <- seq(0, 1, length.out = grid_points) if (variable_splits_type == "quantiles") { # variable quantiles @@ -93,7 +93,14 @@ calculate_variable_split.default <- function(data, variables = colnames(data), g # fixing https://github.com/ModelOriented/ingredients/issues/124 if (!any(is.na(new_observation))) selected_splits <- sort(unique(c(selected_splits, na.omit(new_observation[,var])))) - } else { + } else { # categorical OR integer fix for https://github.com/ModelOriented/ingredients/issues/145 + + if (length(unique(selected_column)) > 201) warning( + paste0("Variable: < ", var, " > has more than 201 unique values and all of them will be used as variable splits in calculating variable profiles.", + " Use the `variable_splits` parameter to mannualy change this behaviour.", + " If you believe this warning to be a false positive, raise issue at .") + ) + # sort will change order of factors in a good way if (any(is.na(new_observation))) { selected_splits <- sort(unique(selected_column))