-
Notifications
You must be signed in to change notification settings - Fork 16
/
individual_variable_effect.R
209 lines (191 loc) · 7.46 KB
/
individual_variable_effect.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#' @title Individual Variable Effect
#'
#' @param x a model to be explained, or an explainer created with function \code{\link[DALEX]{explain}}.
#' @param data validation dataset. Used to determine univariate distributions, calculation of quantiles,
#' correlations and so on. It will be extracted from `x` if it’s an explainer.
#' @param predict_function predict function that operates on the model `x`. Since the model is a black box,
#' the `predict_function` is the only interface to access values from the model. It should be a function that
#' takes at least a model `x` and data and returns vector of predictions. If model response has more than
#' a single number (like multiclass models) then this function should return a marix/data.frame of the size
#' `m` x `d`, where `m` is the number of observations while `d` is the dimensionality of model response.
#' It will be extracted from `x` if it’s an explainer.
#' @param new_observation an observation/observations to be explained. Required for local/instance level
#' explainers. Columns in should correspond to columns in the data argument. Data set should not contain any additional columns.
#' @param ... other parameters.
#' @param label name of the model. By default it’s extracted from the class attribute of the model
#' @param method an estimation method of SHAP values. Currently the only availible is `KernelSHAP`.
#' @param nsamples number of samples or "auto". Note that number must be as integer. Use `as.integer()`.
#'
#'
#'
#' @return an object of class individual_variable_effect with shap values of each variable for each new observation.
#' Columns:
#' \itemize{
#' \item first d columns contains variable values.
#' \item _id_ - id of observation, number of row in `new_observation` data.
#' \item _ylevel_ - level of y
#' \item _yhat_ -predicted value for level of y
#' \item _yhat_mean_ - expected value of prediction, mean of all predictions
#' \item _vname_ - variable name
#' \item _attribution_ - attribution of variable
#' \item _sign_ a sign of attribution
#' \item _label_ a label
#' }
#'
#' In order to use shapper with other python virtual environment following R command are required to execute
#' reticulate::use_virtualenv("path_to_your_env")
#' or for conda
#' reticulate::use_conda("name_of_conda_env")
#' before attaching shapper.
#'
#' @importFrom reticulate r_to_py
#'
#' @examples
#' have_shap <- reticulate::py_module_available("shap")
#'
#' if(have_shap){
#' library("shapper")
#' library("DALEX")
#' library("randomForest")
#' Y_train <- HR$status
#' x_train <- HR[ , -6]
#' set.seed(123)
#' model_rf <- randomForest(x = x_train, y = Y_train, ntree= 50)
#' p_function <- function(model, data) predict(model, newdata = data, type = "prob")
#'
#' ive_rf <- individual_variable_effect(model_rf, data = x_train, predict_function = p_function,
#' new_observation = x_train[1:2,], nsamples = 50)
#' ive_rf
#' } else{
#' print('Python testing environment is required.')
#' }
#'
#'
#' @export
#' @aliases shap
#'
#' @rdname individual_variable_effect
individual_variable_effect <- function(x, ...) {
UseMethod("individual_variable_effect")
}
#' @export
#' @rdname individual_variable_effect
individual_variable_effect.explainer <- function(x,
new_observation,
method = "KernelSHAP",
nsamples = "auto",
...) {
# extracts model, data and predict function from the explainer
model <- x$model
data <- x$data
predict_function <- x$predict_function
label <- x$label
individual_variable_effect.default(
model,
data,
predict_function,
new_observation = new_observation,
label = label,
method = method,
nsamples = nsamples,
...
)
}
#' @importFrom utils tail
#' @export
#' @rdname individual_variable_effect
individual_variable_effect.default <-
function(x,
data,
predict_function = predict,
new_observation,
label = tail(class(x), 1),
method = "KernelSHAP",
nsamples = "auto",
...) {
# check if data correct
if(!all(colnames(data)==colnames(new_observation))){
stop("Columns in new obseravtion and data does not match")
}
# transform factors to numerics and keep factors' levels
data_classes <- sapply(data, class)
factors <- list()
data_numeric <- data
for (col in names(data_classes)) {
if (data_classes[col] == "factor") {
factors[[col]] <- levels(data[, col])
data_numeric[, col] <- as.numeric(data_numeric[, col]) - 1
}
}
# force nsamples to be an integer
if (is.numeric(nsamples))
nsamples <- as.integer(round(nsamples))
p_function <- function(new_data) {
new_data <- as.data.frame(new_data)
colnames(new_data) <- colnames(data)
for (col in names(factors)) {
new_data[, col] <- factor(new_data[, col],
levels = c(0:(length(factors[[col]]) - 1)),
labels = factors[[col]])
}
res <- as.data.frame(predict_function(x, new_data))
if (nrow(res) == 1) {
res[2, ] <- 0
res <- r_to_py(res)
res$drop(res$index[1], inplace = TRUE)
}
return(res)
}
explainer = shap_reference$KernelExplainer(p_function, data_numeric)
new_observation_releveled <- new_observation
new_observation_numeric <- new_observation
for (col in names(factors)) {
new_observation_releveled[, col] <-
factor(new_observation_releveled[, col], levels = factors[[col]])
new_observation_numeric[, col] <-
as.numeric(new_observation_releveled[, col]) - 1
}
shap_values = explainer$shap_values(new_observation_numeric, nsamples = nsamples)
expected_value = explainer$expected_value
predictions <- predict_function(x, new_observation_releveled)
variables <- colnames(data)
# create data to return
new_data <- new_observation
new_data$`_id_` <- c(1:nrow(new_data))
# add multiple predictions
new_data <-
new_data[rep(1:nrow(new_data), each = length(shap_values)),]
if (is.null(colnames(predictions))) {
new_data$`_ylevel_` <- ""
new_data <- unique(new_data)
} else {
new_data$`_ylevel_` <-
rep(colnames(predictions), times = nrow(new_observation))
}
new_data$`_yhat_` <- as.vector(t(predictions))
new_data$`_yhat_mean_` <-
rep(expected_value, times = nrow(new_observation))
# add multiple variables
new_data <- new_data[rep(1:nrow(new_data), each = ncol(data)),]
new_data$`_vname_` <- rep(variables, times = length(predictions))
attribution <- numeric()
for (i in 1:nrow(new_observation)) {
for (j in 1:length(shap_values)) {
shap_attributes <- shap_values[[j]]
if (is.matrix(shap_attributes)) {
attribution <- c(attribution, shap_attributes[i, ])
} else {
attribution <- c(attribution, shap_attributes[i])
}
}
}
new_data$`_attribution_` <- attribution
new_data$`_sign_` <- factor(sign(new_data$`_attribution_`))
new_data$`_sign_` <- ifelse(new_data$`_sign_` == 1, "+", "-")
new_data$`_label_` <- label
class(new_data) <- c("individual_variable_effect", "data.frame")
return(new_data)
}
#' @export
#' @rdname individual_variable_effect
shap <- individual_variable_effect