/
check.py
442 lines (389 loc) 路 18.5 KB
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
"""
Check Module
"""
import copy
import numpy as np
import pandas as pd
from shapash.utils.category_encoder_backend import no_dummies_category_encoder, supported_category_encoder,\
dummies_category_encoder
from shapash.utils.columntransformer_backend import no_dummies_sklearn, supported_sklearn
from shapash.utils.model import extract_features_model
from shapash.utils.model_synoptic import dict_model_feature
from shapash.utils.transform import preprocessing_tolist, check_transformers
from shapash.utils.columntransformer_backend import columntransformer, get_feature_names, get_list_features_names
def check_preprocessing(preprocessing=None):
    """
    Check that every transformation of the preprocessing is supported.

    Parameters
    ----------
    preprocessing: category_encoders, ColumnTransformer, list, dict, optional (default: None)
        The processing applied to the original data.

    Returns
    -------
    tuple or None
        The pair of values produced by ``check_transformers`` for the list
        built by ``preprocessing_tolist``, or None when no preprocessing
        is given.
    """
    # Nothing to validate without a preprocessing object.
    if preprocessing is None:
        return None

    transformers = preprocessing_tolist(preprocessing)
    ct_flag, ce_flag = check_transformers(transformers)
    return ct_flag, ce_flag
def check_model(model):
    """
    Infer from the model's attributes whether it is a regression or a
    classification model, and collect its class labels when available.

    Parameters
    ----------
    model: model object
        Model to inspect. It must expose a ``predict`` method.

    Returns
    -------
    tuple
        ``('classification', classes)`` where classes is the list of labels
        read from the model, or ``('regression', None)``.

    Raises
    ------
    ValueError
        If the model has no ``predict`` method, or if it has a
        ``predict_proba`` method but no class attribute.
    """
    if not hasattr(model, 'predict'):
        raise ValueError(
            "No method predict in the specified model. Please, check model parameter"
        )

    labels = None
    has_proba = hasattr(model, 'predict_proba')
    if has_proba or any(hasattr(model, attrib) for attrib in ['classes_', '_classes']):
        # 'classes_' is read last so that it wins when both attributes exist.
        for attrib in ('_classes', 'classes_'):
            if hasattr(model, attrib):
                labels = getattr(model, attrib)
        if isinstance(labels, np.ndarray):
            labels = labels.tolist()
        if has_proba:
            if labels == []:
                labels = [0, 1]  # catboost binary
            elif labels is None:
                raise ValueError(
                    "No attribute _classes, classification model not supported"
                )

    if labels not in (None, []):
        return 'classification', labels
    return 'regression', None
def check_label_dict(label_dict, case, classes=None):
    """
    Check that the keys of label_dict match the classes of the model.

    The check only runs for a classification case with a label_dict given;
    otherwise the function returns without doing anything.

    Parameters
    ----------
    label_dict: dict
        Dictionary mapping integer labels to domain names (classification - target values).
    case: string
        String that informs if the model used is for classification or regression problem.
    classes: list, None
        List of labels if the model used is for classification problem, None otherwise.

    Raises
    ------
    ValueError
        If the set of label_dict keys differs from the set of classes.
    """
    if label_dict is None or case != 'classification':
        return

    label_keys = list(label_dict.keys())
    # Order and duplicates are irrelevant: only the sets of labels must agree.
    if set(classes) != set(label_keys):
        raise ValueError(
            "label_dict and model classes don't match: \n" +
            f"label_dict keys: {label_keys}\n" +
            f"Classes model values {classes}"
        )
def check_mask_params(mask_params):
    """
    Check that mask_params respects the expected format.

    Parameters
    ----------
    mask_params: dict (optional)
        Dictionary allowing the user to define and apply a filter to summarize
        the local explainability. Only the keys "features_to_hide",
        "threshold", "positive" and "max_contrib" are accepted.

    Raises
    ------
    ValueError
        If mask_params is not a dict or contains a key outside the accepted ones.
    """
    if not isinstance(mask_params, dict):
        raise ValueError("\nmask_params must be a dict\n")

    allowed_keys = ("features_to_hide", "threshold", "positive", "max_contrib")
    unexpected_keys = [key for key in mask_params if key not in allowed_keys]
    if unexpected_keys:
        # The listing is generated from allowed_keys so the message cannot
        # drift away from the keys that are actually accepted.
        raise ValueError(
            "\nmask_params must only have the following key arguments:\n"
            + "".join(f"-{key}\n" for key in allowed_keys)
        )
def check_ypred(x=None, ypred=None):
    """
    Check that ypred has the right shape and expected values.

    Parameters
    ----------
    x: pandas.DataFrame
        Dataset used by the model to perform the prediction (preprocessed or not).
    ypred: pandas.DataFrame or pandas.Series (optional)
        User-specified prediction values.

    Returns
    -------
    pandas.DataFrame or None
        The predictions as a one column DataFrame (a Series is converted with
        ``to_frame``), or None when ypred is None. A column labelled with an
        int or a float is renamed "ypred".

    Raises
    ------
    ValueError
        If ypred is neither a DataFrame nor a Series, if its index differs
        from the index of x, if it does not have exactly one column, or if
        its values are not of an integer or float dtype.
    """
    if ypred is None:
        return None

    if not isinstance(ypred, (pd.DataFrame, pd.Series)):
        raise ValueError("y_pred must be a one column pd.Dataframe or pd.Series.")
    if not ypred.index.equals(x.index):
        raise ValueError("x and y_pred should have the same index.")

    if isinstance(ypred, pd.DataFrame):
        # A DataFrame without any column is rejected as well: there is no
        # prediction column to validate.
        if ypred.shape[1] != 1:
            raise ValueError("y_pred must be a one column pd.Dataframe or pd.Series.")
        # .iloc: the dtypes Series is indexed by column label, not by position.
        values_dtype = ypred.dtypes.iloc[0]
    else:
        values_dtype = ypred.dtype

    # The np.int / np.float aliases no longer exist in recent NumPy releases;
    # the pandas dtype predicates cover every integer and float width.
    is_int_or_float = (pd.api.types.is_integer_dtype(values_dtype)
                       or pd.api.types.is_float_dtype(values_dtype))
    if not is_int_or_float:
        raise ValueError("y_pred must contain int or float only")

    if isinstance(ypred, pd.Series):
        ypred = ypred.to_frame()

    if isinstance(ypred.columns[0], (int, float)):
        ypred.columns = ["ypred"]

    return ypred
def check_contribution_object(case, classes, contributions):
    """
    Check that the contributions object is compatible with the kind of model.

    For a regression case, contributions must be a numpy array or a pandas
    DataFrame. For a classification case, contributions must be a list with
    one element per class. Any other case is not checked.

    Parameters
    ----------
    case: string
        String that informs if the model used is for classification or regression problem.
    classes: list, None
        List of labels if the model used is for classification problem, None otherwise.
    contributions : pandas.DataFrame, np.ndarray or list
        Contributions object to validate.

    Raises
    ------
    ValueError
        If the type (or, for classification, the length) of contributions
        does not fit the case.
    """
    if case == "regression":
        if not isinstance(contributions, (np.ndarray, pd.DataFrame)):
            raise ValueError(
                "\nType of contributions parameter specified is not compatible with\n"
                "regression model.\n"
                "Please check model and contributions parameters.\n"
            )
        return

    if case != "classification":
        return

    if not isinstance(contributions, list):
        raise ValueError(
            "\nType of contributions parameter specified is not compatible with\n"
            "classification model.\n"
            "Please check model and contributions parameters.\n"
        )

    # One contributions object is expected per class of the target.
    if len(contributions) != len(classes):
        raise ValueError(
            "\nLength of list of contributions parameter is not equal\n"
            "to the number of classes in the target.\n"
            "Please check model and contributions parameters.\n"
        )
def check_consistency_model_features(features_dict, model, columns_dict, features_types,
                                     mask_params=None, preprocessing=None, postprocessing=None,
                                     list_preprocessing=None, features_groups=None):
    """
    Check that the feature names held by the different attributes are
    consistent with each other and with the features expected by the model.

    Parameters
    ----------
    features_dict: dict
        Dictionary mapping technical feature names to domain names. May be
        None, in which case the checks on it are skipped. The caller's
        dictionary is never modified.
    model: model object
        Model whose expected features are extracted with ``extract_features_model``.
    columns_dict: dict
        Dictionary mapping integer column number (in the same order of the trained dataset) to technical feature names.
    features_types: dict
        Dictionary mapping features with the right types needed.
    mask_params: dict (optional)
        Dictionary allowing the user to define and apply a filter to summarize
        the local explainability. Only its "features_to_hide" entry, when
        present and not None, is checked here.
    preprocessing: category_encoders, ColumnTransformer, list or dict (optional)
        The processing applied to the original data.
    postprocessing : dict
        Dictionary of postprocessing that need to be checked.
    list_preprocessing: list (optional)
        List containing all preprocessing.
    features_groups: dict (optional)
        Groups of features; its keys are removed from the working copy of
        features_dict before that dictionary is checked.

    Raises
    ------
    ValueError
        As soon as one of the consistency checks fails.
    """
    # Features dict can include additional entries for groups of features.
    # We don't want to check them here as they may not be in other dict,
    # so they are removed from a private copy.
    if features_dict is not None:
        features_dict = copy.deepcopy(features_dict)
        if features_groups is not None:
            for group_name in features_groups:
                features_dict.pop(group_name, None)
        if not all(feat in features_types for feat in features_dict):
            raise ValueError("All features of features_dict must be in features_types")

    typed_features = set(features_types)
    column_features = set(columns_dict.values())

    if typed_features != column_features:
        raise ValueError("features of features_types and columns_dict must be the same")

    if mask_params is not None:
        # "features_to_hide" is optional in mask_params.
        features_to_hide = mask_params.get('features_to_hide')
        if features_to_hide is not None:
            if not all(feature in typed_features for feature in features_to_hide):
                raise ValueError("All features of mask_params must be in model")

    uses_category_encoder = str(type(preprocessing)) in supported_category_encoder

    if preprocessing is not None and uses_category_encoder:
        if not all(feature in column_features for feature in set(preprocessing.cols)):
            raise ValueError("All features of preprocessing must be in columns_dict")

    # The extraction yields either a list (treated as the expected feature
    # names) or another value that is compared to feature counts below.
    model_features = extract_features_model(model, dict_model_feature[str(type(model))])
    if isinstance(model_features, list):
        feature_expected_model = model_features
        model_expected = len(set(model_features))
    else:
        feature_expected_model = None
        model_expected = model_features

    if preprocessing is None:
        if isinstance(feature_expected_model, list):
            if column_features != set(feature_expected_model):
                # Second chance: compare again with the column names cast to str.
                columns_dict_feature = [str(feature) for feature in columns_dict.values()]
                if set(columns_dict_feature) != set(feature_expected_model):
                    raise ValueError("Features of columns_dict and model must be the same.")
        else:
            if len(column_features) != model_expected:
                raise ValueError("Features of columns_dict and model must have the same length")

    if uses_category_encoder and isinstance(feature_expected_model, list):
        if set(preprocessing.feature_names) != set(feature_expected_model):
            raise ValueError(
                "\nOne of features returned by the Category_Encoders preprocessing doesn't\n"
                "match the model's expected features.\n"
            )
    elif preprocessing is not None:
        feature_encoded = get_list_features_names(list_preprocessing, columns_dict)
        if model_expected != len(feature_encoded):
            raise ValueError(
                "\nNumber of features returned by the preprocessing step doesn't\n"
                "match the model's expected features.\n"
            )

    if postprocessing:
        if not isinstance(postprocessing, dict):
            raise ValueError("Postprocessing parameter must be a dictionnary")
        for feature in postprocessing.keys():
            if feature not in features_types.keys():
                raise ValueError("Postprocessing and features_types must have the same features names.")
            if feature not in columns_dict.values():
                raise ValueError("Postprocessing and columns_dict must have the same features names.")
        check_postprocessing(features_types, postprocessing)
def check_preprocessing_options(columns_dict, features_dict, preprocessing=None, list_preprocessing=None):
    """
    Look for "drop" entries in the ColumnTransformer steps of the
    preprocessing and, when there are some, compute the feature dictionaries
    with the dropped features removed.

    Parameters
    ----------
    columns_dict: dict
        Dictionary mapping integer column number (in the same order of the trained dataset) to technical feature names.
    features_dict: dict
        Dictionary mapping technical feature names to domain names.
    preprocessing: category_encoders, ColumnTransformer, list or dict (optional)
        The processing applied to the original data.
    list_preprocessing: list (optional)
        List containing all preprocessing.

    Returns
    -------
    None, dict
        None if no feature is dropped, otherwise a dict with the keys
        "features_to_drop" (names of the dropped features),
        "features_dict_op" (features_dict without them) and
        "columns_dict_op" (remaining column names renumbered from 0).
    """
    dropped_columns = []
    if preprocessing is not None:
        for encoder in list_preprocessing:
            if str(type(encoder)) not in columntransformer:
                continue
            for transformer_spec in encoder.transformers_:
                if "drop" in transformer_spec:
                    # Third item of the entry: the columns it applies to.
                    dropped_columns.extend(transformer_spec[2])

    if not dropped_columns:
        return None

    # Translate the collected column keys into technical feature names.
    dropped_names = [columns_dict[column] for column in dropped_columns]
    kept_features_dict = {name: label for name, label in features_dict.items()
                          if name not in dropped_names}
    kept_columns_dict = dict(enumerate(
        name for name in columns_dict.values() if name not in dropped_names
    ))
    return {"features_to_drop": dropped_names,
            "features_dict_op": kept_features_dict,
            "columns_dict_op": kept_columns_dict}
def check_consistency_model_label(columns_dict, label_dict=None):
    """
    Check that every key of label_dict is also a key of columns_dict.

    Parameters
    ----------
    columns_dict: dict
        Dictionary mapping integer column number (in the same order of the trained dataset) to technical feature names.
    label_dict: dict (optional)
        Dictionary mapping integer labels to domain names (classification - target values).

    Raises
    ------
    ValueError
        If a key of label_dict is missing from columns_dict.
    """
    if label_dict is None:
        return

    if any(label not in columns_dict for label in label_dict):
        raise ValueError("All features of label_dict must be in model")
def check_postprocessing(x, postprocessing=None):
    """
    Check that the postprocessing parameter is well formed and matches the
    dataset x, or the dict of types of the expected dataset x.

    Parameters
    ----------
    x: pandas.DataFrame, dict
        Dataset x without preprocessing or dictionary mapping features with the right types needed.
    postprocessing : dict
        Dictionary of postprocessing that need to be checked. Each value must
        be a dict holding exactly the keys 'type' and 'rule', in any order.

    Raises
    ------
    ValueError
        If postprocessing is not a dict, if one of its entries is malformed,
        or if a 'case' or 'regex' modification targets a feature that does
        not hold strings.
    """
    if not postprocessing:
        return

    if not isinstance(postprocessing, dict):
        raise ValueError("Postprocessing parameter must be a dictionnary")

    def holds_strings(feature):
        # With a dict of types the expected type name is compared; with a
        # dataset the dtype of the column is inspected.
        if isinstance(x, dict):
            return x[feature] == "object"
        return pd.api.types.is_string_dtype(x[feature])

    for key, dict_post in postprocessing.items():
        if not isinstance(dict_post, dict):
            raise ValueError(f"{key} values must be a dict")

        # Compared as a set: the order in which the user wrote the two keys
        # must not matter.
        if set(dict_post) != {'type', 'rule'}:
            raise ValueError("Wrong postprocessing keys, you need 'type' and 'rule' keys")

        method = dict_post['type']
        rule = dict_post['rule']

        if method not in ['prefix', 'suffix', 'transcoding', 'regex', 'case']:
            raise ValueError("Wrong postprocessing method. \n"
                             "The available methods are: 'prefix', 'suffix', 'transcoding', 'regex', or 'case'")

        if method == 'case':
            if rule not in ['lower', 'upper']:
                raise ValueError("Case modification unknown. Available ones are 'lower', 'upper'.")
            if not holds_strings(key):
                raise ValueError(f"Expected string object to modify with upper/lower method in {key} dict")

        if method == 'regex':
            # A rule that is not a dict is reported like any other malformed rule.
            if not isinstance(rule, dict) or set(rule.keys()) != {'in', 'out'}:
                raise ValueError(f"Regex modifications for {key} are not possible, the keys in 'rule' dict"
                                 f" must be 'in' and 'out'.")
            if not holds_strings(key):
                raise ValueError(f"Expected string object to modify with regex methods in {key} dict")
def check_features_name(columns_dict, features_dict, features):
    """
    Convert a list of feature names (string) or features ids into features ids.
    Features names can be part of columns_dict or features_dict.

    Parameters
    ----------
    columns_dict: dict
        Dictionary mapping integer column number to technical feature names.
    features_dict: dict
        Dictionary mapping technical feature names to domain names. May be
        None or empty, in which case only technical names are recognised.
    features : List
        List of ints (columns ids) or of strings (domain names or technical names).

    Returns
    -------
    list of ints
        Columns ids. A list of ints is returned unchanged.

    Raises
    ------
    ValueError
        If features mixes ints and strings, or if the strings do not all
        belong to the domain names or all belong to the technical names.
    """
    if all(isinstance(f, int) for f in features):
        return features

    if not all(isinstance(f, str) for f in features):
        raise ValueError(
            "\nfeatures must be a list of ints (representing ids of columns)\n"
            "or a list of string from technical features names or from domain names.\n"
        )

    # Reverse lookups: technical name -> column id, domain name -> technical name.
    id_by_name = {name: col_id for col_id, name in columns_dict.items()}
    # Built only when features_dict is usable, so None is tolerated.
    name_by_label = {label: name for name, label in features_dict.items()} if features_dict else {}

    # Domain names are tried first, then technical names.
    if features_dict and all(f in name_by_label for f in features):
        return [id_by_name[name_by_label[f]] for f in features]
    if id_by_name and all(f in id_by_name for f in features):
        return [id_by_name[f] for f in features]

    raise ValueError(
        'All features must came from the same dict of features (technical names or domain names).'
    )