From acc14d9e15edb35c0b0d465b5a398a9be8b13a32 Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Fri, 7 Aug 2020 13:38:09 +0200 Subject: [PATCH 01/11] update readme --- README.md | 2 +- python/dalex/README.md | 2 +- python/dalex/setup.cfg | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 python/dalex/setup.cfg diff --git a/README.md b/README.md index 9d7595cc5..7560cc4e2 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ install.packages("DALEX") The **Python** version of dalex is available on [pip](https://pypi.org/project/dalex/) ```console -pip install dalex +pip install dalex -U ``` ## Learn more diff --git a/python/dalex/README.md b/python/dalex/README.md index ee9fd90f4..32a7b80a5 100644 --- a/python/dalex/README.md +++ b/python/dalex/README.md @@ -27,7 +27,7 @@ The `dalex` package is a part of [DrWhy.AI](http://DrWhy.AI) universe. ## Installation ```console -pip install dalex==0.1.9 +pip install dalex -U ``` ## Resources diff --git a/python/dalex/setup.cfg b/python/dalex/setup.cfg new file mode 100644 index 000000000..b88034e41 --- /dev/null +++ b/python/dalex/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md From 727e0629da32ed9cf82858de769faf725b19dd4a Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 15:23:45 +0200 Subject: [PATCH 02/11] add ResidualDiagnostics --- python/dalex/dalex/_explainer/object.py | 26 +++++- python/dalex/dalex/dataset_level/__init__.py | 4 +- .../_residual_diagnostics/__init__.py | 5 ++ .../_residual_diagnostics/checks.py | 17 ++++ .../_residual_diagnostics/object.py | 81 +++++++++++++++++++ 5 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 python/dalex/dalex/dataset_level/_residual_diagnostics/__init__.py create mode 100644 python/dalex/dalex/dataset_level/_residual_diagnostics/checks.py create mode 100644 python/dalex/dalex/dataset_level/_residual_diagnostics/object.py diff --git a/python/dalex/dalex/_explainer/object.py b/python/dalex/dalex/_explainer/object.py index 308f2bbd7..fa37c51bb 100644 --- a/python/dalex/dalex/_explainer/object.py +++ b/python/dalex/dalex/_explainer/object.py @@ -1,4 +1,5 @@ -from dalex.dataset_level import ModelPerformance, VariableImportance, AggregatedProfiles +from dalex.dataset_level import ModelPerformance, VariableImportance,\ + AggregatedProfiles, ResidualDiagnostics from dalex.instance_level import BreakDown, Shap, CeterisParibus from .checks import * from .helper import get_model_info @@ -544,6 +545,29 @@ def model_profile(self, return model_profile_ + def model_diagnostics(self, + variables=None): + """Calculate dataset level residuals diagnostics + + Parameters + ----------- + variables : str or array_like of str, optional + Variables for which the data will be calculated + (default is None, which means all of the variables). + + Returns + ----------- + ResidualDiagnostics class object + Explanation object containing the main result attribute and the plot method. + """ + + residual_diagnostics_ = ResidualDiagnostics( + variables=variables + ) + residual_diagnostics_.fit(self) + + return residual_diagnostics_ + def dumps(self, *args, **kwargs): """Return the pickled representation (bytes object) of the Explainer diff --git a/python/dalex/dalex/dataset_level/__init__.py b/python/dalex/dalex/dataset_level/__init__.py index 9e16f72e8..eda835bb5 100644 --- a/python/dalex/dalex/dataset_level/__init__.py +++ b/python/dalex/dalex/dataset_level/__init__.py @@ -1,9 +1,11 @@ from ._aggregated_profiles.object import AggregatedProfiles from ._model_performance.object import ModelPerformance from ._variable_importance.object import VariableImportance +from ._residual_diagnostics import ResidualDiagnostics __all__ = [ "ModelPerformance", "VariableImportance", - "AggregatedProfiles" + "AggregatedProfiles", + "ResidualDiagnostics" ] diff --git a/python/dalex/dalex/dataset_level/_residual_diagnostics/__init__.py b/python/dalex/dalex/dataset_level/_residual_diagnostics/__init__.py new file mode 100644 index 000000000..eaa3ec58a --- /dev/null +++ b/python/dalex/dalex/dataset_level/_residual_diagnostics/__init__.py @@ -0,0 +1,5 @@ +from .object import ResidualDiagnostics + +__all__ = [ + "ResidualDiagnostics" +] diff --git a/python/dalex/dalex/dataset_level/_residual_diagnostics/checks.py b/python/dalex/dalex/dataset_level/_residual_diagnostics/checks.py new file mode 100644 index 000000000..890b778a6 --- /dev/null +++ b/python/dalex/dalex/dataset_level/_residual_diagnostics/checks.py @@ -0,0 +1,17 @@ +import numpy as np +import pandas as pd + + +def check_variables(variables): + # treating variables as list simplifies code + if variables is not None and not isinstance(variables, (str, list, np.ndarray, pd.Series)): + raise TypeError("variables must be None or str or list or np.ndarray or pd.Series") + + if variables is None: + variables_ = None + elif isinstance(variables, str): + variables_ = [variables] + else: + variables_ = list(variables) + + return variables_ \ No newline at end of file diff --git a/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py new file mode 100644 index 000000000..ed32f47f2 --- /dev/null +++ b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py @@ -0,0 +1,81 @@ +import plotly.graph_objects as go + +from .checks import * + + +class ResidualDiagnostics: + """Calculate dataset level residuals diagnostics + + Parameters + ----------- + variables : str or array_like of str, optional + Variables for which the profiles will be calculated + (default is None, which means all of the variables). + + Attributes + ----------- + result : pd.DataFrame + Main result attribute of an explanation. + variables : array_like of str or None + Variables for which the profiles will be calculated + + Notes + -------- + https://pbiecek.github.io/ema/residualDiagnostic.html + """ + def __init__(self, + variables=None): + + variables_ = check_variables(variables) + + self.result = None + self.variables = variables_ + + def fit(self, explainer): + """Calculate the result of explanation + + Fit method makes calculations in place and changes the attributes. + + Parameters + ----------- + explainer : Explainer object + Model wrapper created using the Explainer class. + + Returns + ----------- + None + """ + result = explainer.data.copy() + + # if variables = NULL then all variables are added + # otherwise only selected + if self.variables is not None: + result = result.loc[:, np.intersect1d(self.variables, result.columns)] + + # is there target + if explainer.y is not None: + result = result.assign(y=explainer.y) + + # are there predictions + #:# add y_hat to the Explainer for the future + if explainer.y_hat is None: + explainer.y_hat = explainer.predict(explainer.data) + + result = result.assign(y_hat=explainer.y_hat) + + # are there residuals + #:# add residuals to the Explainer for the future + if explainer.residuals is None: + explainer.residuals = explainer.residual(explainer.data, explainer.y) + + result = result.assign( + y_hat=explainer.y_hat, + residuals=explainer.residuals, + abs_residuals=np.abs(explainer.residuals), + label=explainer.label, + ids=np.arange(result.shape[0])+1 + ) + self.result = result + + def plot(self, objects, variable="y_hat", yvariable="residuals", smooth=True): + pass \ No newline at end of file From fe555d952eed3fdccd4d0c757c9b6a6c7b5d5d1e Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 18:02:58 +0200 Subject: [PATCH 03/11] add predict_surrogate --- python/dalex/NEWS.md | 7 +- python/dalex/dalex/_explainer/checks.py | 28 ++++++++ python/dalex/dalex/_explainer/helper.py | 25 +++++++ python/dalex/dalex/_explainer/object.py | 68 ++++++++++++++++++- .../_residual_diagnostics/object.py | 2 +- python/dalex/setup.py | 3 +- 6 files changed, 129 insertions(+), 4 deletions(-) diff --git a/python/dalex/NEWS.md b/python/dalex/NEWS.md index 11dd953a6..692fd4e3e 100644 --- a/python/dalex/NEWS.md +++ b/python/dalex/NEWS.md @@ -1,6 +1,11 @@ dalex (development) ---------------------------------------------------------------- -* ... + +### features + +* added `model_diagnostics` method to the `Explainer`, which performs residual diagnostics +* added `predict_surrogate` method to the `Explainer`, which is a wrapper for the `lime` + tabular explanation from the [lime](https://github.com/marcotcr/lime) package dalex 0.2.0 ---------------------------------------------------------------- diff --git a/python/dalex/dalex/_explainer/checks.py b/python/dalex/dalex/_explainer/checks.py index c82285142..947fa9741 100644 --- a/python/dalex/dalex/_explainer/checks.py +++ b/python/dalex/dalex/_explainer/checks.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +from copy import deepcopy from .helper import verbose_cat, is_y_in_data from .yhat import * @@ -290,3 +291,30 @@ def check_loss_function(explainer, loss_function): def check_model_type(model_type, model_type_): return model_type_ if model_type is None else model_type + + +def check_new_observation_lime(new_observation): + # lime accepts only np.array as data_row + + new_observation_ = deepcopy(new_observation) + if isinstance(new_observation_, pd.Series): + new_observation_ = new_observation_.to_numpy() + elif isinstance(new_observation_, np.ndarray): + if new_observation_.ndim == 2: + if new_observation.shape[0] != 1: + raise ValueError("Wrong new_observation dimension") + # make 2D array 1D + new_observation_ = new_observation_.flatten() + elif new_observation_.ndim > 2: + raise ValueError("Wrong new_observation dimension") + elif isinstance(new_observation_, list): + new_observation_ = np.array(new_observation_) + elif isinstance(new_observation_, pd.DataFrame): + if new_observation.shape[0] != 1: + raise ValueError("Wrong new_observation dimension") + else: + new_observation_ = new_observation.to_numpy().flatten() + else: + raise TypeError("new_observation must be a list or numpy.ndarray or pandas.Series or pandas.DataFrame") + + return new_observation_ diff --git a/python/dalex/dalex/_explainer/helper.py b/python/dalex/dalex/_explainer/helper.py index 7c79c3e8a..260b50c58 100644 --- a/python/dalex/dalex/_explainer/helper.py +++ b/python/dalex/dalex/_explainer/helper.py @@ -15,3 +15,28 @@ def is_y_in_data(data, y): def get_model_info(model): model_package = re.search("(?<==1.0.1', 'numpy>=1.18.1', 'plotly>=4.9.0', - 'tqdm>=4.42.1' + 'tqdm>=4.42.1', + 'lime>=0.2.0.1' ], packages=setuptools.find_packages(include=["dalex", "dalex.*"]), python_requires='>=3.6', From 5944ef6db3409fe092f7f71af4a0ece6d916e016 Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 18:12:45 +0200 Subject: [PATCH 04/11] upgrade the predict_fn selection --- python/dalex/dalex/_explainer/helper.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/dalex/dalex/_explainer/helper.py b/python/dalex/dalex/_explainer/helper.py index 260b50c58..05ce53d88 100644 --- a/python/dalex/dalex/_explainer/helper.py +++ b/python/dalex/dalex/_explainer/helper.py @@ -36,7 +36,12 @@ def unpack_kwargs_lime(explainer, new_observation, **kwargs): if 'data_row' not in explanation_dict: explanation_dict['data_row'] = new_observation if 'predict_fn' not in explanation_dict: - explanation_dict['predict_fn'] =\ - explainer.model.predict if explainer.model_type == 'regression' else explainer.model.predict_proba + if hasattr(explainer.model, 'predict_proba'): + explanation_dict['predict_fn'] = explainer.model.predict_proba + elif hasattr(explainer.model, 'predict'): + explanation_dict['predict_fn'] = explainer.model.predict + else: + raise ValueError("Pass a `predict_fn` parameter to the `predict_surrogate` method. " + "See https://lime-ml.readthedocs.io/en/latest/lime.html#lime.lime_tabular.LimeTabularExplainer.explain_instance") return explainer_dict, explanation_dict From 06e07ba3270dc44ece4c1e6cecf80fab52c7172a Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 20:23:25 +0200 Subject: [PATCH 05/11] add plot to ResidualDiagnostics --- python/dalex/NEWS.md | 5 ++ .../_model_performance/object.py | 23 +++-- .../_residual_diagnostics/object.py | 88 +++++++++++++++++-- python/dalex/setup.py | 8 +- 4 files changed, 105 insertions(+), 19 deletions(-) diff --git a/python/dalex/NEWS.md b/python/dalex/NEWS.md index 692fd4e3e..7584116bc 100644 --- a/python/dalex/NEWS.md +++ b/python/dalex/NEWS.md @@ -1,8 +1,13 @@ dalex (development) ---------------------------------------------------------------- +### bug fixes + +* `ModelPerformance.plot` now uses a drwhy color palette + ### features +* added the `ResidualDiagnostics` object with a `plot` method * added `model_diagnostics` method to the `Explainer`, which performs residual diagnostics * added `predict_surrogate` method to the `Explainer`, which is a wrapper for the `lime` tabular explanation from the [lime](https://github.com/marcotcr/lime) package diff --git a/python/dalex/dalex/dataset_level/_model_performance/object.py b/python/dalex/dalex/dataset_level/_model_performance/object.py index 9a950727e..eda9b2eeb 100644 --- a/python/dalex/dalex/dataset_level/_model_performance/object.py +++ b/python/dalex/dalex/dataset_level/_model_performance/object.py @@ -2,6 +2,7 @@ from dalex.dataset_level._model_performance.plot import ecdf from .utils import * +from ..._explainer.theme import get_default_colors class ModelPerformance: @@ -121,7 +122,7 @@ def plot(self, Parameters ----------- objects : ModelPerformance object or array_like of ModelPerformance objects - Additional objects to plot in subplots (default is None). + Additional objects to plot (default is None). title : str, optional Title of the plot (default depends on the `type` attribute). show : bool, optional @@ -136,31 +137,29 @@ def plot(self, # are there any other objects to plot? if objects is None: - n = 1 - _residuals_df_list = [self.residuals.copy()] + _df_list = [self.residuals.copy()] elif isinstance(objects, self.__class__): # allow for objects to be a single element - n = 2 - _residuals_df_list = [self.residuals.copy(), objects.residuals.copy()] + _df_list = [self.residuals.copy(), objects.residuals.copy()] else: # objects as tuple or array - n = len(objects) + 1 - _residuals_df_list = [self.residuals.copy()] + _df_list = [self.residuals.copy()] for ob in objects: if not isinstance(ob, self.__class__): raise TypeError("Some explanations aren't of ModelPerformance class") - _residuals_df_list += [ob.residuals.copy()] + _df_list += [ob.residuals.copy()] + colors = get_default_colors(len(_df_list), 'line') fig = go.Figure() - for i in range(n): - _residuals_df = _residuals_df_list[i] - _abs_residuals = np.abs(_residuals_df['residuals']) + for i, _df in enumerate(_df_list): + _abs_residuals = np.abs(_df['residuals']) _unique_abs_residuals = np.unique(_abs_residuals) fig.add_scatter( x=_unique_abs_residuals, y=1 - ecdf(_abs_residuals)(_unique_abs_residuals), line_shape='hv', - name=_residuals_df.iloc[0, _residuals_df.columns.get_loc('label')] + name=_df.iloc[0, _df.columns.get_loc('label')], + marker=dict(color=colors[i]) ) fig.update_yaxes({'type': 'linear', 'gridwidth': 2, 'zeroline': False, 'automargin': True, 'ticks': 'outside', diff --git a/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py index 9aff7fee0..6f804017d 100644 --- a/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py +++ b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py @@ -1,6 +1,7 @@ -import plotly.graph_objects as go +import plotly.express as px from .checks import * +from ..._explainer.theme import get_default_colors class ResidualDiagnostics: @@ -68,14 +69,91 @@ def fit(self, explainer): if explainer.residuals is None: explainer.residuals = explainer.residual(explainer.data, explainer.y) - result = result.assign( + self.result = result.assign( y_hat=explainer.y_hat, residuals=explainer.residuals, abs_residuals=np.abs(explainer.residuals), label=explainer.label, ids=np.arange(result.shape[0])+1 ) - self.result = result - def plot(self, objects, variable="y_hat", yvariable="residuals", smooth=True): - pass + def plot(self, + objects, + variable="y_hat", + yvariable="residuals", + smooth=True, + line_width=2, + marker_size=3, + title="Residual Diagnostics", + show=True): + """Plot the Residual Diagnostics explanation + + Parameters + ---------- + objects : ResidualDiagnostics object or array_like of ResidualDiagnostics objects + Additional objects to plot (default is None). + variable : str, optional + Name of the variable from the `result` attribute to appear on the OX axis + (default is 'y_hat'). + yvariable : str, optional + Name of the variable from the `result` attribute to appear on the OY axis + (default is 'residuals'). + smooth : bool, optional + Add the smooth line (default is True). + line_width : float, optional + Width of lines in px (default is 2). + marker_size : float, optional + Size of points (default is 3). + title : str, optional + Title of the plot (default depends on the `type` attribute). + show : bool, optional + True shows the plot; False returns the plotly Figure object that can be + edited or saved using the `write_image()` method (default is True). + + Returns + ----------- + None or plotly.graph_objects.Figure + Return figure that can be edited or saved. See `show` parameter. + """ + + # are there any other objects to plot? + if objects is None: + _df_list = [self.result.copy()] + elif isinstance(objects, self.__class__): # allow for objects to be a single element + _df_list = [self.result.copy(), objects.result.copy()] + else: # objects as tuple or array + _df_list = [self.result.copy()] + for ob in objects: + if not isinstance(ob, self.__class__): + raise TypeError("Some explanations aren't of ResidualDiagnostics class") + _df_list += [ob.result.copy()] + + fig = px.scatter(pd.concat(_df_list), + x=variable, + y=yvariable, + color="label", + trendline="lowess" if smooth else None, + color_discrete_sequence=get_default_colors(len(_df_list), 'line')) \ + .update_traces(dict(marker_size=marker_size, line_width=line_width)) + + # wait for https://github.com/plotly/plotly.py/pull/2558 to add hline to the plot + + fig.update_yaxes({'type': 'linear', 'gridwidth': 2, 'zeroline': False, 'automargin': True, 'ticks': 'outside', + 'tickcolor': 'white', 'ticklen': 10, 'fixedrange': True, 'title_text': yvariable}) + + fig.update_xaxes({'type': 'linear', 'gridwidth': 2, 'zeroline': False, 'automargin': True, 'ticks': "outside", + 'tickcolor': 'white', 'ticklen': 10, 'fixedrange': True, 'title_text': variable}) + + fig.update_layout(title_text=title, title_x=0.15, font={'color': "#371ea3"}, template="none", + margin={'t': 78, 'b': 71, 'r': 30}) + + if show: + fig.show(config={'displaylogo': False, 'staticPlot': False, + 'toImageButtonOptions': {'height': None, 'width': None, }, + 'modeBarButtonsToRemove': ['sendDataToCloud', 'lasso2d', 'autoScale2d', 'select2d', + 'zoom2d', + 'pan2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d', + 'toggleSpikelines', 'hoverCompareCartesian', + 'hoverClosestCartesian']}) + else: + return fig diff --git a/python/dalex/setup.py b/python/dalex/setup.py index 9b5bfee0d..3987a8e65 100644 --- a/python/dalex/setup.py +++ b/python/dalex/setup.py @@ -11,6 +11,7 @@ version="0.2.0", author="Wojciech Kretowicz, Hubert Baniecki, Przemyslaw Biecek", author_email="wojtekkretowicz@gmail.com, hbaniecki@gmail.com", + version="0.2.0.9000", description="DALEX in Python", long_description=long_description, long_description_content_type="text/markdown", @@ -31,8 +32,11 @@ 'pandas>=1.0.1', 'numpy>=1.18.1', 'plotly>=4.9.0', - 'tqdm>=4.42.1', - 'lime>=0.2.0.1' + 'tqdm>=4.42.1' + ], + test_requirements=[ + 'lime>=0.2.0.1', # Explainer.predict_surrogate + 'statsmodels>=0.11.1' # LOWESS trendlines in ResidualDiagnostics.plot ], packages=setuptools.find_packages(include=["dalex", "dalex.*"]), python_requires='>=3.6', From dc41d63619a78191276a38b84d6e59412082b97a Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 20:57:26 +0200 Subject: [PATCH 06/11] add print method --- python/dalex/NEWS.md | 1 + .../dalex/dalex/dataset_level/_aggregated_profiles/object.py | 4 ++++ python/dalex/dalex/dataset_level/_model_performance/object.py | 4 ++++ .../dalex/dalex/dataset_level/_residual_diagnostics/object.py | 4 ++++ .../dalex/dalex/dataset_level/_variable_importance/object.py | 4 ++++ python/dalex/dalex/instance_level/_break_down/object.py | 4 ++++ python/dalex/dalex/instance_level/_ceteris_paribus/object.py | 4 ++++ python/dalex/dalex/instance_level/_shap/object.py | 4 ++++ python/dalex/setup.py | 1 - 9 files changed, 29 insertions(+), 1 deletion(-) diff --git a/python/dalex/NEWS.md b/python/dalex/NEWS.md index 7584116bc..20457697e 100644 --- a/python/dalex/NEWS.md +++ b/python/dalex/NEWS.md @@ -11,6 +11,7 @@ dalex (development) * added `model_diagnostics` method to the `Explainer`, which performs residual diagnostics * added `predict_surrogate` method to the `Explainer`, which is a wrapper for the `lime` tabular explanation from the [lime](https://github.com/marcotcr/lime) package +* added a `__str__` method to all of the explanation objects (it prints the `result` attribute) dalex 0.2.0 ---------------------------------------------------------------- diff --git a/python/dalex/dalex/dataset_level/_aggregated_profiles/object.py b/python/dalex/dalex/dataset_level/_aggregated_profiles/object.py index e28c47fdd..f7f7a5af1 100644 --- a/python/dalex/dalex/dataset_level/_aggregated_profiles/object.py +++ b/python/dalex/dalex/dataset_level/_aggregated_profiles/object.py @@ -89,6 +89,10 @@ def __init__(self, self.raw_profiles = None self.random_state = random_state + def __str__(self): + from IPython.display import display + display(self.result) + def fit(self, ceteris_paribus, verbose=True): diff --git a/python/dalex/dalex/dataset_level/_model_performance/object.py b/python/dalex/dalex/dataset_level/_model_performance/object.py index eda9b2eeb..118369952 100644 --- a/python/dalex/dalex/dataset_level/_model_performance/object.py +++ b/python/dalex/dalex/dataset_level/_model_performance/object.py @@ -40,6 +40,10 @@ def __init__(self, self.result = None self.residuals = None + def __str__(self): + from IPython.display import display + display(self.result) + def fit(self, explainer): """Calculate the result of explanation diff --git a/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py index 6f804017d..934998a03 100644 --- a/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py +++ b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py @@ -32,6 +32,10 @@ def __init__(self, self.result = None self.variables = variables_ + def __str__(self): + from IPython.display import display + display(self.result) + def fit(self, explainer): """Calculate the result of explanation diff --git a/python/dalex/dalex/dataset_level/_variable_importance/object.py b/python/dalex/dalex/dataset_level/_variable_importance/object.py index ad60bea0b..0429125f7 100644 --- a/python/dalex/dalex/dataset_level/_variable_importance/object.py +++ b/python/dalex/dalex/dataset_level/_variable_importance/object.py @@ -98,6 +98,10 @@ def __init__(self, self.permutation = None self.processes = processes_ + def __str__(self): + from IPython.display import display + display(self.result) + def fit(self, explainer): """Calculate the result of explanation diff --git a/python/dalex/dalex/instance_level/_break_down/object.py b/python/dalex/dalex/instance_level/_break_down/object.py index a3e66c495..1aede867b 100644 --- a/python/dalex/dalex/instance_level/_break_down/object.py +++ b/python/dalex/dalex/instance_level/_break_down/object.py @@ -60,6 +60,10 @@ def __init__(self, self.result = None self.yhats_distributions = None + def __str__(self): + from IPython.display import display + display(self.result) + def fit(self, explainer, new_observation): diff --git a/python/dalex/dalex/instance_level/_ceteris_paribus/object.py b/python/dalex/dalex/instance_level/_ceteris_paribus/object.py index 34a8d0bb8..4c9a40941 100644 --- a/python/dalex/dalex/instance_level/_ceteris_paribus/object.py +++ b/python/dalex/dalex/instance_level/_ceteris_paribus/object.py @@ -78,6 +78,10 @@ def __init__(self, self.new_observation = None self.processes = processes_ + def __str__(self): + from IPython.display import display + display(self.result) + def fit(self, explainer, new_observation, diff --git a/python/dalex/dalex/instance_level/_shap/object.py b/python/dalex/dalex/instance_level/_shap/object.py index b27bc596e..f01caa2d7 100644 --- a/python/dalex/dalex/instance_level/_shap/object.py +++ b/python/dalex/dalex/instance_level/_shap/object.py @@ -71,6 +71,10 @@ def __init__(self, self.processes = processes_ self.random_state = random_state_ + def __str__(self): + from IPython.display import display + display(self.result) + def fit(self, explainer, new_observation): diff --git a/python/dalex/setup.py b/python/dalex/setup.py index 3987a8e65..9d4821091 100644 --- a/python/dalex/setup.py +++ b/python/dalex/setup.py @@ -8,7 +8,6 @@ setuptools.setup( name="dalex", - version="0.2.0", author="Wojciech Kretowicz, Hubert Baniecki, Przemyslaw Biecek", author_email="wojtekkretowicz@gmail.com, hbaniecki@gmail.com", version="0.2.0.9000", From e3ab9237b667c05b439d7ba1ba6178685916eeb0 Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 21:16:15 +0200 Subject: [PATCH 07/11] add tests to model_diagnostics --- python/dalex/test/test_model_diagnostics.py | 81 +++++++++++++++++++++ tox.ini | 2 + 2 files changed, 83 insertions(+) create mode 100644 python/dalex/test/test_model_diagnostics.py diff --git a/python/dalex/test/test_model_diagnostics.py b/python/dalex/test/test_model_diagnostics.py new file mode 100644 index 000000000..f0efbedc5 --- /dev/null +++ b/python/dalex/test/test_model_diagnostics.py @@ -0,0 +1,81 @@ +import unittest + +import numpy as np +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.neural_network import MLPClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder + +import dalex as dx +from plotly.graph_objs import Figure + + +class ModelDiagnosticsTestTitanic(unittest.TestCase): + def setUp(self): + data = dx.datasets.load_titanic() + data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived) + + self.X = data.drop(columns='survived') + self.y = data.survived + + numeric_features = ['age', 'fare', 'sibsp', 'parch'] + numeric_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler())]) + + categorical_features = ['gender', 'class', 'embarked'] + categorical_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), + ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + + preprocessor = ColumnTransformer( + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features)]) + + clf = Pipeline(steps=[('preprocessor', preprocessor), + ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50), + max_iter=400, random_state=0))]) + + clf.fit(self.X, self.y) + + self.exp = dx.Explainer(clf, self.X, self.y, verbose=False) + self.exp2 = dx.Explainer(clf, self.X, self.y, label="model2", verbose=False) + + def test_constructor(self): + case1 = self.exp.model_diagnostics() + self.assertIsInstance(case1, (dx.dataset_level.ResidualDiagnostics,)) + self.assertIsInstance(case1.result, (pd.DataFrame,)) + self.assertEqual(case1.result.shape[0], self.exp.data.shape[0]) + self.assertTrue(np.isin(['y', 'y_hat', 'residuals', 'abs_residuals', 'label', 'ids'], + case1.result.columns).all()) + + case2 = self.exp.model_diagnostics(variables=['age', 'class']) + self.assertIsInstance(case2, (dx.dataset_level.ResidualDiagnostics,)) + self.assertIsInstance(case2.result, (pd.DataFrame,)) + self.assertEqual(case2.result.shape[0], self.exp.data.shape[0]) + self.assertTrue(np.isin(['y', 'y_hat', 'residuals', 'abs_residuals', 'label', 'ids', 'age', 'gender'], + case2.result.columns).all()) + self.assertFalse(np.isin(['fare', 'sibsp', 'gender', 'embarked'], case2.result.columns).any()) + + def test_plot(self): + + case1 = self.exp.model_diagnostics() + case2 = self.exp.model_diagnostics(variables=['fare', 'embarked']) + + self.assertIsInstance(case1, dx.dataset_level.ResidualDiagnostics) + self.assertIsInstance(case2, dx.dataset_level.ResidualDiagnostics) + + fig1 = case1.plot(title="test1", show=False) + fig2 = case2.plot(case1, variable="abs_residuals", yvariable="y", show=False) + fig3 = case2.plot(smooth=False, line_width=6, marker_size=1, variable="fare") + + self.assertIsInstance(fig1, Figure) + self.assertIsInstance(fig2, Figure) + self.assertIsInstance(fig3, Figure) + + +if __name__ == '__main__': + unittest.main() diff --git a/tox.ini b/tox.ini index ac05912be..3d02bc9ba 100644 --- a/tox.ini +++ b/tox.ini @@ -23,3 +23,5 @@ commands = discover deps = discover scikit-learn + lime + statsmodels From 293d017f498100e3570cec8864dd681f8001242f Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 21:39:57 +0200 Subject: [PATCH 08/11] optimize tests --- .../_residual_diagnostics/object.py | 13 ++--- python/dalex/test/test_aggregated_profiles.py | 2 +- python/dalex/test/test_ceteris_paribus.py | 1 - python/dalex/test/test_model_performance.py | 22 ++++----- python/dalex/test/test_predict_surrogate | 49 +++++++++++++++++++ python/dalex/test/test_variable_importance.py | 16 +++--- 6 files changed, 72 insertions(+), 31 deletions(-) create mode 100644 python/dalex/test/test_predict_surrogate diff --git a/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py index 934998a03..834e52830 100644 --- a/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py +++ b/python/dalex/dalex/dataset_level/_residual_diagnostics/object.py @@ -56,20 +56,13 @@ def fit(self, explainer): # otherwise only selected if self.variables is not None: result = result.loc[:, np.intersect1d(self.variables, result.columns)] - # is there target if explainer.y is not None: result = result.assign(y=explainer.y) - - # are there predictions - #:# add y_hat to the Explainer for the future + # are there predictions - add y_hat to the Explainer for the future if explainer.y_hat is None: explainer.y_hat = explainer.predict(explainer.data) - - result = result.assign(y_hat=explainer.y_hat) - - # are there residuals - #:# add residuals to the Explainer for the future + # are there residuals - add residuals to the Explainer for the future if explainer.residuals is None: explainer.residuals = explainer.residual(explainer.data, explainer.y) @@ -82,7 +75,7 @@ def fit(self, explainer): ) def plot(self, - objects, + objects=None, variable="y_hat", yvariable="residuals", smooth=True, diff --git a/python/dalex/test/test_aggregated_profiles.py b/python/dalex/test/test_aggregated_profiles.py index b4d94c9ff..8872000cf 100644 --- a/python/dalex/test/test_aggregated_profiles.py +++ b/python/dalex/test/test_aggregated_profiles.py @@ -14,7 +14,7 @@ import dalex as dx -class APTestTitanic(unittest.TestCase): +class AggregatedProfilesTestTitanic(unittest.TestCase): def setUp(self): data = dx.datasets.load_titanic() data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived) diff --git a/python/dalex/test/test_ceteris_paribus.py b/python/dalex/test/test_ceteris_paribus.py index 3830df8e2..83235e797 100644 --- a/python/dalex/test/test_ceteris_paribus.py +++ b/python/dalex/test/test_ceteris_paribus.py @@ -14,7 +14,6 @@ class CeterisParibusTestTitanic(unittest.TestCase): - def setUp(self): data = dx.datasets.load_titanic() data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived) diff --git a/python/dalex/test/test_model_performance.py b/python/dalex/test/test_model_performance.py index 8c902316a..8dd180f7d 100644 --- a/python/dalex/test/test_model_performance.py +++ b/python/dalex/test/test_model_performance.py @@ -45,17 +45,17 @@ def setUp(self): self.exp2 = dx.Explainer(clf, self.X, self.y, label="model2", verbose=False) def test_constructor(self): - self.assertIsInstance(self.exp.model_performance('classification'), (dx.dataset_level.ModelPerformance,)) - self.assertIsInstance(self.exp.model_performance('classification').result, (pd.DataFrame,)) - self.assertEqual(self.exp.model_performance('classification').result.shape[0], 1) - self.assertTrue(np.isin(['recall', 'precision', 'f1', 'accuracy', 'auc'], - self.exp.model_performance('classification').result.columns).all()) - - self.assertIsInstance(self.exp.model_performance('regression'), (dx.dataset_level.ModelPerformance,)) - self.assertIsInstance(self.exp.model_performance('regression').result, (pd.DataFrame,)) - self.assertEqual(self.exp.model_performance('regression').result.shape[0], 1) - self.assertTrue(np.isin(['mse', 'rmse', 'r2', 'mae', 'mad'], - self.exp.model_performance('regression').result.columns).all()) + case1 = self.exp.model_performance('classification') + self.assertIsInstance(case1, (dx.dataset_level.ModelPerformance,)) + self.assertIsInstance(case1.result, (pd.DataFrame,)) + self.assertEqual(case1.result.shape[0], 1) + self.assertTrue(np.isin(['recall', 'precision', 'f1', 'accuracy', 'auc'], case1.result.columns).all()) + + case2 = self.exp.model_performance('regression') + self.assertIsInstance(case2, (dx.dataset_level.ModelPerformance,)) + self.assertIsInstance(case2.result, (pd.DataFrame,)) + self.assertEqual(case2.result.shape[0], 1) + self.assertTrue(np.isin(['mse', 'rmse', 'r2', 'mae', 'mad'], case2.result.columns).all()) def test_plot(self): diff --git a/python/dalex/test/test_predict_surrogate b/python/dalex/test/test_predict_surrogate new file mode 100644 index 000000000..aeadde4e6 --- /dev/null +++ b/python/dalex/test/test_predict_surrogate @@ -0,0 +1,49 @@ +import unittest + +import numpy as np +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.neural_network import MLPRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder + +import dalex as dx + + +class PredictSurrogateTestTitanic(unittest.TestCase): + def setUp(self): + data = dx.datasets.load_titanic() + data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived) + + self.X = data.drop(columns='survived') + self.y = data.survived + + numeric_features = ['age', 'fare', 'sibsp', 'parch'] + numeric_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler())]) + + categorical_features = ['gender', 'class', 'embarked'] + categorical_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), + ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + + preprocessor = ColumnTransformer( + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features)]) + + clf = Pipeline(steps=[('preprocessor', preprocessor), + ('classifier', MLPRegressor(hidden_layer_sizes=(150, 100, 50), + max_iter=500, random_state=0))]) + + clf.fit(self.X, self.y) + + self.exp = dx.Explainer(clf, self.X, self.y, verbose=False) + + def test(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/dalex/test/test_variable_importance.py b/python/dalex/test/test_variable_importance.py index c97141a36..7c10d7a8d 100644 --- a/python/dalex/test/test_variable_importance.py +++ b/python/dalex/test/test_variable_importance.py @@ -128,14 +128,14 @@ def test_calculate_variable_importance(self): vi[0].columns).all()) def test_constructor(self): - self.assertIsInstance(self.exp.model_parts(), (dx.dataset_level.VariableImportance,)) - self.assertIsInstance(self.exp.model_parts().result, (pd.DataFrame,)) - self.assertEqual(list(self.exp.model_parts().result.columns), - ['variable', 'dropout_loss', 'label']) - - vi = self.exp.model_parts(keep_raw_permutations=True) - self.assertTrue(hasattr(vi, 'permutation')) - self.assertIsInstance(vi.permutation, pd.DataFrame) + case1 = self.exp.model_parts(), + self.assertIsInstance(case1, (dx.dataset_level.VariableImportance,)) + self.assertIsInstance(case1.result, (pd.DataFrame,)) + self.assertEqual(list(case1.result.columns), ['variable', 'dropout_loss', 'label']) + + case2 = self.exp.model_parts(keep_raw_permutations=True) + self.assertTrue(hasattr(case2, 'permutation')) + self.assertIsInstance(case2.permutation, pd.DataFrame) def test_variables_and_variable_groups(self): From 5dcfe6f879682f2080670a6b6633fad49ba07536 Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 22:26:07 +0200 Subject: [PATCH 09/11] add tests to predict_surrogate --- python/dalex/test/test_model_diagnostics.py | 12 +-- python/dalex/test/test_predict_surrogate | 76 +++++++++++-------- python/dalex/test/test_variable_importance.py | 2 +- 3 files changed, 53 insertions(+), 37 deletions(-) diff --git a/python/dalex/test/test_model_diagnostics.py b/python/dalex/test/test_model_diagnostics.py index f0efbedc5..788bd03be 100644 --- a/python/dalex/test/test_model_diagnostics.py +++ b/python/dalex/test/test_model_diagnostics.py @@ -62,15 +62,17 @@ def test_constructor(self): def test_plot(self): - case1 = self.exp.model_diagnostics() - case2 = self.exp.model_diagnostics(variables=['fare', 'embarked']) + case1 = self.exp.model_diagnostics(variables=['fare', 'embarked']) + case2 = self.exp.model_diagnostics() + case3 = self.exp2.model_diagnostics() self.assertIsInstance(case1, dx.dataset_level.ResidualDiagnostics) self.assertIsInstance(case2, dx.dataset_level.ResidualDiagnostics) + self.assertIsInstance(case3, dx.dataset_level.ResidualDiagnostics) - fig1 = case1.plot(title="test1", show=False) - fig2 = case2.plot(case1, variable="abs_residuals", yvariable="y", show=False) - fig3 = case2.plot(smooth=False, line_width=6, marker_size=1, variable="fare") + fig1 = case1.plot(title="test1", variable="fare", show=False) + fig2 = case2.plot(case3, variable="sibsp", yvariable="abs_residuals", show=False) + fig3 = case2.plot(smooth=False, line_width=6, marker_size=1, variable="age", show=False) self.assertIsInstance(fig1, Figure) self.assertIsInstance(fig2, Figure) diff --git a/python/dalex/test/test_predict_surrogate b/python/dalex/test/test_predict_surrogate index aeadde4e6..4c13a4e50 100644 --- a/python/dalex/test/test_predict_surrogate +++ b/python/dalex/test/test_predict_surrogate @@ -1,48 +1,62 @@ import unittest -import numpy as np -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.neural_network import MLPRegressor -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder +from sklearn.neural_network import MLPClassifier +from sklearn.ensemble import RandomForestRegressor +from sklearn.preprocessing import LabelEncoder import dalex as dx - +import lime class PredictSurrogateTestTitanic(unittest.TestCase): def setUp(self): data = dx.datasets.load_titanic() - data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived) - - self.X = data.drop(columns='survived') + self.X = data.drop(columns=['survived', 'class', 'embarked']) self.y = data.survived + self.X.gender = LabelEncoder().fit_transform(self.X.gender) - numeric_features = ['age', 'fare', 'sibsp', 'parch'] - numeric_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='median')), - ('scaler', StandardScaler())]) - - categorical_features = ['gender', 'class', 'embarked'] - categorical_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore'))]) - - preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features)]) - - clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', MLPRegressor(hidden_layer_sizes=(150, 100, 50), - max_iter=500, random_state=0))]) + model = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=400, random_state=0) + model.fit(self.X, self.y) + self.exp = dx.Explainer(model, self.X, self.y, verbose=False) - clf.fit(self.X, self.y) + data2 = dx.datasets.load_fifa() + self.X2 = data2.drop(["nationality", "overall", "potential", + "value_eur", "wage_eur"], axis=1).iloc[0:2000, 0:10] + self.y2 = data2['value_eur'].iloc[0:2000] - self.exp = dx.Explainer(clf, self.X, self.y, verbose=False) + model2 = RandomForestRegressor(random_state=0) + model2.fit(self.X2, self.y2) + self.exp2 = dx.Explainer(model2, self.X2, self.y2, verbose=False) def test(self): - pass + case1 = self.exp.predict_surrogate(new_observation=self.X.iloc[1, :], + feature_names=self.X.columns) + case2 = self.exp.predict_surrogate(new_observation=self.X.iloc[1:2, :], + mode='classification', + feature_names=self.X.columns, + discretize_continuous=True, + num_features=4) + case3 = self.exp.predict_surrogate(new_observation=self.X.iloc[1:2, :].to_numpy(), + feature_names=self.X.columns, + kernel_width=2, + num_samples=50) + case4 = self.exp2.predict_surrogate(new_observation=self.X2.iloc[1, :], + feature_names=self.X2.columns) + case5 = self.exp2.predict_surrogate(new_observation=self.X2.iloc[1:2, :], + mode='regression', + feature_names=self.X2.columns, + discretize_continuous=True, + num_features=4) + case6 = self.exp2.predict_surrogate(new_observation=self.X2.iloc[1:2, :].to_numpy(), + feature_names=self.X2.columns, + kernel_width=2, + num_samples=50) + + self.assertIsInstance(case1, lime.explanation.Explanation) + self.assertIsInstance(case2, lime.explanation.Explanation) + self.assertIsInstance(case3, lime.explanation.Explanation) + self.assertIsInstance(case4, lime.explanation.Explanation) + self.assertIsInstance(case5, lime.explanation.Explanation) + self.assertIsInstance(case6, lime.explanation.Explanation) if __name__ == '__main__': diff --git a/python/dalex/test/test_variable_importance.py b/python/dalex/test/test_variable_importance.py index 7c10d7a8d..0c478a6b2 100644 --- a/python/dalex/test/test_variable_importance.py +++ b/python/dalex/test/test_variable_importance.py @@ -128,7 +128,7 @@ def test_calculate_variable_importance(self): vi[0].columns).all()) def test_constructor(self): - case1 = self.exp.model_parts(), + case1 = self.exp.model_parts() self.assertIsInstance(case1, (dx.dataset_level.VariableImportance,)) self.assertIsInstance(case1.result, (pd.DataFrame,)) self.assertEqual(list(case1.result.columns), ['variable', 'dropout_loss', 'label']) From 494f0bf539f8f9eeade006f92c94da6ec7bf7c94 Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 22:32:03 +0200 Subject: [PATCH 10/11] fix test --- python/dalex/test/test_model_diagnostics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dalex/test/test_model_diagnostics.py b/python/dalex/test/test_model_diagnostics.py index 788bd03be..b1198085b 100644 --- a/python/dalex/test/test_model_diagnostics.py +++ b/python/dalex/test/test_model_diagnostics.py @@ -56,7 +56,7 @@ def test_constructor(self): self.assertIsInstance(case2, (dx.dataset_level.ResidualDiagnostics,)) self.assertIsInstance(case2.result, (pd.DataFrame,)) self.assertEqual(case2.result.shape[0], self.exp.data.shape[0]) - self.assertTrue(np.isin(['y', 'y_hat', 'residuals', 'abs_residuals', 'label', 'ids', 'age', 'gender'], + self.assertTrue(np.isin(['y', 'y_hat', 'residuals', 'abs_residuals', 'label', 'ids', 'age', 'class'], case2.result.columns).all()) self.assertFalse(np.isin(['fare', 'sibsp', 'gender', 'embarked'], case2.result.columns).any()) From d312ae72286fef7f1bbb0b9056690316c9d805d2 Mon Sep 17 00:00:00 2001 From: Hubert Baniecki Date: Tue, 18 Aug 2020 22:50:10 +0200 Subject: [PATCH 11/11] fix ci --- .../test/{test_predict_surrogate => test_predict_surrogate.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/dalex/test/{test_predict_surrogate => test_predict_surrogate.py} (100%) diff --git a/python/dalex/test/test_predict_surrogate b/python/dalex/test/test_predict_surrogate.py similarity index 100% rename from python/dalex/test/test_predict_surrogate rename to python/dalex/test/test_predict_surrogate.py