Allow datetime columns #28

Closed · armgilles opened this issue Sep 20, 2022 · 0 comments
Description of Problem:

You can't pass datetime columns to Eurybia:

...
sd = SmartDrift(
  df_current=df_current,   # with datetime column
  df_baseline=df_baseline  # with datetime column
)
sd.compile(full_validation=True)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File _catboost.pyx:1130, in _catboost._FloatOrNan()

TypeError: float() argument must be a string or a number, not 'Timestamp'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
File _catboost.pyx:2275, in _catboost.get_float_feature()

File _catboost.pyx:1132, in _catboost._FloatOrNan()

TypeError: Cannot convert obj 2022-01-01 00:00:00 to float

During handling of the above exception, another exception occurred:

CatBoostError                             Traceback (most recent call last)
Cell In [25], line 1
----> 1 sd.compile(full_validation=True)

File ~/github/eurybia/eurybia/core/smartdrift.py:305, in SmartDrift.compile(self, full_validation, ignore_cols, sampling, sample_size, datadrift_file, date_compile_auc, hyperparameter, attr_importance)
    302 x_test = test[varz]
    303 y_test = test[self._datadrift_target]
--> 305 xpl.compile(x=x_test)
    306 xpl.compute_features_import(force=True)
    308 self.xpl = xpl

File ~/anaconda3/envs/eurybia/lib/python3.8/site-packages/shapash/explainer/smart_explainer.py:267, in SmartExplainer.compile(self, x, contributions, y_pred)
    264 self.x_init = inverse_transform(self.x_encoded, self.preprocessing)
    265 self.y_pred = check_ypred(self.x_init, y_pred)
--> 267 self._get_contributions_from_backend_or_user(x, contributions)
    268 self.check_contributions()
    270 self.columns_dict = {i: col for i, col in enumerate(self.x_init.columns)}

File ~/anaconda3/envs/eurybia/lib/python3.8/site-packages/shapash/explainer/smart_explainer.py:288, in SmartExplainer._get_contributions_from_backend_or_user(self, x, contributions)
    285 def _get_contributions_from_backend_or_user(self, x, contributions):
    286     # Computing contributions using backend
    287     if contributions is None:
--> 288         self.explain_data = self.backend.run_explainer(x=x)
    289         self.contributions = self.backend.get_local_contributions(x=x, explain_data=self.explain_data)
    290     else:

File ~/anaconda3/envs/eurybia/lib/python3.8/site-packages/shapash/backend/shap_backend.py:34, in ShapBackend.run_explainer(self, x)
     20 def run_explainer(self, x: pd.DataFrame) -> dict:
     21     """
     22     Computes and returns local contributions using Shap explainer
     23 
   (...)
     32         local contributions
     33     """
---> 34     contributions = self.explainer(x, **self.explainer_compute_args)
     35     explain_data = dict(contributions=contributions.values)
     36     return explain_data

File ~/anaconda3/envs/eurybia/lib/python3.8/site-packages/shap/explainers/_tree.py:217, in Tree.__call__(self, X, y, interactions, check_additivity)
    214     feature_names = getattr(self, "data_feature_names", None)
    216 if not interactions:
--> 217     v = self.shap_values(X, y=y, from_call=True, check_additivity=check_additivity, approximate=self.approximate)
    218     if type(v) is list:
    219         v = np.stack(v, axis=-1) # put outputs at the end

File ~/anaconda3/envs/eurybia/lib/python3.8/site-packages/shap/explainers/_tree.py:367, in Tree.shap_values(self, X, y, tree_limit, approximate, check_additivity, from_call)
    365     import catboost
    366     if type(X) != catboost.Pool:
--> 367         X = catboost.Pool(X, cat_features=self.model.cat_feature_indices)
    368     phi = self.model.original_model.get_feature_importance(data=X, fstr_type='ShapValues')
    370 # note we pull off the last column and keep it as our expected_value

File ~/anaconda3/envs/eurybia/lib/python3.8/site-packages/catboost/core.py:790, in Pool.__init__(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count, log_cout, log_cerr)
    784         if isinstance(feature_names, PATH_TYPES):
    785             raise CatBoostError(
    786                 "feature_names must be None or have non-string type when the pool is created from "
    787                 "python objects."
    788             )
--> 790         self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
    791                    group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
    792 super(Pool, self).__init__()

File ~/anaconda3/envs/eurybia/lib/python3.8/site-packages/catboost/core.py:1411, in Pool._init(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
   1409 if feature_tags is not None:
   1410     feature_tags = self._check_transform_tags(feature_tags, feature_names)
-> 1411 self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
   1412                 group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)

File _catboost.pyx:3941, in _catboost._PoolBase._init_pool()

File _catboost.pyx:4008, in _catboost._PoolBase._init_pool()

File _catboost.pyx:3914, in _catboost._PoolBase._init_objects_order_layout_pool()

File _catboost.pyx:3422, in _catboost._set_data()

File _catboost.pyx:3405, in _catboost._set_data_from_generic_matrix()

File _catboost.pyx:2277, in _catboost.get_float_feature()

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="2022-01-01 00:00:00": Cannot convert obj 2022-01-01 00:00:00 to float

However, in some use cases Eurybia should be able to analyse the differences between two datasets that carry temporal information (seasonal patterns, for example). If users only want an analysis of the differences between the two datasets, this should work (via the datadrift AUC). But if users want to reuse a deployed model to get feature importances, Eurybia should raise an error and invite them to drop the datetime columns, since that computation cannot be done with them.
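
Until this is supported, a possible workaround (a sketch only, assuming the datetime columns carry no signal the analysis needs, and that df_current / df_baseline are the user's pandas DataFrames) is to drop those columns from both datasets before building the SmartDrift:

import pandas as pd

# Drop every datetime column from both datasets before building the SmartDrift
datetime_cols = df_current.select_dtypes(include=["datetime", "datetimetz"]).columns
df_current = df_current.drop(columns=datetime_cols)
df_baseline = df_baseline.drop(columns=datetime_cols, errors="ignore")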

Overview of the Solution:

  • If there are datetime columns in the datasets, automatically create year / month / day features from each of them and drop the original column (see the sketch after this list).
  • If deployed_model is filled in SmartDrift, raise an error instead.
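
A minimal sketch of the first bullet, assuming the transformation is applied to both df_current and df_baseline before compilation (the helper name expand_datetime_columns is only illustrative, not an existing Eurybia function):

import pandas as pd

def expand_datetime_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Replace each datetime column with year / month / day features."""
    df = df.copy()
    datetime_cols = df.select_dtypes(include=["datetime", "datetimetz"]).columns
    for col in datetime_cols:
        df[f"{col}_year"] = df[col].dt.year
        df[f"{col}_month"] = df[col].dt.month
        df[f"{col}_day"] = df[col].dt.day
        df = df.drop(columns=col)
    return df

# Applied to both datasets before building the SmartDrift
df_current = expand_datetime_columns(df_current)
df_baseline = expand_datetime_columns(df_baseline)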

Examples:

import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from eurybia import SmartDrift

# Create random dataset
date_list = pd.date_range(start='01/01/2022', end='01/30/2022')
X1 = np.random.rand(len(date_list))
X2 = np.random.rand(len(date_list))

df_current = pd.DataFrame(date_list, columns=['date'])
df_current['col1'] = X1 
df_baseline = pd.DataFrame(date_list, columns=['date'])
df_baseline['col1'] = X2

sd = SmartDrift(df_current=df_current,
                df_baseline=df_baseline)
# Datetime columns will be transformed in df_current
# Datetime columns will be transformed in df_baseline

sd.compile(full_validation=True)

# Block the user when a deployed model is provided
# Dummy model (features and target are arbitrary here)
regressor = LGBMRegressor(n_estimators=2).fit(df_baseline[['col1']],
                                              df_baseline['col1'])

sd = SmartDrift(df_current=df_current,
                df_baseline=df_baseline,
                deployed_model=regressor)
sd.compile(full_validation=True)
# Raises an error because the datasets contain datetime columns

Blockers:

Definition of Done:

Tests covering the datetime transformation and the deployed_model error.
