Skip to content

Commit

Permalink
FIX IndexError in fetch_openml('zoo')
Browse files Browse the repository at this point in the history
The shape extraction from data_qualities was using NumberOfFeatures,
which excluded the ignored features.
This exclusion caused a bug in the data conversion, since we tried
to reshape the whole dataset with a lower number of features.

This fix uses data_features to include the ignored features in the shape
extraction.

Fixes scikit-learn#14340
  • Loading branch information
HABCHI Sarra committed Aug 12, 2019
1 parent c459b8b commit c6fdb40
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 12 deletions.
39 changes: 27 additions & 12 deletions sklearn/datasets/openml.py
Expand Up @@ -424,17 +424,34 @@ def _get_data_qualities(data_id, data_home):
return None


def _get_data_shape(data_qualities):
# Using the data_info dictionary from _get_data_info_by_name to extract
# the number of samples / features
def _get_data_instances(data_qualities):
"""
Retrieve the number of instances (samples) from data qualities
Parameters
----------
data_qualities : list
used to retrieve the number of instances (samples) in the dataset
Returns
-------
instances : int
the number of samples in the dataset or -1 if data qualities are
unavailable
"""
# If the data qualities are not available, we return -1
default_instances = -1

if data_qualities is None:
return None
return default_instances

qualities = {d['name']: d['value'] for d in data_qualities}
try:
return (int(float(qualities['NumberOfInstances'])),
int(float(qualities['NumberOfFeatures'])))
instances = int(float(qualities['NumberOfInstances']))
except AttributeError:
return None
return default_instances
else:
return instances


def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
Expand Down Expand Up @@ -708,12 +725,10 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,

# determine arff encoding to return
if not return_sparse:
# The shape must include the ignored features to keep the right indexes
# during the arff data conversion.
data_qualities = _get_data_qualities(data_id, data_home)
shape = _get_data_shape(data_qualities)
# if the data qualities were not available, we can still get the
# n_features from the feature list, with the n_samples unknown
if shape is None:
shape = (-1, len(features_list))
shape = _get_data_instances(data_qualities), len(features_list)
else:
shape = None

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
13 changes: 13 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Expand Up @@ -1158,3 +1158,16 @@ def test_fetch_openml_raises_illegal_argument():

assert_raise_message(ValueError, "Neither name nor data_id are provided. "
"Please provide name or data_id.", fetch_openml)


@pytest.mark.parametrize('gzip_response', [True, False])
def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
    # Data id 62 corresponds to the OpenML "zoo" dataset.
    zoo_id = 62
    _monkey_patch_webbased_functions(monkeypatch, zoo_id, gzip_response)

    bunch = sklearn.datasets.fetch_openml(data_id=zoo_id, cache=False)
    assert bunch is not None
    # Out of the dataset's 17 features one is marked as ignored, so the
    # returned data must expose only the remaining 16 columns.
    assert bunch['data'].shape == (101, 16)

0 comments on commit c6fdb40

Please sign in to comment.