Skip to content

Commit

Permalink
FIX IndexError in fetch_openml('zoo')
Browse files Browse the repository at this point in the history
The shape extraction from data_qualities was using NumberOfFeatures,
which excluded the ignored features.
This exclusion caused a bug in the data conversion, since we tried
to reshape the whole dataset with a lower number of features.

This fix uses data_features to include the ignored features in the shape
extraction.

Fixes scikit-learn#14340
  • Loading branch information
HABCHI Sarra committed Aug 13, 2019
1 parent 9014a6f commit e6e8a1d
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 14 deletions.
37 changes: 23 additions & 14 deletions sklearn/datasets/openml.py
Expand Up @@ -424,17 +424,28 @@ def _get_data_qualities(data_id, data_home):
return None


def _get_data_shape(data_qualities):
# Using the data_info dictionary from _get_data_info_by_name to extract
# the number of samples / features
def _get_num_samples(data_qualities):
"""Get the number of samples from data qualities
Parameters
----------
data_qualities : list of dict
Used to retrieve the number of instances (samples) in the dataset
Returns
-------
instances : int
The number of samples in the dataset or -1 if data qualities are
unavailable
"""
# If the data qualities are unavailable, we return -1
default_n_samples = -1

if data_qualities is None:
return None
return default_n_samples

qualities = {d['name']: d['value'] for d in data_qualities}
try:
return (int(float(qualities['NumberOfInstances'])),
int(float(qualities['NumberOfFeatures'])))
except AttributeError:
return None
return int(float(qualities.get('NumberOfInstances', default_n_samples)))


def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
Expand Down Expand Up @@ -708,12 +719,10 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,

# determine arff encoding to return
if not return_sparse:
# The shape must include the ignored features to keep the right indexes
# during the arff data conversion.
data_qualities = _get_data_qualities(data_id, data_home)
shape = _get_data_shape(data_qualities)
# if the data qualities were not available, we can still get the
# n_features from the feature list, with the n_samples unknown
if shape is None:
shape = (-1, len(features_list))
shape = _get_num_samples(data_qualities), len(features_list)
else:
shape = None

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Expand Up @@ -1158,3 +1158,18 @@ def test_fetch_openml_raises_illegal_argument():

assert_raise_message(ValueError, "Neither name nor data_id are provided. "
"Please provide name or data_id.", fetch_openml)


@pytest.mark.parametrize('gzip_response', [True, False])
def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
    # Regression test for #14340: the shape used during arff conversion
    # must account for ignored features.
    data_id = 62  # OpenML ID of the ZOO dataset
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    bunch = sklearn.datasets.fetch_openml(data_id=data_id, cache=False)
    assert bunch is not None
    # ZOO has 17 features, one of which ('animal') is ignored; the
    # ignored feature must not show up in the returned Bunch.
    assert bunch['data'].shape == (101, 16)
    assert 'animal' not in bunch['feature_names']

0 comments on commit e6e8a1d

Please sign in to comment.