Skip to content

Commit

Permalink
FIX IndexError in fetch_openml('zoo')
Browse files Browse the repository at this point in the history
The shape extraction from data_qualities was using NumberOfFeatures,
which excluded the ignored features.
This exclusion caused a bug in the data conversion, since we tried
to reshape the whole dataset with a lower number of features.

This fix uses data_features to include the ignored features in the shape
extraction.

Fixes scikit-learn#14340
  • Loading branch information
HABCHI Sarra committed Aug 13, 2019
1 parent 9014a6f commit e6e8a1d
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 14 deletions.
37 changes: 23 additions & 14 deletions sklearn/datasets/openml.py
Expand Up @@ -424,17 +424,28 @@ def _get_data_qualities(data_id, data_home):
return None


def _get_data_shape(data_qualities):
# Using the data_info dictionary from _get_data_info_by_name to extract
# the number of samples / features
def _get_num_samples(data_qualities):
"""Get the number of samples from data qualities
Parameters
----------
data_qualities : list of dict
Used to retrieve the number of instances (samples) in the dataset
Returns
-------
instances : int
The number of samples in the dataset or -1 if data qualities are
unavailable
"""
# If the data qualities are unavailable, we return -1
default_n_samples = -1

if data_qualities is None:
return None
return default_n_samples

qualities = {d['name']: d['value'] for d in data_qualities}
try:
return (int(float(qualities['NumberOfInstances'])),
int(float(qualities['NumberOfFeatures'])))
except AttributeError:
return None
return int(float(qualities.get('NumberOfInstances', default_n_samples)))


def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
Expand Down Expand Up @@ -708,12 +719,10 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,

# determine arff encoding to return
if not return_sparse:
# The shape must include the ignored features to keep the right indexes
# during the arff data conversion.
data_qualities = _get_data_qualities(data_id, data_home)
shape = _get_data_shape(data_qualities)
# if the data qualities were not available, we can still get the
# n_features from the feature list, with the n_samples unknown
if shape is None:
shape = (-1, len(features_list))
shape = _get_num_samples(data_qualities), len(features_list)
else:
shape = None

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Expand Up @@ -1158,3 +1158,18 @@ def test_fetch_openml_raises_illegal_argument():

assert_raise_message(ValueError, "Neither name nor data_id are provided. "
"Please provide name or data_id.", fetch_openml)


@pytest.mark.parametrize('gzip_response', [True, False])
def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
    # Regression test for #14340: the shape used during arff conversion
    # must account for ignored features.
    data_id = 62  # OpenML ID of the ZOO dataset
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    bunch = sklearn.datasets.fetch_openml(data_id=data_id, cache=False)
    assert bunch is not None
    # ZOO has 17 features, one of which ('animal') is ignored; the
    # ignored feature must not show up in the returned Bunch.
    assert bunch['data'].shape == (101, 16)
    assert 'animal' not in bunch['feature_names']

0 comments on commit e6e8a1d

Please sign in to comment.