Skip to content

Commit

Permalink
FIX IndexError in fetch_openml('zoo')
Browse files Browse the repository at this point in the history
The shape extraction from data_qualities was using NumberOfFeatures,
which excluded the ignored features.
This exclusion caused a bug in the data conversion, since we tried
to reshape the whole dataset with a lower number of features.

This fix uses data_features to include the ignored features in the shape
extraction.

Fixes scikit-learn#14340
  • Loading branch information
HABCHI Sarra committed Aug 12, 2019
1 parent c459b8b commit c6fdb40
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 12 deletions.
39 changes: 27 additions & 12 deletions sklearn/datasets/openml.py
Expand Up @@ -424,17 +424,34 @@ def _get_data_qualities(data_id, data_home):
return None


def _get_data_shape(data_qualities):
# Using the data_info dictionary from _get_data_info_by_name to extract
# the number of samples / features
def _get_data_instances(data_qualities):
"""
Retrieve the number of instances (samples) from data qualities
Parameters
----------
data_qualities : list
used to retrieve the number of instances (samples) in the dataset
Returns
-------
instances : int
the number of samples in the dataset or -1 if data qualities are
unavailable
"""
# If the data qualities are not available, we return -1
default_instances = -1

if data_qualities is None:
return None
return default_instances

qualities = {d['name']: d['value'] for d in data_qualities}
try:
return (int(float(qualities['NumberOfInstances'])),
int(float(qualities['NumberOfFeatures'])))
instances = int(float(qualities['NumberOfInstances']))
except AttributeError:
return None
return default_instances
else:
return instances


def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
Expand Down Expand Up @@ -708,12 +725,10 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,

# determine arff encoding to return
if not return_sparse:
# The shape must include the ignored features to keep the right indexes
# during the arff data conversion.
data_qualities = _get_data_qualities(data_id, data_home)
shape = _get_data_shape(data_qualities)
# if the data qualities were not available, we can still get the
# n_features from the feature list, with the n_samples unknown
if shape is None:
shape = (-1, len(features_list))
shape = _get_data_instances(data_qualities), len(features_list)
else:
shape = None

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
13 changes: 13 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Expand Up @@ -1158,3 +1158,16 @@ def test_fetch_openml_raises_illegal_argument():

assert_raise_message(ValueError, "Neither name nor data_id are provided. "
"Please provide name or data_id.", fetch_openml)


@pytest.mark.parametrize('gzip_response', [True, False])
def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
    # Data id 62 corresponds to the OpenML "zoo" dataset.
    zoo_id = 62
    _monkey_patch_webbased_functions(monkeypatch, zoo_id, gzip_response)

    bunch = sklearn.datasets.fetch_openml(data_id=zoo_id, cache=False)
    assert bunch is not None
    # Out of the dataset's 17 features one is marked as ignored, so the
    # returned data must expose only the remaining 16 columns.
    assert bunch['data'].shape == (101, 16)

0 comments on commit c6fdb40

Please sign in to comment.