Skip to content

Commit

Permalink
BUG: astype fill_value for SparseArray.astype (pandas-dev#23547)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored and JustinZhengBC committed Nov 14, 2018
1 parent d8826bf commit 3b87703
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 14 deletions.
105 changes: 91 additions & 14 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,83 @@ def is_dtype(cls, dtype):
return True
return isinstance(dtype, np.dtype) or dtype == 'Sparse'

def update_dtype(self, dtype):
"""Convert the SparseDtype to a new dtype.
This takes care of converting the ``fill_value``.
Parameters
----------
dtype : Union[str, numpy.dtype, SparseDtype]
The new dtype to use.
* For a SparseDtype, it is simply returned
* For a NumPy dtype (or str), the current fill value
is converted to the new dtype, and a SparseDtype
with `dtype` and the new fill value is returned.
Returns
-------
SparseDtype
A new SparseDtype with the corret `dtype` and fill value
for that `dtype`.
Raises
------
ValueError
When the current fill value cannot be converted to the
new `dtype` (e.g. trying to convert ``np.nan`` to an
integer dtype).
Examples
--------
>>> SparseDtype(int, 0).update_dtype(float)
Sparse[float64, 0.0]
>>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
Sparse[float64, nan]
"""
cls = type(self)
dtype = pandas_dtype(dtype)

if not isinstance(dtype, cls):
fill_value = astype_nansafe(np.array(self.fill_value),
dtype).item()
dtype = cls(dtype, fill_value=fill_value)

return dtype

@property
def _subtype_with_str(self):
"""
Whether the SparseDtype's subtype should be considered ``str``.
Typically, pandas will store string data in an object-dtype array.
When converting values to a dtype, e.g. in ``.astype``, we need to
be more specific, we need the actual underlying type.
Returns
-------
>>> SparseDtype(int, 1)._subtype_with_str
dtype('int64')
>>> SparseDtype(object, 1)._subtype_with_str
dtype('O')
>>> dtype = SparseDtype(str, '')
>>> dtype.subtype
dtype('O')
>>> dtype._subtype_with_str
str
"""
if isinstance(self.fill_value, compat.string_types):
return type(self.fill_value)
return self.subtype


# ----------------------------------------------------------------------------
# Array

Expand Down Expand Up @@ -614,7 +691,7 @@ def __array__(self, dtype=None, copy=True):
# Can't put pd.NaT in a datetime64[ns]
fill_value = np.datetime64('NaT')
try:
dtype = np.result_type(self.sp_values.dtype, fill_value)
dtype = np.result_type(self.sp_values.dtype, type(fill_value))
except TypeError:
dtype = object

Expand Down Expand Up @@ -996,7 +1073,7 @@ def _take_with_fill(self, indices, fill_value=None):
if len(self) == 0:
# Empty... Allow taking only if all empty
if (indices == -1).all():
dtype = np.result_type(self.sp_values, fill_value)
dtype = np.result_type(self.sp_values, type(fill_value))
taken = np.empty_like(indices, dtype=dtype)
taken.fill(fill_value)
return taken
Expand All @@ -1009,7 +1086,7 @@ def _take_with_fill(self, indices, fill_value=None):
if self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
taken = np.full(sp_indexer.shape, fill_value=fill_value,
dtype=np.result_type(fill_value))
dtype=np.result_type(type(fill_value)))
else:
taken = self.sp_values.take(sp_indexer)

Expand All @@ -1030,12 +1107,13 @@ def _take_with_fill(self, indices, fill_value=None):
result_type = taken.dtype

if m0.any():
result_type = np.result_type(result_type, self.fill_value)
result_type = np.result_type(result_type,
type(self.fill_value))
taken = taken.astype(result_type)
taken[old_fill_indices] = self.fill_value

if m1.any():
result_type = np.result_type(result_type, fill_value)
result_type = np.result_type(result_type, type(fill_value))
taken = taken.astype(result_type)
taken[new_fill_indices] = fill_value

Expand All @@ -1061,7 +1139,7 @@ def _take_without_fill(self, indices):
# edge case in take...
# I think just return
out = np.full(indices.shape, self.fill_value,
dtype=np.result_type(self.fill_value))
dtype=np.result_type(type(self.fill_value)))
arr, sp_index, fill_value = make_sparse(out,
fill_value=self.fill_value)
return type(self)(arr, sparse_index=sp_index,
Expand All @@ -1073,7 +1151,7 @@ def _take_without_fill(self, indices):

if fillable.any():
# TODO: may need to coerce array to fill value
result_type = np.result_type(taken, self.fill_value)
result_type = np.result_type(taken, type(self.fill_value))
taken = taken.astype(result_type)
taken[fillable] = self.fill_value

Expand All @@ -1093,7 +1171,9 @@ def _concat_same_type(cls, to_concat):

fill_value = fill_values[0]

if len(set(fill_values)) > 1:
# np.nan isn't a singleton, so we may end up with multiple
# NaNs here, so we ignore tha all NA case too.
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
warnings.warn("Concatenating sparse arrays with multiple fill "
"values: '{}'. Picking the first and "
"converting the rest.".format(fill_values),
Expand Down Expand Up @@ -1212,13 +1292,10 @@ def astype(self, dtype=None, copy=True):
IntIndex
Indices: array([2, 3], dtype=int32)
"""
dtype = pandas_dtype(dtype)

if not isinstance(dtype, SparseDtype):
dtype = SparseDtype(dtype, fill_value=self.fill_value)

dtype = self.dtype.update_dtype(dtype)
subtype = dtype._subtype_with_str
sp_values = astype_nansafe(self.sp_values,
dtype.subtype,
subtype,
copy=copy)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,34 @@ def test_astype_all(self, any_real_dtype):
tm.assert_numpy_array_equal(np.asarray(res.values),
vals.astype(typ))

@pytest.mark.parametrize('array, dtype, expected', [
(SparseArray([0, 1]), 'float',
SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))),
(SparseArray([0, 1]), bool, SparseArray([False, True])),
(SparseArray([0, 1], fill_value=1), bool,
SparseArray([False, True], dtype=SparseDtype(bool, True))),
pytest.param(
SparseArray([0, 1]), 'datetime64[ns]',
SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
dtype=SparseDtype('datetime64[ns]',
pd.Timestamp('1970'))),
marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
),
(SparseArray([0, 1, 10]), str,
SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
(SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
(SparseArray([0, 1, 0]), object,
SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))),
])
def test_astype_more(self, array, dtype, expected):
result = array.astype(dtype)
tm.assert_sp_array_equal(result, expected)

def test_astype_nan_raises(self):
arr = SparseArray([1.0, np.nan])
with pytest.raises(ValueError, match='Cannot convert non-finite'):
arr.astype(int)

def test_set_fill_value(self):
arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
arr.fill_value = 2
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/arrays/sparse/test_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,23 @@ def test_parse_subtype(string, expected):
def test_construct_from_string_fill_value_raises(string):
with pytest.raises(TypeError, match='fill_value in the string is not'):
SparseDtype.construct_from_string(string)


@pytest.mark.parametrize('original, dtype, expected', [
(SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
(SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
(SparseDtype(int, 1), str, SparseDtype(object, '1')),
(SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
])
def test_update_dtype(original, dtype, expected):
result = original.update_dtype(dtype)
assert result == expected


@pytest.mark.parametrize("original, dtype", [
(SparseDtype(float, np.nan), int),
(SparseDtype(str, 'abc'), int),
])
def test_update_dtype_raises(original, dtype):
with pytest.raises(ValueError):
original.update_dtype(dtype)

0 comments on commit 3b87703

Please sign in to comment.