Skip to content

Commit

Permalink
DEPR: DataFrame(floaty, dtype=inty) match Series (pandas-dev#41770)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and JulianWgs committed Jul 3, 2021
1 parent deb471a commit d53dfd8
Show file tree
Hide file tree
Showing 10 changed files with 90 additions and 27 deletions.
4 changes: 3 additions & 1 deletion asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,9 @@ class Rank:
]

def setup(self, dtype):
self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype)
self.df = DataFrame(
np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype
)

def time_rank(self, dtype):
    """Time ``rank()`` on the frame stored in ``self.df``.

    ``dtype`` is not read in the body.
    """
    frame = self.df
    frame.rank()
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ Deprecations
- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`)
- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)
- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`)
- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`)
- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`)
Expand Down
18 changes: 18 additions & 0 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
Dtype,
DtypeObj,
)
from pandas.errors import IntCastingNaNError

from pandas.core.dtypes.base import (
ExtensionDtype,
Expand Down Expand Up @@ -511,7 +512,24 @@ def sanitize_array(
# possibility of nan -> garbage
try:
subarr = _try_cast(data, dtype, copy, True)
except IntCastingNaNError:
subarr = np.array(data, copy=copy)
except ValueError:
if not raise_cast_failure:
# i.e. called via DataFrame constructor
warnings.warn(
"In a future version, passing float-dtype values and an "
"integer dtype to DataFrame will retain floating dtype "
"if they cannot be cast losslessly (matching Series behavior). "
"To retain the old behavior, use DataFrame(data).astype(dtype)",
FutureWarning,
stacklevel=4,
)
# GH#40110 until the deprecation is enforced, we _don't_
# ignore the dtype for DataFrame, and _do_ cast even though
# it is lossy.
dtype = cast(np.dtype, dtype)
return np.array(data, dtype=dtype, copy=copy)
subarr = np.array(data, copy=copy)
else:
# we will try to copy by-definition here
Expand Down
19 changes: 18 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2088,7 +2088,13 @@ def maybe_cast_to_integer_array(
if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
raise OverflowError("Trying to coerce negative values to unsigned integers")

if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype):
if is_float_dtype(arr.dtype):
if not np.isfinite(arr).all():
raise IntCastingNaNError(
"Cannot convert non-finite values (NA or inf) to integer"
)
raise ValueError("Trying to coerce float values to integers")
if is_object_dtype(arr.dtype):
raise ValueError("Trying to coerce float values to integers")

if casted.dtype < arr.dtype:
Expand All @@ -2102,6 +2108,17 @@ def maybe_cast_to_integer_array(
)
return casted

if arr.dtype.kind in ["m", "M"]:
# test_constructor_maskedarray_nonfloat
warnings.warn(
f"Constructing Series or DataFrame from {arr.dtype} values and "
f"dtype={dtype} is deprecated and will raise in a future version. "
"Use values.view(dtype) instead",
FutureWarning,
stacklevel=find_stack_level(),
)
return casted

# No known cases that get here, but raising explicitly to cover our bases.
raise ValueError(f"values cannot be losslessly cast to {dtype}")

Expand Down
24 changes: 6 additions & 18 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,9 @@
DtypeObj,
Manager,
)
from pandas.errors import IntCastingNaNError

from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
construct_1d_ndarray_preserving_na,
maybe_cast_to_datetime,
maybe_convert_platform,
maybe_infer_to_datetimelike,
Expand Down Expand Up @@ -303,22 +301,12 @@ def ndarray_to_mgr(
shape = values.shape
flat = values.ravel()

if not is_integer_dtype(dtype):
# TODO: skipping integer_dtype is needed to keep the tests passing,
# not clear it is correct
# Note: we really only need _try_cast, but keeping to exposed funcs
values = sanitize_array(
flat, None, dtype=dtype, copy=copy, raise_cast_failure=True
)
else:
try:
values = construct_1d_ndarray_preserving_na(
flat, dtype=dtype, copy=False
)
except IntCastingNaNError:
# following Series, we ignore the dtype and retain floating
# values instead of casting nans to meaningless ints
pass
# GH#40110 see similar check inside sanitize_array
rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

values = sanitize_array(
flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf
)

values = values.reshape(shape)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_sort_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,15 +603,15 @@ def test_sort_index_level_large_cardinality(self):

# GH#2684 (int64)
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)
df = DataFrame(np.random.randn(4000).astype("int64"), index=index)

# it works!
result = df.sort_index(level=0)
assert result.index._lexsort_depth == 3

# GH#2684 (int32)
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)
df = DataFrame(np.random.randn(4000).astype("int32"), index=index)

# it works!
result = df.sort_index(level=0)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,9 @@ def create_cols(name):
np.random.randn(100, 5), dtype="float64", columns=create_cols("float")
)
df_int = DataFrame(
np.random.randn(100, 5), dtype="int64", columns=create_cols("int")
np.random.randn(100, 5).astype("int64"),
dtype="int64",
columns=create_cols("int"),
)
df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool"))
df_object = DataFrame(
Expand Down Expand Up @@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self):
tm.assert_frame_equal(result, df)

df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
df_int = DataFrame(np.random.randn(1000, 3), dtype="int64")
df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
df_bool = DataFrame(True, index=df_float.index, columns=range(3))
df_object = DataFrame("foo", index=df_float.index, columns=range(3))
df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
Expand Down
34 changes: 33 additions & 1 deletion pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import functools
import itertools
import re
import warnings

import numpy as np
import numpy.ma as ma
Expand Down Expand Up @@ -999,7 +1000,17 @@ def test_constructor_maskedarray_nonfloat(self):
assert isna(frame).values.all()

# cast type
frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
msg = r"datetime64\[ns\] values and dtype=int64"
with tm.assert_produces_warning(FutureWarning, match=msg):
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
category=DeprecationWarning,
message="elementwise comparison failed",
)
frame = DataFrame(
mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64
)
assert frame.values.dtype == np.int64

# Check non-masked values
Expand Down Expand Up @@ -2484,6 +2495,27 @@ def test_nested_list_columns(self):
tm.assert_frame_equal(result, expected)


class TestDataFrameConstructorWithDtypeCoercion:
def test_floating_values_integer_dtype(self):
# GH#40110 make DataFrame behavior with arraylike floating data and
# inty dtype match Series behavior

arr = np.random.randn(10, 5)

msg = "if they cannot be cast losslessly"
with tm.assert_produces_warning(FutureWarning, match=msg):
DataFrame(arr, dtype="i8")

with tm.assert_produces_warning(None):
# if they can be cast losslessly, no warning
DataFrame(arr.round(), dtype="i8")

# with NaNs, we already have the correct behavior, so no warning
arr[0, 0] = np.nan
with tm.assert_produces_warning(None):
DataFrame(arr, dtype="i8")


class TestDataFrameConstructorWithDatetimeTZ:
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_construction_preserves_tzaware_dtypes(self, tz):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_nonunique_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def test_multi_dtype2(self):
def test_dups_across_blocks(self, using_array_manager):
# dups across blocks
df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
df_int = DataFrame(np.random.randn(10, 3).astype("int64"))
df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
df_dt = DataFrame(
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,10 @@ def test_setitem_series_int8(self, val, exp_dtype, request):
)
request.node.add_marker(mark)

exp = pd.Series([1, val, 3, 4], dtype=np.int8)
warn = None if exp_dtype is np.int8 else FutureWarning
msg = "Values are too large to be losslessly cast to int8"
with tm.assert_produces_warning(warn, match=msg):
exp = pd.Series([1, val, 3, 4], dtype=np.int8)
self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)

@pytest.mark.parametrize(
Expand Down

0 comments on commit d53dfd8

Please sign in to comment.