Skip to content

Commit

Permalink
[python-package] take shallow copy of dataframe in predict (fixes #6195) (#6218)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez committed Dec 7, 2023
1 parent 4aba4fc commit e797985
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
5 changes: 4 additions & 1 deletion python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,6 +789,10 @@ def _data_from_pandas(
if len(data.shape) != 2 or data.shape[0] < 1:
raise ValueError('Input data must be 2 dimensional and non empty.')

# take shallow copy in case we modify categorical columns
# whole column modifications don't change the original df
data = data.copy(deep=False)

# determine feature names
if feature_name == 'auto':
feature_name = [str(col) for col in data.columns]
Expand All @@ -805,7 +809,6 @@ def _data_from_pandas(
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is list
data = data.copy(deep=False) # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
if categorical_feature == 'auto': # use cat cols from DataFrame
categorical_feature = cat_cols_not_ordered
Expand Down
19 changes: 16 additions & 3 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):


@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto'])
@pytest.mark.parametrize('categories', ['seen', 'unseen'])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
    """Check that ``_data_from_pandas`` encodes categoricals without altering the input frame.

    A single-column categorical DataFrame holding the values 'a' and 'b' is
    passed to ``lgb.basic._data_from_pandas`` together with a
    ``pandas_categorical`` list, and two things are verified:

    * the caller's DataFrame still holds the original string values;
    * the returned matrix holds the category codes, with NaN for any value
      whose category is missing from ``pandas_categorical``.

    Parameters
    ----------
    feature_name : list or 'auto'
        Forwarded to ``_data_from_pandas``. When it is a list, its first
        element is also used as the DataFrame column name; with 'auto' the
        column is named 'a'.
    categories : {'seen', 'unseen'}
        'seen' supplies both categories (['a', 'b']) as ``pandas_categorical``;
        'unseen' supplies only ['a'], so 'b' is a category that is not in
        the supplied list.
    """
    pd = pytest.importorskip('pandas')
    X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
    column_name = 'a' if feature_name == 'auto' else feature_name[0]
    # build the frame from a copy so X stays a pristine reference for the
    # "input not modified" comparison below
    df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
    if categories == 'seen':
        pandas_categorical = [['a', 'b']]
    else:
        pandas_categorical = [['a']]
    data = lgb.basic._data_from_pandas(
        data=df,
        feature_name=feature_name,
        categorical_feature="auto",
        pandas_categorical=pandas_categorical,
    )[0]
    # check that the original data wasn't modified
    np.testing.assert_equal(df[column_name], X[:, 0])
    # check that the built data has the codes
    if categories == 'seen':
        # if all categories were seen during training we just take the codes
        codes = df[column_name].cat.codes
    else:
        # if we only saw 'a' during training we just replace its code
        # and leave the rest as nan
        a_code = df[column_name].cat.categories.get_loc('a')
        codes = np.where(df[column_name] == 'a', a_code, np.nan)
    np.testing.assert_equal(codes, data[:, 0])


@pytest.mark.parametrize('min_data_in_bin', [2, 10])
Expand Down

0 comments on commit e797985

Please sign in to comment.