Skip to content

Commit

Permalink
[MRG+1] _preprocess_data consistent with fused types (scikit-learn#9093)
Browse files Browse the repository at this point in the history
* add test for _preprocess_data and make it consistent

* fix pep8

* add doc, cast systematically y in X.dtype and update test_coordinate_descent.py

* test if input values don't change with copy=True

* test if input values don't change with copy=True #2

* fix doc

* fix doc #2

* fix doc #3
  • Loading branch information
Henley13 authored and MechCoder committed Jun 23, 2017
1 parent d15128b commit 89962f0
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 20 deletions.
12 changes: 8 additions & 4 deletions sklearn/linear_model/base.py
Expand Up @@ -158,20 +158,21 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
coordinate_descend).
This is here because nearly all linear models will want their data to be
centered.
centered. This function also systematically makes y consistent with X.dtype
"""

if isinstance(sample_weight, numbers.Number):
sample_weight = None

X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
dtype=FLOAT_DTYPES)
y = np.asarray(y, dtype=X.dtype)

if fit_intercept:
if sp.issparse(X):
X_offset, X_var = mean_variance_axis(X, axis=0)
if not return_mean:
X_offset[:] = 0
X_offset[:] = X.dtype.type(0)

if normalize:

Expand Down Expand Up @@ -201,7 +202,10 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
else:
X_offset = np.zeros(X.shape[1], dtype=X.dtype)
X_scale = np.ones(X.shape[1], dtype=X.dtype)
y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
if y.ndim == 1:
y_offset = X.dtype.type(0)
else:
y_offset = np.zeros(y.shape[1], dtype=X.dtype)

return X, y, X_offset, y_offset, X_scale

Expand Down Expand Up @@ -460,7 +464,7 @@ def fit(self, X, y, sample_weight=None):
Training data
y : numpy array of shape [n_samples, n_targets]
Target values
Target values. Will be cast to X's dtype if necessary
sample_weight : numpy array of shape [n_samples]
Individual weights for each sample
Expand Down
4 changes: 2 additions & 2 deletions sklearn/linear_model/bayes.py
Expand Up @@ -148,7 +148,7 @@ def fit(self, X, y):
X : numpy array of shape [n_samples,n_features]
Training data
y : numpy array of shape [n_samples]
Target values
Target values. Will be cast to X's dtype if necessary
Returns
-------
Expand Down Expand Up @@ -420,7 +420,7 @@ def fit(self, X, y):
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
y : array, shape = [n_samples]
Target values (integers)
Target values (integers). Will be cast to X's dtype if necessary
Returns
-------
Expand Down
4 changes: 2 additions & 2 deletions sklearn/linear_model/coordinate_descent.py
Expand Up @@ -653,7 +653,7 @@ def fit(self, X, y, check_input=True):
Data
y : ndarray, shape (n_samples,) or (n_samples, n_targets)
Target
Target. Will be cast to X's dtype if necessary
check_input : boolean, (default=True)
Allow to bypass several input checking.
Expand Down Expand Up @@ -1680,7 +1680,7 @@ def fit(self, X, y):
X : ndarray, shape (n_samples, n_features)
Data
y : ndarray, shape (n_samples, n_tasks)
Target
Target. Will be cast to X's dtype if necessary
Notes
-----
Expand Down
2 changes: 1 addition & 1 deletion sklearn/linear_model/least_angle.py
Expand Up @@ -1455,7 +1455,7 @@ def fit(self, X, y, copy_X=True):
training data.
y : array-like, shape (n_samples,)
target values.
target values. Will be cast to X's dtype if necessary
copy_X : boolean, optional, default True
If ``True``, X will be copied; else, it may be overwritten.
Expand Down
4 changes: 2 additions & 2 deletions sklearn/linear_model/omp.py
Expand Up @@ -617,7 +617,7 @@ def fit(self, X, y):
Training data.
y : array-like, shape (n_samples,) or (n_samples, n_targets)
Target values.
Target values. Will be cast to X's dtype if necessary
Returns
Expand Down Expand Up @@ -835,7 +835,7 @@ def fit(self, X, y):
Training data.
y : array-like, shape [n_samples]
Target values.
Target values. Will be cast to X's dtype if necessary
Returns
-------
Expand Down
2 changes: 1 addition & 1 deletion sklearn/linear_model/randomized_l1.py
Expand Up @@ -82,7 +82,7 @@ def fit(self, X, y):
Training data.
y : array-like, shape = [n_samples]
Target values.
Target values. Will be cast to X's dtype if necessary
Returns
-------
Expand Down
6 changes: 3 additions & 3 deletions sklearn/linear_model/ridge.py
Expand Up @@ -975,7 +975,7 @@ def fit(self, X, y, sample_weight=None):
Training data
y : array-like, shape = [n_samples] or [n_samples, n_targets]
Target values
Target values. Will be cast to X's dtype if necessary
sample_weight : float or array-like of shape [n_samples]
Sample weight
Expand Down Expand Up @@ -1094,7 +1094,7 @@ def fit(self, X, y, sample_weight=None):
Training data
y : array-like, shape = [n_samples] or [n_samples, n_targets]
Target values
Target values. Will be cast to X's dtype if necessary
sample_weight : float or array-like of shape [n_samples]
Sample weight
Expand Down Expand Up @@ -1336,7 +1336,7 @@ def fit(self, X, y, sample_weight=None):
and n_features is the number of features.
y : array-like, shape (n_samples,)
Target values.
Target values. Will be cast to X's dtype if necessary
sample_weight : float or numpy array of shape (n_samples,)
Sample weight.
Expand Down
66 changes: 66 additions & 0 deletions sklearn/linear_model/tests/test_base.py
Expand Up @@ -324,6 +324,72 @@ def test_csr_preprocess_data():
assert_equal(csr_.getformat(), 'csr')


def test_dtype_preprocess_data():
    """Check that _preprocess_data casts y to X's dtype and never the reverse.

    All five outputs (Xt, yt, X_mean, y_mean, X_norm) must carry X's dtype
    for every combination of float32/float64 inputs, the input arrays must
    be left untouched, and the 32-bit and 64-bit runs must agree numerically.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    # (X input, y input, expected output dtype) -- the expectation is always
    # X's dtype, regardless of y's.
    combos = [('32', X_32, y_32, np.float32),
              ('64', X_64, y_64, np.float64),
              ('3264', X_32, y_64, np.float32),
              ('6432', X_64, y_32, np.float64)]

    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            results = {}
            for key, X_in, y_in, expected_dtype in combos:
                outputs = _preprocess_data(X_in, y_in,
                                           fit_intercept=fit_intercept,
                                           normalize=normalize,
                                           return_mean=True)
                # Every returned array/scalar must follow X's dtype.
                for out in outputs:
                    assert_equal(np.asarray(out).dtype, expected_dtype)
                results[key] = outputs

            # The inputs themselves must not have been cast in place.
            assert_equal(X_32.dtype, np.float32)
            assert_equal(y_32.dtype, np.float32)
            assert_equal(X_64.dtype, np.float64)
            assert_equal(y_64.dtype, np.float64)

            # Pure-32 and pure-64 runs must produce the same values
            # (up to float32 precision).
            for out_32, out_64 in zip(results['32'], results['64']):
                assert_array_almost_equal(out_32, out_64)


def test_rescale_data():
n_samples = 200
n_features = 2
Expand Down
9 changes: 4 additions & 5 deletions sklearn/linear_model/tests/test_coordinate_descent.py
Expand Up @@ -661,12 +661,11 @@ def test_check_input_false():
clf = ElasticNet(selection='cyclic', tol=1e-8)
# Check that no error is raised if data is provided in the right format
clf.fit(X, y, check_input=False)
# With check_input=False, an exhaustive check is not made on y but its
# dtype is still cast in _preprocess_data to X's dtype. So the test should
# pass anyway
X = check_array(X, order='F', dtype='float32')
clf.fit(X, y, check_input=True)
# Check that an error is raised if data is provided in the wrong dtype,
# because of check bypassing
assert_raises(ValueError, clf.fit, X, y, check_input=False)

clf.fit(X, y, check_input=False)
# With no input checking, providing X in C order should result in false
# computation
X = check_array(X, order='C', dtype='float64')
Expand Down

0 comments on commit 89962f0

Please sign in to comment.