Skip to content

Commit

Permalink
[MRG+1] _preprocess_data consistent with fused types (scikit-learn#9093)
Browse files Browse the repository at this point in the history
* add test for _preprocess_data and make it consistent

* fix pep8

* add doc, cast systematically y in X.dtype and update test_coordinate_descent.py

* test if input values don't change with copy=True

* test if input values don't change with copy=True #2

* fix doc

* fix doc #2

* fix doc #3
  • Loading branch information
Henley13 authored and MechCoder committed Jun 23, 2017
1 parent d15128b commit 89962f0
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 20 deletions.
12 changes: 8 additions & 4 deletions sklearn/linear_model/base.py
Expand Up @@ -158,20 +158,21 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
coordinate_descend).
This is here because nearly all linear models will want their data to be
centered.
centered. This function also systematically makes y consistent with X.dtype
"""

if isinstance(sample_weight, numbers.Number):
sample_weight = None

X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
dtype=FLOAT_DTYPES)
y = np.asarray(y, dtype=X.dtype)

if fit_intercept:
if sp.issparse(X):
X_offset, X_var = mean_variance_axis(X, axis=0)
if not return_mean:
X_offset[:] = 0
X_offset[:] = X.dtype.type(0)

if normalize:

Expand Down Expand Up @@ -201,7 +202,10 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
else:
X_offset = np.zeros(X.shape[1], dtype=X.dtype)
X_scale = np.ones(X.shape[1], dtype=X.dtype)
y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
if y.ndim == 1:
y_offset = X.dtype.type(0)
else:
y_offset = np.zeros(y.shape[1], dtype=X.dtype)

return X, y, X_offset, y_offset, X_scale

Expand Down Expand Up @@ -460,7 +464,7 @@ def fit(self, X, y, sample_weight=None):
Training data
y : numpy array of shape [n_samples, n_targets]
Target values
Target values. Will be cast to X's dtype if necessary
sample_weight : numpy array of shape [n_samples]
Individual weights for each sample
Expand Down
4 changes: 2 additions & 2 deletions sklearn/linear_model/bayes.py
Expand Up @@ -148,7 +148,7 @@ def fit(self, X, y):
X : numpy array of shape [n_samples,n_features]
Training data
y : numpy array of shape [n_samples]
Target values
Target values. Will be cast to X's dtype if necessary
Returns
-------
Expand Down Expand Up @@ -420,7 +420,7 @@ def fit(self, X, y):
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
y : array, shape = [n_samples]
Target values (integers)
Target values (integers). Will be cast to X's dtype if necessary
Returns
-------
Expand Down
4 changes: 2 additions & 2 deletions sklearn/linear_model/coordinate_descent.py
Expand Up @@ -653,7 +653,7 @@ def fit(self, X, y, check_input=True):
Data
y : ndarray, shape (n_samples,) or (n_samples, n_targets)
Target
Target. Will be cast to X's dtype if necessary
check_input : boolean, (default=True)
Allow to bypass several input checking.
Expand Down Expand Up @@ -1680,7 +1680,7 @@ def fit(self, X, y):
X : ndarray, shape (n_samples, n_features)
Data
y : ndarray, shape (n_samples, n_tasks)
Target
Target. Will be cast to X's dtype if necessary
Notes
-----
Expand Down
2 changes: 1 addition & 1 deletion sklearn/linear_model/least_angle.py
Expand Up @@ -1455,7 +1455,7 @@ def fit(self, X, y, copy_X=True):
training data.
y : array-like, shape (n_samples,)
target values.
target values. Will be cast to X's dtype if necessary
copy_X : boolean, optional, default True
If ``True``, X will be copied; else, it may be overwritten.
Expand Down
4 changes: 2 additions & 2 deletions sklearn/linear_model/omp.py
Expand Up @@ -617,7 +617,7 @@ def fit(self, X, y):
Training data.
y : array-like, shape (n_samples,) or (n_samples, n_targets)
Target values.
Target values. Will be cast to X's dtype if necessary
Returns
Expand Down Expand Up @@ -835,7 +835,7 @@ def fit(self, X, y):
Training data.
y : array-like, shape [n_samples]
Target values.
Target values. Will be cast to X's dtype if necessary
Returns
-------
Expand Down
2 changes: 1 addition & 1 deletion sklearn/linear_model/randomized_l1.py
Expand Up @@ -82,7 +82,7 @@ def fit(self, X, y):
Training data.
y : array-like, shape = [n_samples]
Target values.
Target values. Will be cast to X's dtype if necessary
Returns
-------
Expand Down
6 changes: 3 additions & 3 deletions sklearn/linear_model/ridge.py
Expand Up @@ -975,7 +975,7 @@ def fit(self, X, y, sample_weight=None):
Training data
y : array-like, shape = [n_samples] or [n_samples, n_targets]
Target values
Target values. Will be cast to X's dtype if necessary
sample_weight : float or array-like of shape [n_samples]
Sample weight
Expand Down Expand Up @@ -1094,7 +1094,7 @@ def fit(self, X, y, sample_weight=None):
Training data
y : array-like, shape = [n_samples] or [n_samples, n_targets]
Target values
Target values. Will be cast to X's dtype if necessary
sample_weight : float or array-like of shape [n_samples]
Sample weight
Expand Down Expand Up @@ -1336,7 +1336,7 @@ def fit(self, X, y, sample_weight=None):
and n_features is the number of features.
y : array-like, shape (n_samples,)
Target values.
Target values. Will be cast to X's dtype if necessary
sample_weight : float or numpy array of shape (n_samples,)
Sample weight.
Expand Down
66 changes: 66 additions & 0 deletions sklearn/linear_model/tests/test_base.py
Expand Up @@ -324,6 +324,72 @@ def test_csr_preprocess_data():
assert_equal(csr_.getformat(), 'csr')


def test_dtype_preprocess_data():
    """Check that _preprocess_data casts y to X's dtype and never the reverse.

    All five outputs (Xt, yt, X_mean, y_mean, X_norm) must carry X's dtype
    for every combination of float32/float64 inputs, the input arrays must
    be left untouched, and the 32-bit and 64-bit runs must agree numerically.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    # (X input, y input, expected output dtype) -- the expectation is always
    # X's dtype, regardless of y's.
    combos = [('32', X_32, y_32, np.float32),
              ('64', X_64, y_64, np.float64),
              ('3264', X_32, y_64, np.float32),
              ('6432', X_64, y_32, np.float64)]

    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            results = {}
            for key, X_in, y_in, expected_dtype in combos:
                outputs = _preprocess_data(X_in, y_in,
                                           fit_intercept=fit_intercept,
                                           normalize=normalize,
                                           return_mean=True)
                # Every returned array/scalar must follow X's dtype.
                for out in outputs:
                    assert_equal(np.asarray(out).dtype, expected_dtype)
                results[key] = outputs

            # The inputs themselves must not have been cast in place.
            assert_equal(X_32.dtype, np.float32)
            assert_equal(y_32.dtype, np.float32)
            assert_equal(X_64.dtype, np.float64)
            assert_equal(y_64.dtype, np.float64)

            # Pure-32 and pure-64 runs must produce the same values
            # (up to float32 precision).
            for out_32, out_64 in zip(results['32'], results['64']):
                assert_array_almost_equal(out_32, out_64)


def test_rescale_data():
n_samples = 200
n_features = 2
Expand Down
9 changes: 4 additions & 5 deletions sklearn/linear_model/tests/test_coordinate_descent.py
Expand Up @@ -661,12 +661,11 @@ def test_check_input_false():
clf = ElasticNet(selection='cyclic', tol=1e-8)
# Check that no error is raised if data is provided in the right format
clf.fit(X, y, check_input=False)
# With check_input=False, an exhaustive check is not made on y but its
# dtype is still cast in _preprocess_data to X's dtype. So the test should
# pass anyway
X = check_array(X, order='F', dtype='float32')
clf.fit(X, y, check_input=True)
# Check that an error is raised if data is provided in the wrong dtype,
# because of check bypassing
assert_raises(ValueError, clf.fit, X, y, check_input=False)

clf.fit(X, y, check_input=False)
# With no input checking, providing X in C order should result in false
# computation
X = check_array(X, order='C', dtype='float64')
Expand Down

0 comments on commit 89962f0

Please sign in to comment.