From 7f497ef05db90ff941ac1729fcaf8cffa913a61a Mon Sep 17 00:00:00 2001 From: Gabe Schwartz Date: Tue, 29 Jul 2014 11:15:13 -0400 Subject: [PATCH] BUG: SVC fit() w/ class w/ all 0 sample weights. Versions of LibSVM and LibLinear that support sample/instance weights will remove all samples with 0 weight before training. If all samples of some classes have 0 weight, the internal LibXXX model will think it has that many fewer classes than the number provided to sklearn. This caused issues with, for example, predict_proba returning probabilities with the wrong shape in the last dimension. --- sklearn/svm/base.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 8a92fe62c31bf..61d53d8c08189 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -18,6 +18,29 @@ LIBSVM_IMPL = ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr'] +def _validate_targets_with_weight(clf, y, sample_weight): + y_ = column_or_1d(y, warn=True) + cls, y = unique(y_, return_inverse=True) + + if sample_weight is not None: + sw = column_or_1d(sample_weight, warn=True) + cls = np.unique(y_[sw > 0]) + + if len(cls) < 2: + raise ValueError( + "The number of classes has to be greater than one; got %d" + % len(cls)) + + # This must be called here so that the class weight list doesn't contain + # weights for classes eliminated because they had no samples with > 0 + # weight. + clf.class_weight_ = compute_class_weight(clf.class_weight, cls, y_) + clf.classes_ = cls + + # LibLinear and LibSVM want targets as doubles, even for classification. 
+ return np.asarray(y, dtype=np.float64, order='C') + + def _one_vs_one_coef(dual_coef, n_support, support_vectors): """Generate primal coefficients from dual coefficients for the one-vs-one multi class LibSVM in the case @@ -135,7 +158,7 @@ def fit(self, X, y, sample_weight=None): self._sparse = sparse and not callable(self.kernel) X = check_array(X, accept_sparse='csr', dtype=np.float64, order='C') - y = self._validate_targets(y) + y = self._validate_targets(y, sample_weight) sample_weight = np.asarray([] if sample_weight is None @@ -185,7 +208,7 @@ def fit(self, X, y, sample_weight=None): self.intercept_ *= -1 return self - def _validate_targets(self, y): + def _validate_targets(self, y, sample_weight=None): """Validation of y and class_weight. Default implementation for SVR and one-class; overridden in BaseSVC. @@ -437,18 +460,8 @@ def coef_(self): class BaseSVC(BaseLibSVM, ClassifierMixin): """ABC for LibSVM-based classifiers.""" - def _validate_targets(self, y): - y_ = column_or_1d(y, warn=True) - cls, y = np.unique(y_, return_inverse=True) - self.class_weight_ = compute_class_weight(self.class_weight, cls, y_) - if len(cls) < 2: - raise ValueError( - "The number of classes has to be greater than one; got %d" - % len(cls)) - - self.classes_ = cls - - return np.asarray(y, dtype=np.float64, order='C') + def _validate_targets(self, y, sample_weight=None): + return _validate_targets_with_weight(self, y, sample_weight) def predict(self, X): """Perform classification on samples in X.