Skip to content

Commit

Permalink
[MRG+2] Tree MAE fix to ensure sample_weights are used during impurity calculation (scikit-learn#11464)
Browse files Browse the repository at this point in the history

* Fix to allow M

* Updated MAE test to consider sample_weights in calculation

* Removed comment

* Fixed: E501 line too long (82 > 79 characters)

* syntax correction

* Added fix details

* Changed to use consistent datatypes during calculations

* Corrected formatting

* Requested Changes

* removed explicit casts

* Removed unnecessary explicits

* Removed unnecessary explicit casts

* added additional test

* updated comments

* Requested changes incl additional unit test

* fix mistake

* formatting

* removed whitespace

* added test notes

* formatting

* Requested changes

* Trailing space fix attempt

* Trailing whitespace fix attempt #2

* Remove trailing whitespace
  • Loading branch information
JohnStott authored and GaelVaroquaux committed Jul 17, 2018
1 parent 2e0b274 commit 6ade12a
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 24 deletions.
6 changes: 6 additions & 0 deletions doc/whats_new/v0.20.rst
Expand Up @@ -547,6 +547,12 @@ Classifiers and regressors
per-tree basis. The previous behavior over-weighted the Gini importance of
features that appear in later stages. This issue only affected feature
importances. :issue:`11176` by :user:`Gil Forsyth <gforsyth>`.

- Fixed a bug in :class:`tree.MAE` to ensure sample weights are being used
  during the calculation of tree MAE impurity. Previous behavior could
cause suboptimal splits to be chosen since the impurity calculation
considered all samples to be of equal weight importance.
:issue:`11464` by :user:`John Stott <JohnStott>`.

Decomposition, manifold learning and clustering

Expand Down
40 changes: 24 additions & 16 deletions sklearn/tree/_criterion.pyx
Expand Up @@ -1238,21 +1238,24 @@ cdef class MAE(RegressionCriterion):
cdef SIZE_t* samples = self.samples
cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t w_y_ik

cdef double impurity = 0.0
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity = 0.0

for k in range(self.n_outputs):
for p in range(self.start, self.end):
i = samples[p]

y_ik = y[i * self.y_stride + k]

impurity += <double> fabs((<double> y_ik) - <double> self.node_medians[k])
if sample_weight != NULL:
w = sample_weight[i]

impurity += fabs(y_ik - self.node_medians[k]) * w

return impurity / (self.weighted_n_node_samples * self.n_outputs)

cdef void children_impurity(self, double* impurity_left,
double* impurity_right) nogil:
cdef void children_impurity(self, double* p_impurity_left,
double* p_impurity_right) nogil:
"""Evaluate the impurity in children nodes, i.e. the impurity of the
left child (samples[start:pos]) and the impurity the right child
(samples[pos:end]).
Expand All @@ -1269,23 +1272,26 @@ cdef class MAE(RegressionCriterion):
cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t median
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity_left = 0.0
cdef DOUBLE_t impurity_right = 0.0

cdef void** left_child = <void**> self.left_child.data
cdef void** right_child = <void**> self.right_child.data

impurity_left[0] = 0.0
impurity_right[0] = 0.0

for k in range(self.n_outputs):
median = (<WeightedMedianCalculator> left_child[k]).get_median()
for p in range(start, pos):
i = samples[p]

y_ik = y[i * self.y_stride + k]

impurity_left[0] += <double>fabs((<double> y_ik) -
<double> median)
impurity_left[0] /= <double>((self.weighted_n_left) * self.n_outputs)
if sample_weight != NULL:
w = sample_weight[i]

impurity_left += fabs(y_ik - median) * w
p_impurity_left[0] = impurity_left / (self.weighted_n_left *
self.n_outputs)

for k in range(self.n_outputs):
median = (<WeightedMedianCalculator> right_child[k]).get_median()
Expand All @@ -1294,10 +1300,12 @@ cdef class MAE(RegressionCriterion):

y_ik = y[i * self.y_stride + k]

impurity_right[0] += <double>fabs((<double> y_ik) -
<double> median)
impurity_right[0] /= <double>((self.weighted_n_right) *
self.n_outputs)
if sample_weight != NULL:
w = sample_weight[i]

impurity_right += fabs(y_ik - median) * w
p_impurity_right[0] = impurity_right / (self.weighted_n_right *
self.n_outputs)


cdef class FriedmanMSE(MSE):
Expand Down
99 changes: 91 additions & 8 deletions sklearn/tree/tests/test_tree.py
Expand Up @@ -18,6 +18,7 @@
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_almost_equal
Expand Down Expand Up @@ -1693,19 +1694,101 @@ def test_no_sparse_y_support(name):


def test_mae():
    """Check MAE criterion produces correct results on small toy dataset:
    ------------------
    | X | y | weight |
    ------------------
    | 3 | 3 | 0.1 |
    | 5 | 3 | 0.3 |
    | 8 | 4 | 1.0 |
    | 3 | 6 | 0.6 |
    | 5 | 7 | 0.3 |
    ------------------
    |sum wt:| 2.3 |
    ------------------
    Because we are dealing with sample weights, we cannot find the median by
    simply choosing/averaging the centre value(s), instead we consider the
    median where 50% of the cumulative weight is found (in a y sorted data
    set). Therefore with regards to this test data, the cumulative weight is
    >= 50% when y = 4. Therefore:
    Median = 4
    For all the samples, we can get the total error by summing:
    Absolute(Median - y) * weight
    I.e., total error = (Absolute(4 - 3) * 0.1)
                      + (Absolute(4 - 3) * 0.3)
                      + (Absolute(4 - 4) * 1.0)
                      + (Absolute(4 - 6) * 0.6)
                      + (Absolute(4 - 7) * 0.3)
                      = 2.5
    Impurity = Total error / total weight
             = 2.5 / 2.3
             = 1.08695652173913
    ------------------
    From this root node, the next best split is between X values of 3 and 5.
    Thus, we have left and right child nodes:
    LEFT                    RIGHT
    ------------------      ------------------
    | X | y | weight |      | X | y | weight |
    ------------------      ------------------
    | 3 | 3 | 0.1 |         | 5 | 3 | 0.3 |
    | 3 | 6 | 0.6 |         | 8 | 4 | 1.0 |
    ------------------      | 5 | 7 | 0.3 |
    |sum wt:| 0.7 |         ------------------
    ------------------      |sum wt:| 1.6 |
                            ------------------
    Impurity is found in the same way:
    Left node Median = 6
    Total error = (Absolute(6 - 3) * 0.1)
                + (Absolute(6 - 6) * 0.6)
                = 0.3
    Left Impurity = Total error / total weight
                  = 0.3 / 0.7
                  = 0.428571428571429
    -------------------
    Likewise for Right node:
    Right node Median = 4
    Total error = (Absolute(4 - 3) * 0.3)
                + (Absolute(4 - 4) * 1.0)
                + (Absolute(4 - 7) * 0.3)
                = 1.2
    Right Impurity = Total error / total weight
                   = 1.2 / 1.6
                   = 0.75
    ------
    """
    dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae",
                                   max_leaf_nodes=2)

    # Test MAE where sample weights are non-uniform (as illustrated above):
    # impurities are the weighted errors derived in the docstring, and the
    # node values are the weighted medians (root=4, left=6, right=4).
    dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
               sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3])
    assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6])
    assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0])

    # Test MAE where all sample weights are uniform: results must match the
    # unweighted medians/errors of the same data.
    dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
               sample_weight=np.ones(5))
    assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
    assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])

    # Test MAE where a `sample_weight` is not explicitly provided.
    # This is equivalent to providing uniform sample weights, though
    # the internal logic is different:
    dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3])
    assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
    assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])


def test_criterion_copy():
# Let's check whether copy of our criterion has the same type
Expand Down

0 comments on commit 6ade12a

Please sign in to comment.