Skip to content

Commit

Permalink
[MRG+2] Tree MAE fix to ensure sample_weights are used during impurity calculation (scikit-learn#11464)
Browse files Browse the repository at this point in the history

* Fix to allow M

* Updated MAE test to consider sample_weights in calculation

* Removed comment

* Fixed: E501 line too long (82 > 79 characters)

* syntax correction

* Added fix details

* Changed to use consistent datatypes during calculations

* Corrected formatting

* Requested Changes

* removed explicit casts

* Removed unnecessary explicits

* Removed unnecessary explicit casts

* added additional test

* updated comments

* Requested changes incl additional unit test

* fix mistake

* formatting

* removed whitespace

* added test notes

* formatting

* Requested changes

* Trailing space fix attempt

* Trailing whitespace fix attempt #2

* Remove trailing whitespace
  • Loading branch information
JohnStott authored and GaelVaroquaux committed Jul 17, 2018
1 parent 2e0b274 commit 6ade12a
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 24 deletions.
6 changes: 6 additions & 0 deletions doc/whats_new/v0.20.rst
Expand Up @@ -547,6 +547,12 @@ Classifiers and regressors
per-tree basis. The previous behavior over-weighted the Gini importance of
features that appear in later stages. This issue only affected feature
importances. :issue:`11176` by :user:`Gil Forsyth <gforsyth>`.

- Fixed a bug in :class:`tree.MAE` to ensure sample weights are being used
  during the calculation of tree MAE impurity. Previous behavior could
cause suboptimal splits to be chosen since the impurity calculation
considered all samples to be of equal weight importance.
:issue:`11464` by :user:`John Stott <JohnStott>`.

Decomposition, manifold learning and clustering

Expand Down
40 changes: 24 additions & 16 deletions sklearn/tree/_criterion.pyx
Expand Up @@ -1238,21 +1238,24 @@ cdef class MAE(RegressionCriterion):
cdef SIZE_t* samples = self.samples
cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t w_y_ik

cdef double impurity = 0.0
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity = 0.0

for k in range(self.n_outputs):
for p in range(self.start, self.end):
i = samples[p]

y_ik = y[i * self.y_stride + k]

impurity += <double> fabs((<double> y_ik) - <double> self.node_medians[k])
if sample_weight != NULL:
w = sample_weight[i]

impurity += fabs(y_ik - self.node_medians[k]) * w

return impurity / (self.weighted_n_node_samples * self.n_outputs)

cdef void children_impurity(self, double* impurity_left,
double* impurity_right) nogil:
cdef void children_impurity(self, double* p_impurity_left,
double* p_impurity_right) nogil:
"""Evaluate the impurity in children nodes, i.e. the impurity of the
left child (samples[start:pos]) and the impurity the right child
(samples[pos:end]).
Expand All @@ -1269,23 +1272,26 @@ cdef class MAE(RegressionCriterion):
cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t median
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity_left = 0.0
cdef DOUBLE_t impurity_right = 0.0

cdef void** left_child = <void**> self.left_child.data
cdef void** right_child = <void**> self.right_child.data

impurity_left[0] = 0.0
impurity_right[0] = 0.0

for k in range(self.n_outputs):
median = (<WeightedMedianCalculator> left_child[k]).get_median()
for p in range(start, pos):
i = samples[p]

y_ik = y[i * self.y_stride + k]

impurity_left[0] += <double>fabs((<double> y_ik) -
<double> median)
impurity_left[0] /= <double>((self.weighted_n_left) * self.n_outputs)
if sample_weight != NULL:
w = sample_weight[i]

impurity_left += fabs(y_ik - median) * w
p_impurity_left[0] = impurity_left / (self.weighted_n_left *
self.n_outputs)

for k in range(self.n_outputs):
median = (<WeightedMedianCalculator> right_child[k]).get_median()
Expand All @@ -1294,10 +1300,12 @@ cdef class MAE(RegressionCriterion):

y_ik = y[i * self.y_stride + k]

impurity_right[0] += <double>fabs((<double> y_ik) -
<double> median)
impurity_right[0] /= <double>((self.weighted_n_right) *
self.n_outputs)
if sample_weight != NULL:
w = sample_weight[i]

impurity_right += fabs(y_ik - median) * w
p_impurity_right[0] = impurity_right / (self.weighted_n_right *
self.n_outputs)


cdef class FriedmanMSE(MSE):
Expand Down
99 changes: 91 additions & 8 deletions sklearn/tree/tests/test_tree.py
Expand Up @@ -18,6 +18,7 @@
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_almost_equal
Expand Down Expand Up @@ -1693,19 +1694,101 @@ def test_no_sparse_y_support(name):


def test_mae():
    """Check MAE criterion produces correct results on small toy dataset:
    ------------------
    | X | y | weight |
    ------------------
    | 3 | 3 | 0.1 |
    | 5 | 3 | 0.3 |
    | 8 | 4 | 1.0 |
    | 3 | 6 | 0.6 |
    | 5 | 7 | 0.3 |
    ------------------
    |sum wt:| 2.3 |
    ------------------
    Because we are dealing with sample weights, we cannot find the median by
    simply choosing/averaging the centre value(s), instead we consider the
    median where 50% of the cumulative weight is found (in a y sorted data
    set). Therefore with regards to this test data, the cumulative weight is
    >= 50% when y = 4. Therefore:
    Median = 4
    For all the samples, we can get the total error by summing:
    Absolute(Median - y) * weight
    I.e., total error = (Absolute(4 - 3) * 0.1)
                      + (Absolute(4 - 3) * 0.3)
                      + (Absolute(4 - 4) * 1.0)
                      + (Absolute(4 - 6) * 0.6)
                      + (Absolute(4 - 7) * 0.3)
                      = 2.5
    Impurity = Total error / total weight
             = 2.5 / 2.3
             = 1.08695652173913
    ------------------
    From this root node, the next best split is between X values of 3 and 5.
    Thus, we have left and right child nodes:
    LEFT                    RIGHT
    ------------------      ------------------
    | X | y | weight |      | X | y | weight |
    ------------------      ------------------
    | 3 | 3 | 0.1 |         | 5 | 3 | 0.3 |
    | 3 | 6 | 0.6 |         | 8 | 4 | 1.0 |
    ------------------      | 5 | 7 | 0.3 |
    |sum wt:| 0.7 |         ------------------
    ------------------      |sum wt:| 1.6 |
                            ------------------
    Impurity is found in the same way:
    Left node Median = 6
    Total error = (Absolute(6 - 3) * 0.1)
                + (Absolute(6 - 6) * 0.6)
                = 0.3
    Left Impurity = Total error / total weight
                  = 0.3 / 0.7
                  = 0.428571428571429
    -------------------
    Likewise for Right node:
    Right node Median = 4
    Total error = (Absolute(4 - 3) * 0.3)
                + (Absolute(4 - 4) * 1.0)
                + (Absolute(4 - 7) * 0.3)
                = 1.2
    Right Impurity = Total error / total weight
                   = 1.2 / 1.6
                   = 0.75
    ------
    """
    dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae",
                                   max_leaf_nodes=2)

    # Test MAE where sample weights are non-uniform (as illustrated above):
    # impurities are the weighted errors derived in the docstring, and the
    # node values are the weighted medians (root=4, left=6, right=4).
    dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
               sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3])
    assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6])
    assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0])

    # Test MAE where all sample weights are uniform: results must match the
    # unweighted medians/errors of the same data.
    dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
               sample_weight=np.ones(5))
    assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
    assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])

    # Test MAE where a `sample_weight` is not explicitly provided.
    # This is equivalent to providing uniform sample weights, though
    # the internal logic is different:
    dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3])
    assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
    assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])


def test_criterion_copy():
# Let's check whether copy of our criterion has the same type
Expand Down

0 comments on commit 6ade12a

Please sign in to comment.