In [1]:
from sklearn.datasets import make_regression
import numpy as np
from sklearn.cross_validation import train_test_split
from skopt.learning import RandomForestRegressor



In [2]:
def weighted_variance(y, weights=None):
    """Return the weighted (population) variance of ``y``.

    Parameters
    ----------
    y : array-like
        Observations.
    weights : array-like or None, optional
        Non-negative weight for each entry of ``y``. When ``None`` every
        observation gets the same weight, so the result is the ordinary
        population variance.

    Returns
    -------
    float
        ``sum(w * (y - mean_w)**2) / sum(w)`` where ``mean_w`` is the
        weighted mean of ``y``.
    """
    y = np.asarray(y, dtype=float)
    if weights is None:
        # The original body multiplied by ``None`` here and raised TypeError.
        weights = np.ones_like(y)
    else:
        weights = np.asarray(weights, dtype=float)
    w_mean = np.average(y, weights=weights)
    return np.sum(weights * (y - w_mean) ** 2) / np.sum(weights)

In [3]:
# Synthetic regression problem: 500 samples with 100 features, fixed seed.
X, y = make_regression(
    n_samples=500,
    n_features=100,
    random_state=0,
)
# Seeded train/test split using the splitter's default proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [4]:
from sklearn.tree import DecisionTreeRegressor

In [5]:
# Fit a depth-limited regression tree on the training split.
dtr = DecisionTreeRegressor(random_state=0, max_depth=5).fit(X_train, y_train)

# Point predictions for the held-out samples.
mean = dtr.predict(X_test)

# apply() gives the index of the leaf each test sample falls into; indexing
# the tree's per-node impurity array with it yields one value per sample.
node_impurity = dtr.tree_.impurity
var = node_impurity[dtr.apply(X_test)]
var

array([ 10815.45483241,  12956.205579  ,   5572.40159146,   9436.4433567 ,
        11255.62703131,  13334.11489382,  10815.45483241,   5572.40159146,
        13517.96087533,  12956.205579  ,   6363.73406556,  13334.11489382,
        13812.09440682,   5572.40159146,  13334.11489382,   3092.13723901,
         6583.32788853,  13334.11489382,  13812.09440682,  10815.45483241,
         6363.73406556,   9436.4433567 ,   6363.73406556,  13517.96087533,
         3092.13723901,   5572.40159146,  13334.11489382,   5572.40159146,
         6362.51450869,  10936.33609202,   9436.4433567 ,   6363.73406556,
        12956.205579  ,   9436.4433567 ,  13334.11489382,  13334.11489382,
        10234.27688969,   3225.67796591,   5375.30846345,   6363.73406556,
        13334.11489382,   5572.40159146,   3092.13723901,   9436.4433567 ,
        13334.11489382,  13334.11489382,  13334.11489382,   9436.4433567 ,
        13812.09440682,   7758.39046172,   6363.73406556,   9436.4433567 ,
         5572.40159146,  

In [6]:
# Recompute the per-sample mean and variance by hand: for each test sample,
# put equal weight on the training samples that share its leaf and zero
# weight on all others, then take the weighted mean / variance of y_train.
train_leaf_nodes = dtr.apply(X_train)
test_leaf_nodes = dtr.apply(X_test)

also_mean = np.zeros(X_test.shape[0])
also_var = np.zeros(X_test.shape[0])

for X_ind, leaf_node in enumerate(test_leaf_nodes):
    # Force a float weight vector: zeros_like would otherwise inherit the
    # dtype of y_train, and with integer targets the 1/n weights below would
    # be truncated to 0.
    weights = np.zeros_like(y_train, dtype=float)
    # Training samples that ended up in the same leaf as this test sample.
    samples_in_tree_mask = train_leaf_nodes == leaf_node
    weights[samples_in_tree_mask] = 1.0 / np.sum(samples_in_tree_mask)
    also_mean[X_ind] = np.average(y_train, weights=weights)
    also_var[X_ind] = weighted_variance(y_train, weights)

In [7]:
# Show the variances recomputed by hand from the training targets in each leaf.
print(also_var)

[ 10815.45483241  12956.205579     5572.40159146   9436.4433567
  11255.62703131  13334.11489382  10815.45483241   5572.40159146
  13517.96087533  12956.205579     6363.73406556  13334.11489382
  13812.09440682   5572.40159146  13334.11489382   3092.13723901
   6583.32788853  13334.11489382  13812.09440682  10815.45483241
   6363.73406556   9436.4433567    6363.73406556  13517.96087533
   3092.13723901   5572.40159146  13334.11489382   5572.40159146
   6362.51450869  10936.33609202   9436.4433567    6363.73406556
  12956.205579     9436.4433567   13334.11489382  13334.11489382
  10234.27688969   3225.67796591   5375.30846345   6363.73406556
  13334.11489382   5572.40159146   3092.13723901   9436.4433567
  13334.11489382  13334.11489382  13334.11489382   9436.4433567
  13812.09440682   7758.39046172   6363.73406556   9436.4433567
   5572.40159146  13334.11489382  10234.27688969  13812.09440682
  13812.09440682  13334.11489382   5572.40159146  13517.96087533
   6363.73406556  13812.09440

In [8]:
# Check that the hand-computed statistics agree with what the tree reports:
# first the predictions, then the per-leaf impurity.
for reference, recomputed in ((mean, also_mean), (var, also_var)):
    print(np.allclose(reference, recomputed))

True