In [1]:
from sklearn.datasets import make_regression
import numpy as np
from sklearn.cross_validation import train_test_split
from skopt.learning import RandomForestRegressor



In [2]:
def weighted_variance(y, weights=None):
    """
    Return the weighted (population) variance of ``y``.

    Parameters
    ----------
    y : array-like
        Values whose spread is measured.
    weights : array-like or None, optional
        Weight attached to each entry of ``y``; must be usable as the
        ``weights`` argument of ``np.average``. ``None`` (the default)
        weights every entry equally.

    Returns
    -------
    float
        ``sum(w * (y - m)**2) / sum(w)`` where ``m`` is the weighted mean
        of ``y`` under the same weights.
    """
    if weights is None:
        # The default used to fail with a TypeError on ``None * ndarray``;
        # treat it as uniform weights instead.
        weights = np.ones_like(y, dtype=float)
    w_mean = np.average(y, weights=weights)
    return np.sum(weights * (y - w_mean) ** 2) / np.sum(weights)

In [3]:
# Synthetic regression problem: 500 samples, 100 features, fixed seed.
X, y = make_regression(n_samples=500, n_features=100, random_state=0)
# Seeded train/test split using train_test_split's default split size.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [4]:
from sklearn.tree import DecisionTreeRegressor

In [5]:
# Reference values taken directly from one depth-limited tree.
dtr = DecisionTreeRegressor(random_state=0, max_depth=5).fit(X_train, y_train)
# Point prediction for every test row.
mean = dtr.predict(X_test)
# Impurity stored at the leaf each test row falls into.
var = dtr.tree_.impurity[dtr.apply(X_test)]

In [6]:
def weights_training_data(X_train, X_test, estimator, weights=None):
    """
    Return the weights on ``y_train`` induced by a single tree, as given by
    formula (4) in Meinshausen (2006), "Quantile Regression Forests":
    http://www.jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf

    For each test sample, every training sample that ``estimator.apply``
    maps to the same leaf receives ``1 / (number of training samples in
    that leaf)``; all other training samples receive nothing.

    Parameters
    ----------
    X_train : array of shape (n_train, n_features)
        Training samples.
    X_test : array of shape (n_test, n_features)
        Samples for which the weights are computed.
    estimator : object
        Fitted tree exposing ``apply(X)``, which returns one leaf id per
        row of ``X``.
    weights : array of shape (n_test, n_train) or None, optional
        Running total to add this tree's weights to. It is updated
        **in place** and returned, so for an ensemble the same array can be
        threaded through successive calls to accumulate the sum of the
        weights across trees. ``None`` starts from zeros.

    Returns
    -------
    weights : array of shape (n_test, n_train)
        ``weights[i, j]`` is the (accumulated) weight of training sample
        ``j`` for test sample ``i``.
    """
    train_leaf_nodes = estimator.apply(X_train)
    test_leaf_nodes = estimator.apply(X_test)

    if weights is None:
        weights = np.zeros((X_test.shape[0], X_train.shape[0]))
    for X_ind, leaf_node in enumerate(test_leaf_nodes):
        samples_in_leaf_mask = train_leaf_nodes == leaf_node
        n_samples_in_leaf = np.sum(samples_in_leaf_mask)
        if n_samples_in_leaf == 0:
            # No training sample shares this leaf (e.g. the tree was fit on
            # other data): there is nothing to weight, and dividing by the
            # zero count would only raise a divide-by-zero warning.
            continue
        weights[X_ind, samples_in_leaf_mask] += 1.0 / n_samples_in_leaf

    return weights

In [7]:
# Rebuild the single tree's outputs from the training targets alone:
# one row of training-sample weights per test sample.
weights = weights_training_data(X_train, X_test, dtr)

# Weighted mean of y_train under each test sample's weights.
also_mean = np.array(
    [np.average(y_train, weights=row_weights) for row_weights in weights]
)
# Weighted variance of y_train under the same weights.
also_var = np.array(
    [weighted_variance(y_train, row_weights) for row_weights in weights]
)

In [8]:
# The tree's own outputs should match the weight-based reconstruction.
for reference, reconstructed in ((mean, also_mean), (var, also_var)):
    print(np.allclose(reference, reconstructed))

True
True


In [9]:
# Forest of 100 depth-5 trees with bootstrapping switched off.
# NOTE(review): presumably bootstrap=False is needed so that every tree is
# fit on the full X_train and the leaf weights below apply — confirm.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    bootstrap=False,
    random_state=0,
)
rf.fit(X_train, y_train)
# Ask the forest for both the mean and the standard deviation.
rf_mean, rf_std = rf.predict(X_test, return_std=True)
# Convert the standard deviation to a variance.
rf_var = rf_std * rf_std

[-116.16509319   71.6467564   -42.5778626    58.69047161  354.76241583
  117.91240068 -116.16509319  -42.5778626   -77.321321     71.6467564
   54.67083467  117.91240068  166.23400152  -42.5778626   117.91240068
 -418.10262055 -267.50726045  117.91240068  166.23400152 -116.16509319
   54.67083467   58.69047161   54.67083467  -77.321321   -418.10262055
  -42.5778626   117.91240068  -42.5778626  -213.56974772 -195.75384238
   58.69047161   54.67083467   71.6467564    58.69047161  117.91240068
  117.91240068   69.56159921  297.89777748  -14.54337953   54.67083467
  117.91240068  -42.5778626  -418.10262055   58.69047161  117.91240068
  117.91240068  117.91240068   58.69047161  166.23400152 -307.44667198
   54.67083467   58.69047161  -42.5778626   117.91240068   69.56159921
  166.23400152  166.23400152  117.91240068  -42.5778626   -77.321321
   54.67083467  166.23400152 -195.75384238 -116.16509319 -116.16509319
 -213.56974772   58.69047161 -116.16509319   54.67083467 -307.44667198
  -77.321

In [10]:
# Rebuild the forest's outputs from the training targets alone.
weights = np.zeros((X_test.shape[0], X_train.shape[0]))
also_rf_mean = np.zeros(X_test.shape[0])
also_rf_var = np.zeros(X_test.shape[0])

# Accumulate every tree's training-sample weights into the same array
# (weights_training_data adds to `weights` in place and returns it).
for tree in rf.estimators_:
    weights = weights_training_data(X_train, X_test, tree, weights=weights)

# Average over the trees so each row holds the forest-level weights.
weights /= len(rf.estimators_)
for i in range(X_test.shape[0]):
    also_rf_mean[i] = np.average(y_train, weights=weights[i])
    also_rf_var[i] = weighted_variance(y_train, weights[i])

6.8212102633e-13
[ 10815.45483241  12956.205579     5572.40159146   9436.4433567
  11255.62703131  13334.11489382  10815.45483241   5572.40159146
  13517.96087533  12956.205579     6363.73406556  13334.11489382
  13812.09440682   5572.40159146  13334.11489382   3092.13723901
   6583.32788853  13334.11489382  13812.09440682  10815.45483241
   6363.73406556   9436.4433567    6363.73406556  13517.96087533
   3092.13723901   5572.40159146  13334.11489382   5572.40159146
   6362.51450869  10936.33609202   9436.4433567    6363.73406556
  12956.205579     9436.4433567   13334.11489382  13334.11489382
  10234.27688969  10892.48163387   5375.30846345   6363.73406556
  13334.11489382   5572.40159146   3092.13723901   9436.4433567
  13334.11489382  13334.11489382  13334.11489382   9436.4433567
  13812.09440682   7758.39046172   6363.73406556   9436.4433567
   5572.40159146  13334.11489382  10234.27688969  13812.09440682
  13812.09440682  13334.11489382   5572.40159146  13517.96087533
   6363.7340

In [12]:
# The forest's own outputs should match the weight-based reconstruction.
for reference, reconstructed in ((rf_mean, also_rf_mean), (rf_var, also_rf_var)):
    print(np.allclose(reference, reconstructed))

True
True
