In [1]:
from sklearn.datasets import make_regression
import numpy as np
from sklearn.cross_validation import train_test_split
from skopt.learning import RandomForestRegressor



In [2]:
def weighted_variance(y, weights=None):
    """
    Return the weighted (biased) variance of ``y``.

    Computes ``sum(w * (y - m)**2) / sum(w)`` where ``m`` is the weighted
    mean of ``y`` as returned by ``np.average``.

    Parameters
    ----------
    y : array-like
        Values whose spread is measured.
    weights : array-like or None, optional
        One non-negative weight per entry of ``y``. ``None`` (the default)
        means every entry gets the same weight, which yields the ordinary
        population variance.

    Returns
    -------
    float
        The weighted variance.
    """
    y = np.asarray(y)
    if weights is None:
        # Uniform weights: previously the default crashed on `None * array`.
        weights = np.ones(y.shape)
    else:
        weights = np.asarray(weights)

    w_mean = np.average(y, weights=weights)
    return np.sum(weights * (y - w_mean) ** 2) / np.sum(weights)

In [3]:
# Synthetic regression problem with fixed seeds so every run yields the same data.
X, y = make_regression(n_samples=500, n_features=100, random_state=0)

# Hold out part of the data for testing (split size left at the library default).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [4]:
from sklearn.tree import DecisionTreeRegressor

In [5]:
# Fit a shallow regression tree; `fit` hands back the estimator itself.
dtr = DecisionTreeRegressor(max_depth=5, random_state=0).fit(X_train, y_train)

# Tree prediction for every held-out sample.
mean = dtr.predict(X_test)

# Impurity stored on the node that `apply` reports for each held-out sample.
var = dtr.tree_.impurity[dtr.apply(X_test)]

In [6]:
def weights_training_data(X_train, X_test, tree, weights=None):
    """
    Return weights on y_train for a single tree, as given by formula (4) in
    http://www.jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf

    For every test sample, each training sample that ``tree.apply`` places in
    the same leaf receives weight ``1 / n_leaf`` (``n_leaf`` being the number
    of training samples in that leaf); all other training samples receive 0.

    For an ensemble it might be useful to pass the ``weights`` array returned
    by a previous call: this function only *adds* the current tree's
    contribution, so any averaging over trees is left to the caller.

    Parameters
    ----------
    X_train : array, shape (n_train, n_features)
        Training inputs.
    X_test : array, shape (n_test, n_features)
        Inputs for which the weights are wanted.
    tree : object with an ``apply(X)`` method
        ``apply`` must return one leaf identifier per row of ``X``
        (e.g. a fitted ``DecisionTreeRegressor``).
    weights : float array, shape (n_test, n_train), optional
        Accumulator that is updated in place. A fresh zero matrix is
        allocated when omitted.

    Returns
    -------
    weights : array, shape (n_test, n_train)
        The (possibly caller-supplied) array with this tree's weights added.
    """
    # Use the tree that was passed in, not whatever global happens to exist.
    train_leaf_nodes = np.asarray(tree.apply(X_train))
    test_leaf_nodes = np.asarray(tree.apply(X_test))

    if weights is None:
        weights = np.zeros((X_test.shape[0], X_train.shape[0]))

    for X_ind, leaf_node in enumerate(test_leaf_nodes):
        samples_in_leaf_mask = train_leaf_nodes == leaf_node
        n_in_leaf = np.count_nonzero(samples_in_leaf_mask)
        if n_in_leaf == 0:
            # No training sample shares this leaf: nothing to add, and
            # skipping avoids a division by zero.
            continue
        weights[X_ind, samples_in_leaf_mask] += 1.0 / n_in_leaf

    return weights

In [7]:
# One row of training-sample weights per held-out sample, derived from the tree.
weights = weights_training_data(X_train, X_test, dtr)

# Rebuild the per-sample mean and variance purely from those weights.
also_mean = np.zeros(X_test.shape[0])
also_var = np.zeros_like(also_mean)

for i, row_weights in enumerate(weights):
    also_mean[i] = np.average(y_train, weights=row_weights)
    also_var[i] = weighted_variance(y_train, row_weights)

In [8]:
# Compare the tree's own outputs with the weight-based reconstruction,
# printing one boolean per quantity (mean first, then variance).
for reference, recomputed in ((mean, also_mean), (var, also_var)):
    print(np.allclose(reference, recomputed))

True
True
