# Conditional variances for tree-based methods using weights on training data

In [1]:
from sklearn.datasets import make_regression
import numpy as np
from sklearn.model_selection import train_test_split

# Uses the approach as described in http://arxiv.org/pdf/1211.0906v2.pdf
from skopt.learning import RandomForestRegressor

In [2]:
def weighted_variance(y, weights=None):
    """Return the weighted variance of ``y``.

    Computes ``sum(w * (y - m) ** 2) / sum(w)``, where ``m`` is the weighted
    mean of ``y`` as returned by ``np.average``.

    Parameters
    ----------
    y : array-like
        Values whose variance is computed.
    weights : array-like or None, default=None
        One weight per entry of ``y``. ``None`` weights every entry equally.

    Returns
    -------
    float
        The weighted variance of ``y``.
    """
    y = np.asarray(y)
    if weights is None:
        # The signature advertises ``weights=None``; treat it as uniform
        # weighting instead of failing on ``None * array``.
        weights = np.ones(y.shape)
    else:
        # Accept plain sequences as well as arrays.
        weights = np.asarray(weights)
    w_mean = np.average(y, weights=weights)
    return np.sum(weights * (y - w_mean) ** 2) / np.sum(weights)

In [3]:
# Synthetic regression problem: 500 samples with 100 features each.
X, y = make_regression(n_samples=500, n_features=100, random_state=0)

# Hold out part of the data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [4]:
from sklearn.tree import DecisionTreeRegressor

In [5]:
# A single depth-limited tree provides the reference values.
dtr = DecisionTreeRegressor(random_state=0, max_depth=5)
dtr.fit(X_train, y_train)

# Reference mean: the tree's own prediction for every test sample.
mean = dtr.predict(X_test)

# Reference variance: impurity stored at the leaf each test sample lands in.
test_leaves = dtr.apply(X_test)
var = dtr.tree_.impurity[test_leaves]

In [6]:
def weights_training_data(X_train, X_test, estimator, weights=None):
    """Return the weights a single tree puts on the training targets.

    Implements formula (4) of Meinshausen (2006),
    http://www.jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf

    For every test sample, each training sample that ``estimator.apply``
    maps to the same leaf receives ``1 / (number of training samples in
    that leaf)``; all other training samples receive nothing.

    Parameters
    ----------
    X_train : array with a ``shape`` attribute
        Training inputs, passed to ``estimator.apply``.
    X_test : array with a ``shape`` attribute
        Test inputs, passed to ``estimator.apply``.
    estimator : object with an ``apply`` method
        ``apply(X)`` must return one leaf identifier per row of ``X``.
    weights : array of shape (n_test, n_train) or None, default=None
        Running total to add to. It is modified in place, so for an
        ensemble the weights of successive trees can be accumulated by
        passing the previous result back in. ``None`` starts from zeros.

    Returns
    -------
    weights : array of shape (n_test, n_train)
        The (updated) weights; the same object as the ``weights`` argument
        when one was given.
    """
    leaves_train = estimator.apply(X_train)
    leaves_test = estimator.apply(X_test)

    if weights is None:
        n_test, n_train = X_test.shape[0], X_train.shape[0]
        weights = np.zeros((n_test, n_train))

    for row, leaf in enumerate(leaves_test):
        # Training samples that ended up in the same leaf as this test sample.
        shares_leaf = leaves_train == leaf
        # Summing the boolean mask counts its True entries.
        share = 1.0 / np.sum(shares_leaf)
        weights[row][shares_leaf] += share

    return weights

In [7]:
# Per-test-sample weights over the training targets, from the single tree.
weights = weights_training_data(X_train, X_test, dtr)

n_test = X_test.shape[0]
also_mean = np.zeros(n_test)
also_var = np.zeros(n_test)

# Recompute every prediction and its variance from y_train and the weights.
for i in range(n_test):
    w_i = weights[i]
    also_mean[i] = np.average(y_train, weights=w_i)
    also_var[i] = weighted_variance(y_train, w_i)

In [8]:
# Compare the tree's own mean/variance with the weight-based recomputation.
for reference, recomputed in ((mean, also_mean), (var, also_var)):
    print(np.allclose(reference, recomputed))

True
True


In [9]:
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    bootstrap=False,
    random_state=0,
)
rf.fit(X_train, y_train)

# The forest reports a standard deviation; square it to compare variances.
rf_mean, rf_std = rf.predict(X_test, return_std=True)
rf_var = np.square(rf_std)

In [10]:
# Start from zero weights and let every tree of the forest add its share.
weights = np.zeros((X_test.shape[0], X_train.shape[0]))
also_rf_mean = np.zeros(X_test.shape[0])
also_rf_var = np.zeros(X_test.shape[0])

for tree in rf.estimators_:
    # Passing the running total back in accumulates the weights across trees.
    weights = weights_training_data(X_train, X_test, tree, weights=weights)

# Average the accumulated weights over the number of trees.
weights /= len(rf.estimators_)

# Recompute the forest's mean and variance from y_train and the weights.
for i in range(X_test.shape[0]):
    also_rf_mean[i] = np.average(y_train, weights=weights[i])
    also_rf_var[i] = weighted_variance(y_train, weights[i])

In [11]:
# Compare the forest's own mean/variance with the weight-based recomputation.
for reference, recomputed in ((rf_mean, also_rf_mean), (rf_var, also_rf_var)):
    print(np.allclose(reference, recomputed))

True
True
