In [151]:
RANDOM_STATE = 42

## Vanilla bagging

In [152]:
import numpy as np
from sklearn.base import clone
from sklearn.utils import check_random_state
from sklearn.ensemble.base import BaseEnsemble

        
class BaggingRegressor(BaseEnsemble):
    
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 random_state=None):
        super().__init__(base_estimator=base_estimator, n_estimators=n_estimators)
        
        self.random_state = random_state
        
    def fit(self, X, y):
        
        random_state = check_random_state(self.random_state)
        
        n, p = X.shape
        self.estimators_ = []
        self.estimators_features_ = []
        
        for i in range(self.n_estimators):
        
            # Sample observations and features
            rows = random_state.randint(0, n, n)
            cols = random_state.randint(0, p, p)
            X_fit = (X[rows])[:, cols]
            y_fit = y[rows]
        
            # Clone the estimator
            estimator = clone(self.base_estimator)
            
            # Fit the estimator
            estimator.fit(X_fit, y_fit)
            
            # Store the estimator and the associate feature indexes
            self.estimators_.append(estimator)
            self.estimators_features_.append(cols)
        
    def predict(self, X):
        
        # Get each estimator's predictions
        y_pred_all = np.array([
            estimator.predict(X[:, cols])
            for estimator, cols in zip(self.estimators_, self.estimators_features_)
        ])
        
        # Average the predictions
        y_pred = np.average(y_pred_all, axis=0)
        
        return y_pred

        
from sklearn import datasets
from sklearn import model_selection
from sklearn import tree

from sklearn import ensemble
        

X, y = datasets.load_boston(return_X_y=True)

n_estimators = 20

models = {
    'Decision tree': tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Bagging decision tree': BaggingRegressor(
        base_estimator=tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
    'Random forest': ensemble.RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
}

cv = model_selection.KFold(n_splits=10, random_state=RANDOM_STATE)

for name, estimator in models.items():
    scores = model_selection.cross_val_score(estimator, X, y, cv=cv, scoring='neg_mean_squared_error')
    print('MSE: {:.3f} (± {:.3f}) [{}]'.format(-np.mean(scores), np.std(scores), name))

MSE: 39.998 (± 29.146) [Decision tree]
MSE: 21.933 (± 22.682) [Bagging decision tree]
MSE: 21.934 (± 23.471) [Random forest]


In [163]:
import numpy as np
from sklearn import metrics
from sklearn.base import clone
from sklearn.ensemble.base import BaseEnsemble
        
        
class WeightedBaggingRegressor(BaseBagging):
    
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 random_state=None,
                 oob_metric=metrics.mean_squared_error):
        
        super().__init__(base_estimator=base_estimator, n_estimators=n_estimators)
        
        self.random_state = random_state
        self.oob_metric = oob_metric
        
    def fit(self, X, y):
     
        random_state = check_random_state(self.random_state)
        
        n, p = X.shape
        self.estimators_ = []
        self.estimators_features_ = []
        self.oob_scores_ = []
        
        for i in range(self.n_estimators):
        
            # Sample observations and features
            rows = random_state.randint(0, n, n)
            cols = random_state.randint(0, p, p)
            oob_rows = ~rows
            X_fit = (X[rows])[:, cols]
            y_fit = y[rows]
            X_val = (X[oob_rows])[:, cols]
            y_val = y[oob_rows]
        
            # Clone the estimator
            estimator = clone(self.base_estimator)
            
            # Fit the estimator
            estimator.fit(X_fit, y_fit)
            
            # Store the estimator and the associate feature indexes
            self.estimators_.append(estimator)
            self.estimators_features_.append(cols)
            
            # Store the OOB score
            oob_score = self.oob_metric(y_val, estimator.predict(X_val))
            self.oob_scores_.append(oob_score)
        
    def predict(self, X):
        
        # Get each estimator's predictions
        y_pred_all = np.array([
            estimator.predict(X[:, cols])
            for estimator, cols in zip(self.estimators_, self.estimators_features_)
        ])
        
        # Average the predictions
        weights = 1 / np.array(self.oob_scores_)
        y_pred = np.average(y_pred_all, axis=0, weights=weights)
        
        return y_pred

        
from sklearn import datasets
from sklearn import model_selection
from sklearn import tree

from sklearn import ensemble
        

X, y = datasets.load_boston(return_X_y=True)

n_estimators = 20

models = {
    'Decision tree': tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Bagging decision tree': BaggingRegressor(
        base_estimator=tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
    'Weighted bagging decision tree': WeightedBaggingRegressor(
        base_estimator=tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
    'Random forest': ensemble.RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
}

cv = model_selection.KFold(n_splits=10, random_state=RANDOM_STATE)


for name, estimator in models.items():
    scores = model_selection.cross_val_score(estimator, X, y, cv=cv, scoring='neg_mean_squared_error')
    print('MSE: {:.3f} (± {:.3f}) [{}]'.format(-np.mean(scores), np.std(scores), name))

MSE: 39.998 (± 29.146) [Decision tree]
MSE: 21.933 (± 22.682) [Bagging decision tree]
MSE: 22.828 (± 24.625) [Weighted bagging decision tree]
MSE: 21.934 (± 23.471) [Random forest]


## Localised weighted bagging

In [221]:
import numpy as np
from sklearn import metrics
from sklearn import neighbors
from sklearn.base import clone
from sklearn.ensemble.base import BaseEnsemble

        
class LocalisedWeightedBaggingRegressor(BaseBagging):
    
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 random_state=None,
                 local_metric=metrics.mean_squared_error,
                 neighbors=neighbors.NearestNeighbors(n_neighbors=10, algorithm='brute')):
        
        super().__init__(base_estimator=base_estimator, n_estimators=n_estimators)
        
        self.random_state = random_state
        self.local_metric = local_metric
        self.neighbors = neighbors
        
    def fit(self, X, y):
     
        random_state = check_random_state(self.random_state)
        
        self.X_check = X.copy()
        self.y_check = y.copy()
        self.neighbors.fit(self.X_check)
        
        n, p = X.shape
        self.estimators_ = []
        self.estimators_features_ = []
        
        for i in range(self.n_estimators):
        
            # Sample observations and features
            rows = random_state.randint(0, n, n)
            cols = random_state.randint(0, p, p)
            X_fit = (X[rows])[:, cols]
            y_fit = y[rows]
            
            # Clone the estimator
            estimator = clone(self.base_estimator)
            
            # Fit the estimator
            estimator.fit(X_fit, y_fit)
            
            # Store the estimator and the associate feature indexes
            self.estimators_.append(estimator)
            self.estimators_features_.append(cols)
        
    def predict(self, X):
        
        distances, neighbors_idxs = self.neighbors.kneighbors(X)
        y_pred = []
        
        for i, x in enumerate(X):
            
            X_neigh = self.X_check[neighbors_idxs[i]]
            y_neigh = self.y_check[neighbors_idxs[i]]
            
            weights = 1 / (1 + np.array([
                self.local_metric(y_neigh, estimator.predict(X_neigh[:, self.estimators_features_[j]]))
                for j, estimator in enumerate(self.estimators_)
            ]))
            
            y_pred_all = np.array([
                estimator.predict(x[self.estimators_features_[j]].reshape(1, -1))
                for j, estimator in enumerate(self.estimators_)
            ]).reshape(1, -1)[0]
            
            y_pred.append(np.average(y_pred_all, axis=0, weights=weights))
            
        return y_pred

        
from sklearn import datasets
from sklearn import model_selection
from sklearn import tree

from sklearn import ensemble
        

X, y = datasets.load_boston(return_X_y=True)

n_estimators = 20

models = {
    'Decision tree': tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Bagging decision tree': BaggingRegressor(
        base_estimator=tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
    'Weighted bagging decision tree': WeightedBaggingRegressor(
        base_estimator=tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
    'Localised weighted bagging decision tree': LocalisedWeightedBaggingRegressor(
        base_estimator=tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
    'Random forest': ensemble.RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=RANDOM_STATE
    ),
}




cv = model_selection.KFold(n_splits=10, random_state=RANDOM_STATE)

for name, estimator in models.items():
    scores = model_selection.cross_val_score(estimator, X, y, cv=cv, scoring='neg_mean_squared_error')
    print('MSE: {:.3f} (± {:.3f}) [{}]'.format(-np.mean(scores), np.std(scores), name))

MSE: 39.998 (± 29.146) [Decision tree]
MSE: 21.933 (± 22.682) [Bagging decision tree]
MSE: 22.828 (± 24.625) [Weighted bagging decision tree]
MSE: 20.570 (± 20.694) [Localised weighted bagging decision tree]
MSE: 21.934 (± 23.471) [Random forest]


In [180]:
a = model.neighbors.kneighbors(X)

In [186]:
a[1][1]

array([ 1, 47,  2, 49, 87])