In [None]:
import numpy as np
import pandas as pd
# import clone
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.datasets import load_breast_cancer

# import cross_validate
from sklearn.model_selection import cross_validate, StratifiedKFold

In [None]:
class BaggedTreeClassifier(object):
    """Bagging ensemble of decision trees for binary (0/1) classification.

    Every ensemble member is a ``DecisionTreeClassifier(class_weight='balanced')``
    fitted on a bootstrap resample of the training data (drawn with
    replacement, same number of rows as the training set). ``predict``
    averages the members' predictions and rounds the mean, which is a
    majority vote only when the labels are 0 and 1.

    Bootstrap indices are drawn with ``np.random.choice``, i.e. from NumPy's
    global random state; seed it (``np.random.seed``) for repeatable resamples.

    Parameters
    ----------
    n_elements : int, default=100
        Number of bootstrap resamples, and therefore trees, in the ensemble.

    Attributes
    ----------
    models : list
        The fitted trees, one per bootstrap sample; empty until ``fit`` runs.
    """

    def __init__(self, n_elements=100):
        self.n_elements = n_elements
        self.models = []

    def __make_bootstraps(self, data):
        """Draw ``n_elements`` bootstrap resamples of the rows of ``data``.

        Parameters
        ----------
        data : ndarray of shape (n_samples, n_columns)
            Rows to resample.

        Returns
        -------
        list of (ndarray, ndarray)
            One ``(in_bag, out_of_bag)`` pair per resample. ``in_bag`` has
            ``n_samples`` rows drawn with replacement; ``out_of_bag`` holds the
            rows that were never drawn (zero rows if every row was drawn).
        """
        n_samples = data.shape[0]
        all_idx = np.arange(n_samples)
        samples = []
        for _ in range(self.n_elements):
            in_idx = np.random.choice(all_idx, n_samples, replace=True)
            # Rows never selected for this resample form its out-of-bag set.
            oob_idx = np.setdiff1d(all_idx, in_idx)
            samples.append((data[in_idx, :], data[oob_idx, :]))
        return samples

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is accepted but unused)."""
        return {'n_elements': self.n_elements}

    def set_params(self, **params):
        """Set constructor parameters by keyword and return ``self``.

        Raises
        ------
        ValueError
            If a keyword is not one of the names reported by ``get_params``.
        """
        valid = self.get_params()
        for name, value in params.items():
            if name not in valid:
                raise ValueError(
                    'Invalid parameter %r for BaggedTreeClassifier.' % name)
            setattr(self, name, value)
        return self

    def fit(self, X_train, y_train, print_metrics=False):
        """Fit one decision tree per bootstrap resample of the training data.

        Any previously fitted trees are discarded, so calling ``fit`` again
        retrains the ensemble instead of growing it.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Training features.
        y_train : array-like of shape (n_samples,)
            Training labels.
        print_metrics : bool, default=False
            When true, score every tree on its out-of-bag rows and print, once
            after all trees are fitted, the standard deviation of accuracy,
            precision and recall across the trees.

        Returns
        -------
        self : BaggedTreeClassifier
            The fitted ensemble.
        """
        X_train = np.asarray(X_train)
        # np.asarray lets non-ndarray targets (lists, pandas Series) be reshaped.
        y_column = np.asarray(y_train).reshape(-1, 1)
        # Keep features and target in one array so they are resampled together;
        # the target is the last column.
        training_data = np.concatenate((X_train, y_column), axis=1)
        resamples = self.__make_bootstraps(training_data)

        accs, pres, recs = [], [], []
        fitted = []

        cls = DecisionTreeClassifier(class_weight='balanced')
        for in_bag, out_of_bag in resamples:
            model = clone(cls)
            # Hand the target over as a 1-D vector, the shape estimators expect.
            model.fit(in_bag[:, :-1], in_bag[:, -1])
            fitted.append(model)

            # Out-of-bag scoring is only needed for the optional report.
            if print_metrics and out_of_bag.size:
                y_true = out_of_bag[:, -1]
                y_hat = model.predict(out_of_bag[:, :-1])
                # Plain lists accept both Python floats and NumPy scalars.
                accs.append(accuracy_score(y_true, y_hat))
                pres.append(precision_score(y_true, y_hat))
                recs.append(recall_score(y_true, y_hat))

        self.models = fitted

        # Report once, after the whole ensemble is trained.
        if print_metrics:
            nan = float('nan')  # shown when no tree had any out-of-bag rows
            print('Standard error in accuracy: ', np.std(accs) if accs else nan)
            print('Standard error in precision: ', np.std(pres) if pres else nan)
            print('Standard error in recall: ', np.std(recs) if recs else nan)

        return self

    def predict(self, X):
        """Predict labels by averaging the trees' predictions and rounding.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        ndarray of shape (n_samples,) or None
            Rounded mean of the per-tree predictions (floats). ``np.round``
            rounds halves to the nearest even value, so an exact 50/50 split
            between 0 and 1 yields 0. Returns ``None``, after printing a
            notice, if the ensemble has not been fitted.
        """
        if not self.models:
            print('No models found. Please train the model first.')
            return None
        # One column of predictions per tree.
        votes = np.stack([np.ravel(m.predict(X)) for m in self.models], axis=1)
        return np.round(np.mean(votes, axis=1))



In [None]:
## load the breast cancer dataset that ships with scikit-learn ##
data = load_breast_cancer()
# feature matrix and label vector, unpacked in one step
X, y = data.data, data.target

In [None]:
## train the custom bagged ensemble and print its metric spread ##
# The bootstrap resamples are drawn from NumPy's global random state, so seed
# it here to make the printed numbers repeatable on Restart & Run All.
np.random.seed(42)
ens = BaggedTreeClassifier()
# The trailing semicolon keeps the notebook from echoing fit()'s return value.
ens.fit(X, y, print_metrics=True);

Standard error in accuracy:  0.0
Standard error in precision:  0.0
Standard error in recall:  0.0
Standard error in accuracy:  0.02271824237466402
Standard error in precision:  0.004601648351648391
Standard error in recall:  0.04162531017369725
Standard error in accuracy:  0.02073421410406947
Standard error in precision:  0.0049057604636934295
Standard error in recall:  0.0373723509823977
Standard error in accuracy:  0.02069528593160894
Standard error in precision:  0.00648003198994401
Standard error in recall:  0.03464406409881267
Standard error in accuracy:  0.021437593538067738
Standard error in precision:  0.006486587056846458
Standard error in recall:  0.03791430916487297
Standard error in accuracy:  0.020417738555164083
Standard error in precision:  0.005948606899802399
Standard error in recall:  0.03658260123817905
Standard error in accuracy:  0.01908084527158395
Standard error in precision:  0.005634111960175172
Standard error in recall:  0.03476612836213352
Standard error in a

In [None]:
## use k fold cross validation to measure performance ##
scoring_metrics = ['accuracy', 'precision', 'recall']
dcScores = cross_validate(
    ens,
    X,
    y,
    cv=StratifiedKFold(10),
    scoring=scoring_metrics,
)
# Report each metric averaged over the 10 stratified folds.
for report_line, score_key in (
    ('Mean Accuracy: %.2f', 'test_accuracy'),
    ('Mean Precision: %.2f', 'test_precision'),
    ('Mean Recall: %.2f', 'test_recall'),
):
    print(report_line % np.mean(dcScores[score_key]))

Mean Accuracy: 0.96
Mean Precision: 0.96
Mean Recall: 0.98


In [None]:
## import the scikit-learn model ##
from sklearn.ensemble import BaggingClassifier

In [None]:
# Reference implementation: scikit-learn's bagging wrapper around the same
# class-balanced decision tree, with 100 estimators.
# The tree is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases (and the old
# name later removed); the first positional slot works with either spelling.
ens = BaggingClassifier(
    DecisionTreeClassifier(class_weight='balanced'),
    n_estimators=100,
)

In [None]:
## use k fold cross validation to measure performance ##
scoring_metrics = ['accuracy', 'precision', 'recall']
dcScores = cross_validate(
    ens,
    X,
    y,
    cv=StratifiedKFold(10),
    scoring=scoring_metrics,
)
# Report each metric averaged over the 10 stratified folds.
for report_line, score_key in (
    ('Mean Accuracy: %.2f', 'test_accuracy'),
    ('Mean Precision: %.2f', 'test_precision'),
    ('Mean Recall: %.2f', 'test_recall'),
):
    print(report_line % np.mean(dcScores[score_key]))



Mean Accuracy: 0.96
Mean Precision: 0.97
Mean Recall: 0.97
