In [134]:
import numpy as np

In [135]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
from sklearn.metrics import confusion_matrix

In [136]:
class MajorityVoteClassifier(ClassifierMixin, BaseEstimator):
    """A majority vote ensemble classifier
    
    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        Different classifiers of the ensemble. They are fitted in place
        by `fit`, i.e. the very objects passed in are the ones trained.
        
    vote : str, ['classlabel', 'probability']
        Default: 'classlabel'
        'classlabel' predicts the (weighted) mode of the labels predicted
        by the individual classifiers; 'probability' predicts the class
        with the highest (weighted) mean of their `predict_proba` outputs.
        
    weights : array-like, shape = [n_classifiers]
        Optional, default: None
        Per-classifier vote weights; None gives every classifier the
        same weight.

    Attributes
    ----------
    classes_ : array, shape = [n_classes]
        Sorted unique labels seen by `fit`.

    preds : array, shape = [n_samples, n_classifiers]
        Labels predicted by each classifier during the most recent
        `predict` call made with vote='classlabel'.
    """
    def __init__(self, classifiers, vote='classlabel', weights=None):
        # Only store the arguments. Cloning inside a `for clf in ...` loop
        # would merely rebind the loop variable and discard the clone, and
        # scikit-learn expects constructor arguments to be kept unmodified.
        self.classifiers = classifiers
        self.vote = vote
        self.weights = weights
        
    def fit(self, X, y):
        """Fit classifiers
        
        Parameters
        ----------
        X : shape = [n_samples, n_features]
        y : shape = [n_samples, 1]

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If `vote` is not a supported mode or `weights` does not hold
            one entry per classifier.
        """
        if self.vote not in ('classlabel', 'probability'):
            raise ValueError("vote must be 'classlabel' or 'probability'; "
                             "got {!r}".format(self.vote))
        if (self.weights is not None
                and len(self.weights) != len(self.classifiers)):
            raise ValueError('Number of classifiers and weights must be '
                             'equal; got {} weights, {} classifiers'
                             .format(len(self.weights),
                                     len(self.classifiers)))
        # np.unique returns the labels sorted; used to map averaged
        # probability columns back to labels in `predict`.
        self.classes_ = np.unique(y)
        for clf in self.classifiers:
            clf.fit(X, y)
        return self
            
    def predict(self, X):
        """Predict class labels for X by combining the classifiers' votes

        Parameters
        ----------
        X : shape = [n_samples, n_features]

        Returns
        -------
        maj_vote : array, shape = [n_samples]
        """
        if self.vote == 'probability':
            # Requires a prior call to `fit` (for `classes_`) and assumes
            # every classifier orders its predict_proba columns like
            # `classes_` -- TODO confirm for non-scikit-learn estimators.
            probas = np.asarray([clf.predict_proba(X)
                                 for clf in self.classifiers])
            avg_proba = np.average(probas, axis=0, weights=self.weights)
            return self.classes_[np.argmax(avg_proba, axis=1)]

        # One row per sample, one column per classifier.
        self.preds = np.array([clf.predict(X)
                               for clf in self.classifiers]).T
        
        def find_mode(arr):
            # np.bincount only accepts non-negative integer labels; on a
            # tie np.argmax returns the smallest label.
            return np.argmax(np.bincount(arr, weights=self.weights))
        return np.apply_along_axis(find_mode, 1, self.preds)

In [137]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
iris = datasets.load_iris()
# Skip the first 50 rows and keep only feature columns 1 and 2.
X = iris.data[50:, [1, 2]]
# Re-encode the remaining target values as consecutive integers from 0.
le = LabelEncoder()
y = le.fit_transform(iris.target[50:])

In [138]:
# Fixed seed so the 50/50 stratified split -- and every score computed
# from it below -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1, stratify=y)

In [139]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [140]:
# Logistic regression and k-NN are each wrapped in a pipeline with a
# StandardScaler; the decision tree is used on the raw features.
pipe1 = Pipeline([('sc', StandardScaler()),
                  ('lr', LogisticRegression())])
# Seed the tree so its cross-validation scores and test predictions are
# reproducible between runs.
clf2 = DecisionTreeClassifier(random_state=1)
pipe3 = Pipeline([('sc', StandardScaler()),
                  ('knn', KNeighborsClassifier())])
my_ensemble = MajorityVoteClassifier([pipe1, clf2, pipe3])

In [141]:
# Mean and standard deviation of 10-fold CV accuracy, one entry per model,
# in the order: pipe1, clf2, pipe3, my_ensemble.
scores, scores_std = [], []
for clf in (pipe1, clf2, pipe3, my_ensemble):
    score = cross_val_score(clf, X_train, y_train,
                            scoring='accuracy', cv=10)
    scores.append(score.mean())
    scores_std.append(score.std())

In [142]:
# Labels are listed in the same order the models were cross-validated.
model_names = ('logistic', 'tree', 'knn', 'ensemble')
for idx, name in enumerate(model_names):
    mean_acc, acc_std = scores[idx], scores_std[idx]
    print('{} accuracy: {:.2f} +/- {:.2f}'.format(name, mean_acc, acc_std))

logistic accuracy: 0.93 +/- 0.15
tree accuracy: 0.89 +/- 0.16
knn accuracy: 0.89 +/- 0.16
ensemble accuracy: 0.94 +/- 0.13


In [143]:
# Train the ensemble on the training half, then tabulate its predictions
# on the held-out half against the true labels.
my_ensemble.fit(X_train, y_train)
ensemble_test_pred = my_ensemble.predict(X_test)
confusion_matrix(y_test, ensemble_test_pred)

array([[23,  2],
       [ 2, 23]])

In [144]:
# clf2 is only fitted as a member of my_ensemble (see the fit call above),
# so this cell must run after the ensemble has been trained.
tree_test_pred = clf2.predict(X_test)
confusion_matrix(y_test, tree_test_pred)

array([[23,  2],
       [ 2, 23]])

In [145]:
# Column 2 of the ensemble's stored votes belongs to pipe3 (third member);
# compare it element-wise with pipe3's own predictions.
np.equal(my_ensemble.preds[:, 2], pipe3.predict(X_test))

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True], dtype=bool)

# Bagging

In [146]:
import pandas as pd
# The file has no header row; column names are assigned in the next cell.
WINE_DATA_URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                 'wine/wine.data')
df_wine = pd.read_csv(WINE_DATA_URL, header=None)

In [147]:
# Column names for the header-less wine file, in file order.
wine_columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df_wine.columns = wine_columns

In [149]:
# Drop the rows labelled class 1, then keep two features as the design
# matrix and the class label as the target.
not_class_one = df_wine['Class label'] != 1
df_wine = df_wine[not_class_one]
feature_cols = ['Alcohol', 'OD280/OD315 of diluted wines']
X = df_wine[feature_cols].values
y = df_wine['Class label'].values

In [150]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [154]:
# Re-encode the class labels, then hold out 20% of the samples with a
# stratified, seeded split.
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1, test_size=0.2)

In [155]:
from sklearn.ensemble import BaggingClassifier
# Depth-unlimited, entropy-based tree; used on its own and as the base
# learner of the bagging ensemble.
tree = DecisionTreeClassifier(max_depth=None,
                              criterion='entropy',
                              random_state=1)

In [157]:
# The base learner is passed positionally because its keyword differs by
# scikit-learn version (`base_estimator` in older releases, `estimator` in
# newer ones). The seed makes the 500 bootstrap samples reproducible.
bag = BaggingClassifier(tree, n_estimators=500, random_state=1)

In [159]:
from sklearn.metrics import accuracy_score
# Fit the single tree and report accuracy on the training and test sets.
tree = tree.fit(X_train, y_train)
tree_train_pred = tree.predict(X_train)
tree_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, tree_train_pred)
tree_test = accuracy_score(y_test, tree_test_pred)
tree_report = 'Decision tree train/test accuracies {:.3f} / {:.3f}'
print(tree_report.format(tree_train, tree_test))

Decision tree train/test accuracies 1.000 / 0.833


In [162]:
# Fit the bagging ensemble and report accuracy on the training and test sets.
bag.fit(X_train, y_train)
bag_train_pred = bag.predict(X_train)
bag_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, bag_train_pred)
bag_test = accuracy_score(y_test, bag_test_pred)
bag_report = 'Bag train/test accuracy: {:.3f}/{:.3f}'
print(bag_report.format(bag_train, bag_test))

Bag train/test accuracy: 1.000/0.917


# Adaptive Boosting (attacking bias)