In [12]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

data = datasets.load_breast_cancer()
X = data.data
y = data.target
y[y==0] = -1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=151)

In [16]:

class DecisionStump:
    
    def __init__(self):
        self.polarity = 1 # this tells us if a sample should be signed as +1 or -1 for a given threshold
        self.feature_index = None
        self.threshold = None
        self.alpha = None
    
    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_index]
        
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1
        return predictions
    
    
class Adaboost:
    
    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        # init the weights
        w = np.full(n_samples, (1/n_samples)) # creates a vector of 1/n_samples of length n_samples
        
        self.clfs = []
        for _ in range(self.n_clf):
            clf = DecisionStump()
            min_error= float('inf')
            for feature_i in range(n_features):
                X_column = X[:, feature_i]
                thresholds = np.unique(X_column)
                for threshold in thresholds:
                    p = 1 # polarity
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1
                    
                    # get error over mis-classified samples
                    missclassified = w[y != predictions]
                    error = sum(missclassified)
                    if error > 0.5:
                        error = 1-error
                        p = -1
                    
                    
                    # On the polarity parameter p:
                    # The amount of say (alpha) will be negative when the sample is correctly classified.
                    # The amount of say (alpha) will be positive when the sample is miss-classified.
                    # The rationale behind is:
                    # If it is a strong stump: we will decrease the weight for correctly-labels samples,
                    # and will increase the weight for incorrectly-labels samples more
                    # If it is a weak stump: we will decrease the weight for correctly-labels samples,
                    # and will increase the weight for incorrectly-labels samples less

                    if error < min_error:
                        min_error = error
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_index = feature_i
                        
            eps = 1e-10 # to avoid over/under flow
            clf.alpha = 0.5 * np.log(1-error/(error+eps)) # range in [0, 1] where 0 for perfect stump and 1 horrible stump
            
            predictions = clf.predict(X)
            
            w *= np.exp(-clf.alpha * y * predictions)
            w /= np.sum(w) # normalize new weights to sum to 1
            
            self.clfs.append(clf)
            
    def predict(self, X):
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs ]
        y_pred = np.sum(clf_preds, axis=0)
        y_pred = np.sign(y_pred)
        return y_pred

In [17]:
clf = Adaboost(n_clf = 5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(np.mean(y_test == y_pred))

0.14035087719298245
