# Import packages

In [1]:
import numpy as np
from sklearn.datasets import load_iris, load_boston, load_digits, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from itertools import compress
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Adaboost from scratch

Let's implement Adaboost SAMME regarding R.Schapire paper.

We will try to comment the code step by step, so the reader can learn both concept and code.

In [2]:
class AdaBoost_SAMME():
    """ Homemade AdaBoost SAMME classifier class """
   
    alpha = []
    H = []
    
    def __init__(self, base_estimator, n_estimators, K):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.K = K


    def fit(self, X, y):
        n = len(X)
        weights = np.ones(n)/n
        self.alpha = []
        self.H = []
        
        for m in range(0, self.n_estimators):
            
            # fit base estimator on the weighted distribution
            h_m = self.base_estimator.fit(X, y, sample_weight=weights)
            h_m_preds = h_m.predict(X)
            
            # List of weighted classifiers
            self.H.append(clone(h_m).fit(X, y, sample_weight=weights))
            
            # Calculate the weighted error
            err = np.isclose(y, h_m_preds)
            indicatrice_error = [not c for c in err]            
            error_m = np.sum(list(compress(weights, indicatrice_error))) / np.sum(weights)
            
            # Calculate alpha_m -> the less error the bigger alpha
            alpha_m = np.log((1 - error_m) / error_m) + np.log(self.K - 1)
            self.alpha.append(alpha_m)
                         
            # Update the weights for the next iteration : the mispredicted observations are given a bigger weight
            indicatrice_vector = np.multiply(indicatrice_error, 1)
            weights = np.multiply(weights, np.exp(alpha_m * indicatrice_vector))
            
            # Renormalize the weights
            weights = weights / np.sum(weights)       
        
        return 
    

    def predict(self, X_new):
        y_new = []        
        new_preds = []
        
        # Predict the output of the new X given the M estimators previously fited 
        for m in range(0, len(self.H)):
            new_preds.append(self.H[m].predict(X_new))
        
        # Compute the matrix of weighted class predictions
        final_pred = []
        for i in range(0, len(X_new)):                        
            class_pred = []
            for k in range(0, self.K):              
                somme = 0
                for m in range(0, len(self.H)):                    
                        if new_preds[m][i]==k:
                            somme += self.alpha[m]                
                class_pred.append(somme)
            final_pred.append(class_pred)
        
        # Select the max weighted class prediction
        for i in range(0, len(final_pred)):
            y_new.append(np.argmax(final_pred[i]))
            
        return y_new

# Comparaison of our homemade Adaboost to sklearn Adaboost

In [3]:
# Loading the MNIST data
digits = load_digits()
X_digits = digits['data']
y_digits = digits['target']

In [4]:
# Building train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, test_size=0.25)

# Fit and predict of homemade Adaboost
ada_homemade = AdaBoost_SAMME(base_estimator=DecisionTreeClassifier(max_depth=5),
                              n_estimators=50, K=len(digits.target_names))
ada_homemade.fit(X_train, y_train)
print("Accuracy score of our homemade Adaboost classifier : ", accuracy_score(y_test, ada_homemade.predict(X_test)))

Accuracy score of our homemade Adaboost classifier :  0.98


In [5]:
# Fit and predict of sklearn Adaboost 
ada_sklearn = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5)
                                  , n_estimators=50, algorithm='SAMME') 
ada_sklearn.fit(X_train, y_train)
print("Accuracy score of sklearn Adaboost classifier : ", accuracy_score(y_test, ada_sklearn.predict(X_test)))

Accuracy score of sklearn Adaboost classifier :  0.9666666666666667
