In [7]:
import numpy as np 
import math 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [8]:
class DecisionStump:
    """Single-feature threshold classifier that outputs labels in {-1, +1}.

    Attributes (all set externally by whoever trains the stump):
        polarity: 1 labels values <= threshold as -1; any other value labels
            values > threshold as -1.
        feature_idx: column of X that the stump inspects.
        threshold: split value compared against the selected column.
        alpha: weight of this stump in an ensemble vote.
    """

    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None

    def predict(self, X):
        """Return a float array with one -1/+1 label per row of X."""
        feature_values = X[:, self.feature_idx]

        # Decide which side of the threshold gets the negative label.
        if self.polarity == 1:
            negative_side = feature_values <= self.threshold
        else:
            negative_side = feature_values > self.threshold

        return np.where(negative_side, -1.0, 1.0)


class AdaBoost:
    """Discrete AdaBoost built from DecisionStump weak learners.

    Labels are expected to be encoded as -1 / +1, because both the weight
    update and the final vote multiply labels and stump outputs directly.

    Args:
        n_clf: number of boosting rounds, i.e. number of stumps to train.
    """

    def __init__(self, n_clf=5):
        self.n_clf = n_clf

    def fit(self, X, y):
        """Train ``n_clf`` stumps sequentially on re-weighted samples.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of -1 / +1 labels, one per row of X.

        The trained stumps are stored in ``self.clfs``; each stump also gets
        a ``min_error`` attribute holding its weighted training error.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Start from a uniform distribution over the training samples.
        w = np.full(n_samples, (1 / n_samples))
        self.clfs = []
        for _ in range(self.n_clf):
            clf = DecisionStump()

            # Exhaustive search over every (feature, threshold) candidate.
            min_error = float('inf')
            for fi in range(n_features):
                X_column = X[:, fi]
                thresholds = np.unique(X_column)

                for t in thresholds:
                    polarity = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column <= t] = -1

                    # Weighted error = total weight of misclassified samples.
                    error = w[y != predictions].sum()

                    # A stump that is wrong more than half the time becomes
                    # a better-than-chance stump once its output is flipped.
                    if error > 0.5:
                        error = 1 - error
                        polarity = -1

                    if error < min_error:
                        min_error = error
                        clf.min_error = min_error
                        clf.polarity = polarity
                        clf.threshold = t
                        clf.feature_idx = fi

            # The vote weight must come from the error of the stump that was
            # actually selected (min_error), not from whichever candidate the
            # search loop happened to evaluate last. The small epsilon avoids
            # a division by zero when the stump is perfect.
            clf.alpha = 0.5 * np.log((1 - min_error) / (min_error + 1e-10))

            # Increase the weight of misclassified samples, decrease the rest,
            # then renormalize so the weights sum to 1 again.
            predictions = clf.predict(X)
            w *= np.exp(-clf.alpha * y * predictions)
            w /= np.sum(w)

            self.clfs.append(clf)

    def predict(self, X):
        """Return the sign of the alpha-weighted sum of all stump votes.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Array of -1 / +1 values (0 where the weighted votes cancel out).
        """
        X = np.asarray(X)
        preds = np.sum([c.alpha * c.predict(X) for c in self.clfs], axis=0)
        return np.sign(preds)


In [9]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and recode label 0 as -1 so that the targets
# are in {-1, +1}, which is what the AdaBoost class above works with.
data = datasets.load_breast_cancer()
X = data.data
y = data.target.copy()  # copy so the relabeling does not modify data.target
y[y == 0] = -1


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# From-scratch AdaBoost with 100 decision stumps.
model = AdaBoost(100)
model.fit(X_train, y_train)
y_preds = model.predict(X_val)
print("Accuracy score", accuracy_score(y_val, y_preds))
print(model.clfs)

import xgboost
# model = xgboost.XGBClassifier()
# Reference model: scikit-learn's AdaBoostClassifier with default settings.
model = AdaBoostClassifier()
model.fit(X_train, y_train)
y_preds = model.predict(X_val)
# The label names the model that is actually evaluated here.
print("sklearn AdaBoostClassifier Accuracy score", accuracy_score(y_val, y_preds))


NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.