In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [7]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [34]:
X = data.data
y = data.target

y = np.where(y == 0, -1, 1)
X,y

(array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
len(X_train) , len(y_train)

(455, 455)

In [17]:
class DecisionStump:
  def __init__(self):
    self.polarity = 1  #to decide the direction for classification (Greater than or less than)
    self.feature_index=None
    self.threshold = None
    self.alpha = None #Weight of this weak classifier

  def predict(self,X):
    n_samples = X.shape[0]
    X_column = X[:, self.feature_idx]
    predictions = np.ones(n_samples)
    if self.polarity == 1:
      predictions[X_column < self.threshold] = -1
    else:
      predictions[X_column > self.threshold] = -1
    return predictions

For each threshold in the feature values, we initially set p = 1. This setting means that:
If a sample’s feature value is less than the threshold, we classify it as -1.
If a sample’s feature value is greater than or equal to the threshold, we classify it as +1.

If the error is found to be greater than 50%, the code flips polarity by setting p = -1, meaning that samples above the threshold are assigned -1 and those below the threshold are assigned +1

In [35]:
class AdaBoost:
  def __init__(self, n_clf=5):
    self.n_clf = n_clf
    self.clfs = []

  def fit(self,X,y):
    n_samples, n_features = X.shape

    #Inital all weights will be 1/N
    w=np.full(n_samples, (1/n_samples))
    self.clfs= []

    for _ in range(self.n_clf):   #for each classifier (DecisionStump)
      clf = DecisionStump()
      min_error = float("inf")

      for ith_feature in range(n_features):
        thresholds = np.unique(X[:,ith_feature])

        for threshold in thresholds:
          #predict with polarity 1
          p=1
          predictions = np.ones(n_samples)
          predictions[X[:,ith_feature] < threshold] = -1

          misclassified = w[predictions!=y]
          error = sum(misclassified)

          if error > 0.5:
            error = 1 - error
            p = -1

          if error < min_error:
            clf.polarity = p
            clf.threshold = threshold
            clf.feature_idx = ith_feature
            min_error = error


    epsilon = 1e-10
    # If a valid min_error was found, calculate alpha
    if min_error < float("inf"):
      clf.alpha = 0.5 * np.log((1.0 - min_error + epsilon) / (min_error + epsilon))
    else:
      raise ValueError("no threshold for error below infinity.")


    predictions = clf.predict(X)

    w *= np.exp(-clf.alpha * y * predictions)
    # Normalize weights
    w /= np.sum(w)
    self.clfs.append(clf)

  def predict(self, X):
      #Gives combined weighted classifier's predicitons
      clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
      y_pred = np.sum(clf_preds, axis=0)
      y_pred = np.sign(y_pred)

      return y_pred





In [36]:
from sklearn.model_selection import KFold
def cross_validate(model, X, y, k=5):
    # Initialize KFold with k splits
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    accuracies = []

    # Perform k-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Initialize and train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate accuracy and append to the list
        acc = accuracy(y_test, y_pred)
        accuracies.append(acc)
        print(f"Fold Accuracy: {acc}")

    # Calculate the average accuracy over all folds
    avg_accuracy = np.mean(accuracies)
    print(f"Average Cross-Validation Accuracy: {avg_accuracy}")
    return avg_accuracy

In [37]:
# Testing
if __name__ == "__main__":
    # Imports
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

    data = datasets.load_breast_cancer()
    X, y = data.data, data.target

    y[y == 0] = -1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=5
    )

    # Adaboost classification with 5 weak classifiers
    clf = AdaBoost(n_clf=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy(y_test, y_pred)
    print("Accuracy:", acc)

    cross_validate(clf, X, y, k=5)

Accuracy: 0.9122807017543859
Fold Accuracy: 0.8947368421052632
Fold Accuracy: 0.9035087719298246
Fold Accuracy: 0.8596491228070176
Fold Accuracy: 0.9210526315789473
Fold Accuracy: 0.8584070796460177
Average Cross-Validation Accuracy: 0.8874708896134141


In [31]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

clf_sk = AdaBoostClassifier(n_estimators=20)
clf_sk.fit(X_train, y_train)
y_pred_sk = clf_sk.predict(X_test)
print("Sklearn AdaBoost Accuracy:", accuracy_score(y_test, y_pred_sk))
cross_validate(clf, X, y, k=5)



Sklearn AdaBoost Accuracy: 0.9649122807017544
Fold Accuracy: 0.11403508771929824
Fold Accuracy: 0.09649122807017543
Fold Accuracy: 0.14035087719298245
Fold Accuracy: 0.07894736842105263
Fold Accuracy: 0.13274336283185842
Average Cross-Validation Accuracy: 0.11251358484707344


0.11251358484707344