In [2]:
# Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Display elapsed time on Jupyter.
%reload_ext autotime

time: 75 µs (started: 2023-10-23 12:07:20 +10:30)


In [None]:
# Question 1

def Adaboost_train(train_data, train_label, T):
# train_data: N x d matrix
# train_label: N x 1 vector
# T: the number of weak classifiers in the ensemble
    ensemble_models = []
    for t in range(0,T):
        model_param_t = weak_classifier_train(train_data, train_label) # model_param_t returns the model parameters of the learned weak classifier
        # definition of model
        ensemble_models.append(model_param_t)
    return ensemble_models

#### Potential Error

1. No weight initialization: 

The AdaBoost algorithm first needs to initialize weights for each data point. These weights are used to train the weak classifier and are updated at each iteration. However, the code does not initialize or update these weights.

2. No Weighted Error Calculation: 

After training a weak classifier, AdaBoost calculates the weighted error for that classifier. This is the sum of the weights of the misclassified examples. This step is missing from the code.

3. No Alpha Calculation: 

AdaBoost calculates a weighted error alpha for each weak classifier, which is used to update the weights of the data points. alpha value also determines the weak classifier's contribution to the final set. This step is missing from the code.

4. No weight update: 

After calculating the alpha value, AdaBoost updates the weights of the data points. The weights of the correctly categorized examples are decreased, while the weights of the incorrectly categorized examples are increased. This step is missing from the code.

5. No final model output: AdaBoost merges weak classifiers into strong ones. The contribution of each weak classifier is determined by its alpha value. However, the code only returns the parameters of the weak classifiers and does not combine them into a final model.

#### weak_classifier_train:

- Input: 

The function should take in the training data, training labels, and the weights of the data points.

- Output: 

It should return the model parameters of the learned weak classifier and possibly other relevant information. It should includes feature, threshold and polarity.

In [None]:
## Try to correct Adaboost_train

def Adaboost_train(train_data, train_label, T):
    N, _ = train_data.shape
    weights = np.ones(N) / N  # Initialize weights
    ensemble_models = []
    alphas = []

    for t in range(T):
        model_param_t = weak_classifier_train(train_data, train_label, weights)
        predictions = weak_classifier_predict(train_data, model_param_t)
        
        # Calculate weighted error
        error = np.sum(weights[predictions != train_label])
        
        # Calculate alpha
        alpha = 0.5 * np.log((1 - error) / error)
        alphas.append(alpha)
        
        # Update weights
        weights[predictions == train_label] *= np.exp(-alpha)
        weights[predictions != train_label] *= np.exp(alpha)
        
        # Normalize weights
        weights /= np.sum(weights)
        
        ensemble_models.append(model_param_t)

    return ensemble_models, alphas


In [3]:
## Implement Discrete AdaBoost
import numpy as np

def weak_classifier_train(X, y, weights):
    """Train a decision stump (weak classifier) using weighted data."""
    n, d = X.shape
    best_error = float('inf')
    best_feature = None
    best_threshold = None
    best_polarity = 1

    for feature in range(d):
        thresholds = np.unique(X[:, feature])
        for threshold in thresholds:
            predictions = np.where(X[:, feature] < threshold, 1, -1)
            error = np.sum(weights[predictions != y])

            if error < best_error:
                best_error = error
                best_feature = feature
                best_threshold = threshold

            # Check the other polarity
            predictions = np.where(X[:, feature] > threshold, 1, -1)
            error = np.sum(weights[predictions != y])

            if error < best_error:
                best_error = error
                best_feature = feature
                best_threshold = threshold
                best_polarity = -1

    return best_feature, best_threshold, best_polarity

def weak_classifier_predict(X, feature, threshold, polarity):
    """Make predictions using a decision stump."""
    if polarity == 1:
        return np.where(X[:, feature] < threshold, 1, -1)
    else:
        return np.where(X[:, feature] > threshold, 1, -1)

def adaboost_train(X, y, T):
    n, _ = X.shape
    weights = np.full(n, 1/n)
    alphas = []
    classifiers = []

    for t in range(T):
        # Train a weak classifier
        feature, threshold, polarity = weak_classifier_train(X, y, weights)
        predictions = weak_classifier_predict(X, feature, threshold, polarity)

        # Calculate the error of the weak classifier
        error = np.sum(weights[predictions != y])

        # Compute alpha
        alpha = 0.5 * np.log((1 - error) / error)
        alphas.append(alpha)
        classifiers.append((feature, threshold, polarity))

        # Update weights
        weights[predictions == y] *= np.exp(-alpha)
        weights[predictions != y] *= np.exp(alpha)

        # Normalize weights
        weights /= np.sum(weights)

    return alphas, classifiers

def adaboost_predict(X, alphas, classifiers):
    """Predict using the ensemble of weak classifiers."""
    n, _ = X.shape
    predictions = np.zeros(n)

    for alpha, (feature, threshold, polarity) in zip(alphas, classifiers):
        predictions += alpha * weak_classifier_predict(X, feature, threshold, polarity)

    return np.sign(predictions)


time: 1.83 ms (started: 2023-10-23 12:07:26 +10:30)


In [9]:
## Verify the adaboost

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Load the Spambase dataset with parser set to 'auto'
data = fetch_openml('spambase', version=1, parser='auto')
X, y = data['data'], data['target']
y = np.where(y == '1', 1, -1)  # Convert labels to 1 and -1

# If the data is in pandas DataFrame format, convert it to numpy arrays
if hasattr(X, 'values'):
    X = X.values
if hasattr(y, 'values'):
    y = y.values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the custom AdaBoost implementation
alphas, classifiers = adaboost_train(X_train, y_train, T=50)
custom_predictions = adaboost_predict(X_test, alphas, classifiers)

# Train the sklearn AdaBoost implementation
clf = AdaBoostClassifier(n_estimators=50, random_state=42)
clf.fit(X_train, y_train)
sklearn_predictions = clf.predict(X_test)

# Compare the accuracy of both implementations
custom_accuracy = accuracy_score(y_test, custom_predictions)
sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)

print(f"Custom AdaBoost Accuracy: {custom_accuracy * 100:.2f}%")
print(f"Sklearn AdaBoost Accuracy: {sklearn_accuracy * 100:.2f}%")


Custom AdaBoost Accuracy: 94.68%
Sklearn AdaBoost Accuracy: 93.59%
time: 27.4 s (started: 2023-10-23 13:37:41 +10:30)
