# Adaboost implementation

Pseudocode
1) Initialization- a weight for each sample data point (same in the beginning)
2) Create a decision stump which maximizes information gain
3) Quantify the power of say for that decision stump
4) Calculate the new weights for each sample data point by using the power of say and erroneous predictions
5) Use new weights to go back to step 2 and create a decision stump by utilizing the weights of each data point when calculating the information gain as well as the power of say
6) After n iterations, when n different decision stumps are created, utilize the prediction power of each stump and reach an output.
    Starting from the first stump, multiple the power of say of that stump with its prediction output (binary class)
7) The final decision is based on the sign of the output of step 6 i.e if the sum is positive then its the positive class else negative class

In [1]:
# synthetic data sampling
import numpy as np
import pandas as pd
import math
from sklearn.datasets import make_classification
epsilon= 1e-10

# creating a decision stump

def calculate_impurity(node):
    # calculating probability of each class for impurity calculation
    weight_adjusted_values_pos= node[node['target']==0]['weights'].sum()
    weight_adjusted_values_neg= node[node['target']==1]['weights'].sum()
    # total weight in the node
    total_weight = node['weights'].sum()
    total_weight = max(total_weight, epsilon) 
    # calculating probability of each class based on weights
    probability_of_pos_class = weight_adjusted_values_pos / total_weight
    probability_of_neg_class = weight_adjusted_values_neg / total_weight
    

    # calculating the positive and negative entropy
    positive= -1 * ( probability_of_pos_class*math.log(probability_of_pos_class+ epsilon))
    negative= -1 * ( probability_of_neg_class*math.log(probability_of_neg_class+epsilon))
    return positive+negative

def stump_for_feature(dataframe, x):
    # calculate the impurity at the root node (before any split)
    impurity_at_root = calculate_impurity(dataframe)

    # get sorted values of feature x to create possible thresholds
    possible_values = sorted(set(dataframe[x]))
    # instead of taking actual points, we calculate midpoints between consecutive unique values for potential thresholds
    possible_thresholds = [(possible_values[i] + possible_values[i + 1]) / 2 for i in range(len(possible_values) - 1)]

    max_gain = -1 * math.inf
    max_threshold = None

    # Iterate over all possible thresholds to find out which value gives the most gain in information
    for threshold in possible_thresholds:
        # Split the data based on the threshold
        left_node = dataframe[dataframe[x] >= threshold]
        right_node = dataframe[dataframe[x] < threshold]

        # impurity for both resulting nodes
        a = calculate_impurity(left_node)
        b = calculate_impurity(right_node)

        # weighted impurity
        weighted_impurity = (left_node['weights'].sum() / dataframe['weights'].sum()) * a + (right_node['weights'].sum() / dataframe['weights'].sum()) * b


        information_gain = impurity_at_root - weighted_impurity

        # Check for max gain
        if information_gain > max_gain:
            max_gain = information_gain
            max_threshold = threshold

    return max_gain, max_threshold


def get_best_stump(data):
    stump_dict= {}
    for i in range(len(data.columns)):
        if data.columns[i]!="target":
            gain, threshold = stump_for_feature(data, data.columns[i])
            stump_dict[data.columns[i]]= (gain,threshold)
    max_feature, (max_gain, max_threshold) = max(stump_dict.items(), key=lambda item: item[1][0])
    return max_feature, max_gain, max_threshold


def calculate_error(data, feature, threshold):
    # misclassifications
    error=0
    indexes=[]
    data['new_target']= data[feature].apply(lambda x: 1 if x<= threshold else 0)
    
    data['classification_correctness'] = data.apply(lambda row: 0 if row['target'] != row['new_target'] else 1, axis=1)
    weighted_error = data.apply(lambda row: row['weights'] if row['target'] != row['new_target'] else 0, axis=1).sum()
    return weighted_error / data['weights'].sum(), data

    # return data['classification_correctness'].sum()/len(data), data

def calculate_amount_of_say(total_error):
    amount_of_say= 0.5*math.log((1/total_error+epsilon)-1)
    return amount_of_say

def weight_update(old_weight, say, sign):
    new_weight = old_weight * math.exp(sign * say)
    # print(f"Old weight: {old_weight}, Say: {say}, Sign: {sign}, New weight: {new_weight}")
    return new_weight

def reweight(power_of_say, data):
    data['weights']= data.apply(lambda x: weight_update(x['weights'], power_of_say, -1) if x['classification_correctness']==0 else weight_update(x['weights'], power_of_say, 1), axis=1)
    data['weights'] = data['weights'] / data['weights'].sum()
    return data

# Final prediction for each sample in the dataset
def predict(data, individual_learners):
    final_predictions = [0] * len(data)
    for feature, threshold, alpha in individual_learners:
        # Make predictions for current stump
        predictions = data[feature].apply(lambda x: 1 if x >= threshold else -1)
        # Add weighted prediction to final prediction
        final_predictions = [fp + alpha * pred for fp, pred in zip(final_predictions, predictions)]
    # The final prediction is based on the sign of the aggregated score
    return [1 if fp > 0 else 0 for fp in final_predictions]

        

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, load_iris, load_wine
from sklearn.preprocessing import StandardScaler
import pandas as pd
import math

# Function to test the implementation with a given dataset
def test_adaboost(data, target):
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=42)
    
    # Prepare training DataFrame
    train_df = pd.DataFrame(X_train, columns=[f'feature_{i+1}' for i in range(X_train.shape[1])])
    train_df['target'] = y_train
    train_df['weights'] = 1 / len(train_df)
    
    # AdaBoost Training Process
    individual_learners = []
    for i in range(10):
        max_feature, max_gain, max_threshold = get_best_stump(train_df)
        total_error, train_df = calculate_error(train_df, max_feature, max_threshold)
        amount_of_say = calculate_amount_of_say(total_error)
        train_df = reweight(amount_of_say, train_df)
        individual_learners.append((max_feature, max_threshold, amount_of_say))
        print(f'Iteration {i} done')
    
    # Prepare test DataFrame
    test_df = pd.DataFrame(X_test, columns=[f'feature_{i+1}' for i in range(X_test.shape[1])])
    test_df['target'] = y_test
    
    # Make predictions on the test set
    test_df['predictions'] = predict(test_df, individual_learners)
    
    # Measure accuracy on the test set
    accuracy = (test_df['predictions'] == test_df['target']).mean()
    print(f"Test Accuracy for this dataset: {accuracy * 100:.2f}%")

# Test with synthetic classification dataset
X, y = make_classification(n_samples=300, n_features=10, n_classes=2, random_state=42)
test_adaboost(X, y)


Iteration 0 done
Iteration 1 done
Iteration 2 done
Iteration 3 done
Iteration 4 done
Iteration 5 done
Iteration 6 done
Iteration 7 done
Iteration 8 done
Iteration 9 done
Test Accuracy for this dataset: 11.11%


In [None]:

# Test with Iris dataset (consider only binary classification)
iris = load_iris()
iris_data = iris.data[iris.target != 2]
iris_target = iris.target[iris.target != 2]
test_adaboost(iris_data, iris_target)

# Test with Wine dataset (use only binary classification for simplicity)
wine = load_wine()
wine_data = wine.data[wine.target != 2]
wine_target = wine.target[wine.target != 2]
test_adaboost(wine_data, wine_target)
