In [10]:
import pandas as pd
import numpy as np

# Compute error rate, alpha and w
def compute_error(y, y_pred, w_i):
    '''
    Calculate the error rate of a weak classifier m. Arguments:
    y: actual target value
    y_pred: predicted value by weak classifier
    w_i: individual weights for each observation
    
    Note that all arrays should be the same length
    '''
    return (sum(w_i * (np.not_equal(y, y_pred)).astype(int)))/sum(w_i)

def compute_alpha(error):
    '''
    Calculate the weight of a weak classifier m in the majority vote of the final classifier. This is called
    alpha in chapter 10.1 of The Elements of Statistical Learning. Arguments:
    error: error rate from weak classifier m
    '''
    return np.log((1 - error) / error)

def update_weights(w_i, alpha, y, y_pred):
    ''' 
    Update individual weights w_i after a boosting iteration. Arguments:
    w_i: individual weights for each observation
    y: actual target value
    y_pred: predicted value by weak classifier  
    alpha: weight of weak classifier used to estimate y_pred
    '''  
    return w_i * np.exp(alpha * (np.not_equal(y, y_pred)).astype(int))

In [11]:
import sys
sys.path.append("../Decision Tree")
from TreeNode import TreeNode
from DecisionTree import ID3

In [27]:
class DecisionTreeClassifier:

    def __init__(self, max_depth=1):
        self.max_depth = max_depth
        self.feature_index = None
        self.threshold = None
        self.left_label = None
        self.right_label = None

    def fit(self, X, y, sample_weight=None):
        best_gini = float('inf')
        n_samples, n_features = X.shape
        if sample_weight is None:
            sample_weight = np.ones(n_samples) / n_samples

        for feature_i in range(n_features):
            thresholds, classes = zip(*sorted(zip(X[:, feature_i], y)))

            left_weighted_sum = 0
            right_weighted_sum = sum(sample_weight)

            left_count = 0
            right_count = n_samples

            for i in range(1, n_samples):
                weight, c = sample_weight[i - 1], classes[i - 1]
                left_weighted_sum += weight * c
                right_weighted_sum -= weight * c

                left_count += 1
                right_count -= 1

                gini_left = 1.0 - sum([(left_weighted_sum / left_count) ** 2])
                gini_right = 1.0 - sum([(right_weighted_sum / right_count) ** 2])

                weighted_gini = (left_count * gini_left + right_count * gini_right) / n_samples

                if thresholds[i] == thresholds[i - 1]:
                    continue

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    self.feature_index = feature_i
                    self.threshold = (thresholds[i] + thresholds[i - 1]) / 2.0
                    self.left_label = 1 if left_weighted_sum > (left_count - left_weighted_sum) else 0
                    self.right_label = 1 if right_weighted_sum > (right_count - right_weighted_sum) else 0

    def predict(self, X):
        return np.where(X[:, self.feature_index] <= self.threshold, self.left_label, self.right_label)

In [45]:
# Define AdaBoost class
class AdaBoost:
    
    def __init__(self):
        self.alphas = []
        self.G_M = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []

    def fit(self, X, y, M = 100):
        '''
        Fit model. Arguments:
        X: independent variables - array-like matrix
        y: target variable - array-like vector
        M: number of boosting rounds. Default is 100 - integer
        '''
        
        # Clear before calling
        self.alphas = [] 
        self.training_errors = []
        self.M = M

        # Iterate over M weak classifiers
        for m in range(0, M):
            
            # Set weights for current boosting iteration
            if m == 0:
                w_i = np.ones(len(y)) * 1 / len(y)  # At m = 0, weights are all the same and equal to 1 / N
            else:
                # (d) Update w_i
                w_i = update_weights(w_i, alpha_m, y, y_pred)
            
            # (a) Fit weak classifier and predict labels
            G_m = DecisionTreeClassifier(max_depth = 1)     # Stump: Two terminal-node classification tree
            G_m.fit(X, y, sample_weight = w_i)
            y_pred = G_m.predict(X)
            
            self.G_M.append(G_m) # Save to list of weak classifiers

            # (b) Compute error
            error_m = compute_error(y, y_pred, w_i)
            self.training_errors.append(error_m)

            # (c) Compute alpha
            alpha_m = compute_alpha(error_m)
            self.alphas.append(alpha_m)

        assert len(self.G_M) == len(self.alphas)

In [46]:
def predict(self, X):
    '''
    Predict using fitted model. Arguments:
    X: independent variables - array-like
    '''

    # Initialise dataframe with weak predictions for each observation
    weak_preds = pd.DataFrame(index = range(len(X)), columns = range(self.M)) 

    # Predict class label for each weak classifier, weighted by alpha_m
    for m in range(self.M):
        y_pred_m = self.G_M[m].predict(X) * self.alphas[m]
        weak_preds.iloc[:,m] = y_pred_m

    # Calculate final predictions
    y_pred = (1 * np.sign(weak_preds.T.sum())).astype(int)

    return y_pred

In [1]:
def preprocess_data(df):
    # Convert continuous attributes to binary
    for column in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']:
        median = df[column].median()
        df[column] = df[column].apply(lambda x: 1 if x > median else 0)
    
    # Note: For columns with "unknown", we'll leave them as is. Pandas will treat them as a separate category.
    
    return df

# Load the training and test data
test_file_path = "Data/bank-4/test.csv"
train_file_path = "Data/bank-4/train.csv"
column_names = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df_bank_train = pd.read_csv(train_file_path, names=column_names)
df_bank_test = pd.read_csv(test_file_path, names=column_names)
bank_attributes = df_bank_train.columns.tolist()[:-1]

# Apply preprocessing to train and test datasets
train_data = preprocess_data(df_bank_train)
test_data = preprocess_data(df_bank_test)
attributes = bank_attributes
# Preview the preprocessed train data
print(train_data.head())

NameError: name 'pd' is not defined

In [None]:
# Load the training and test data
test_file_path = "Data/bank-4/test.csv"
train_file_path = "Data/bank-4/train.csv"
column_names = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df_bank_train = pd.read_csv(train_file_path, names=column_names)
df_bank_test = pd.read_csv(test_file_path, names=column_names)

# Apply preprocessing to train and test datasets
train_data = preprocess_data(df_bank_train)
test_data = preprocess_data(df_bank_test)

# Separate features and labels for train and test data
X_train = train_data.drop('y', axis=1)
y_train = train_data['y']
X_test = test_data.drop('y', axis=1)
y_test = test_data['y']

# Initialize AdaBoost
ab = AdaBoost()

# Fit the AdaBoost model with training data
ab.fit(X_train, y_train, M=100)

# (Optional) Predict on the test data and evaluate the performance
# predictions = ...
# performance = ...

# Preview the preprocessed train data
print(train_data.head())


InvalidIndexError: (slice(None, None, None), 0)

In [44]:
# Fit model
ab = AdaBoost()


X = train_data.iloc[:,:-1]
y = train_data.iloc[:,-1]

ab.fit(X, y, M = 400)

# Predict on test set
y_pred = ab.predict(test_data.iloc[:,-1])

AttributeError: 'numpy.ndarray' object has no attribute 'columns'