Lab 3
1. Ensemble methods
    - Bagging
    - Boosting
    - Random Forests
2. Hyperparameter Tuning
3. Final System

In [25]:
from random import seed
from random import randrange
from math import sqrt
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [26]:
df = pd.read_csv("output.csv")
df.head()

Unnamed: 0,systolic,eyesight(left),hearing(right),ALT,relaxation,Cholesterol,AST,hearing(left),smoking,serum creatinine,Gtp,serum creatinine^2,Gtp^2
0,0.981702,-1.257856,1,-0.087326,1.125777,-0.837985,-0.37157,1,1,0.597927,-0.295342,0.357517,0.087227
1,1.845852,-1.009169,2,-0.199983,0.681066,-0.063252,0.1567,2,0,1.155511,0.025124,1.335205,0.000631
2,-0.353802,-1.506543,1,0.250645,-0.208355,-0.626695,0.1567,1,1,-0.517239,0.53787,0.267536,0.289304
3,0.667465,1.229017,1,0.025331,1.236955,-0.556264,-0.582878,1,0,0.597927,-0.199202,0.357517,0.039681
4,-0.118125,1.229017,1,-0.763267,-0.097177,-1.436643,-0.688532,1,1,-0.517239,-0.615808,0.267536,0.379219


In [27]:
X = df.drop('smoking', axis=1)
y = df['smoking']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

(127404, 12)
(31852, 12)


# **1. Bagging (Bootstrap Aggregating)**

Resources: [Bagging](https://insidelearningmachines.com/build-a-bagging-classifier-in-python/)

Bootstrapping is a statistical method to create sample data without leaving the properties of the actual dataset. The individual samples of data called bootstrap samples.

## Steps:

1. Produce N bootstrap samples on the training data
2. Loop through each of the i = 1 -> N bootstrap samples:
  - Fit a model to sample i
  - Produce the desired predictions with this model.
  - Repeat the above two steps, storing the trained models and predictions
4. Aggregate the predictions. In the event of having a labelled test set, compare these results with the test dataset labels
5. If the results are good, we can deploy our trained ensemble. Input data is provided to the ensemble, and each constituent model produces predictions. These predictions are aggregated to yield a final result.

## Hyperparameter

In [29]:
n_estimators = 10  # Number of bagging iterations

In [30]:
class Bagging():
    '''Bagging Classifier from Scratch.

    Parameters
    ----------
    n_estimators : int
        number of bagging iterations
    '''

    def __init__(self, n_estimators):
        self.n_elements = n_estimators
        self.models = []

    # Private function to make bootstrap samples
    def __make_bootstraps(self, data):
        # Initialize output dictionary & unique value count
        dc = {}
        unip = 0
        # Get sample size
        b_size = data.shape[0]
        # Get list of row indexes
        idx = [i for i in range(b_size)]
        # Loop through the required number of bootstraps
        for b in range(self.n_elements):
            # Obtain bootstrap samples with replacement
            sidx = np.random.choice(idx, replace=True, size=b_size)
            b_samp = data[sidx, :]
            # Compute number of unique values contained in the bootstrap sample
            unip += len(set(sidx))
            # Obtain out-of-bag samples for the current bootstrap
            oidx = list(set(idx) - set(sidx))
            o_samp = np.array([])
            if oidx:
                o_samp = data[oidx, :]
            # Store results
            dc['boot_' + str(b)] = {'boot': b_samp, 'test': o_samp}
        # Return the bootstrap results
        return dc

    # Train the ensemble
    def fit(self, X_train, y_train, print_metrics=False):
      # Convert y_train to a NumPy array
      training_data = np.concatenate((X_train, np.array(y_train).reshape(-1, 1)), axis=1)
      # Make bootstrap samples
      dcBoot = self.__make_bootstraps(training_data)
      # Initialize metric arrays
      accs = np.array([])
      pres = np.array([])
      recs = np.array([])
      # Iterate through each bootstrap sample & fit a model
      cls = DecisionTreeClassifier(class_weight='balanced')
      for b in dcBoot:
          # Make a clone of the model
          model = clone(cls)
          # Fit a decision tree classifier to the current sample
          model.fit(dcBoot[b]['boot'][:, :-1], dcBoot[b]['boot'][:, -1].reshape(-1, 1))
          # Append the fitted model
          self.models.append(model)
          # Compute the predictions on the out-of-bag test set & compute metrics
          if dcBoot[b]['test'].size:
              yp = model.predict(dcBoot[b]['test'][:, :-1])
              acc = accuracy_score(dcBoot[b]['test'][:, -1], yp)
              pre = precision_score(dcBoot[b]['test'][:, -1], yp)
              rec = recall_score(dcBoot[b]['test'][:, -1], yp)
              # Store the error metrics
              accs = np.concatenate((accs, [acc]))
              pres = np.concatenate((pres, [pre]))
              recs = np.concatenate((recs, [rec]))

    # Predict from the ensemble
    def predict(self, X):
        # Check we've fit the ensemble
        if not self.models:
            print('You must train the ensemble before making predictions!')
            return None
        # Loop through each fitted model
        predictions = []
        for m in self.models:
            # Make predictions on the input X
            yp = m.predict(X)
            # Append predictions to storage list
            predictions.append(yp.reshape(-1, 1))
        # Compute the ensemble prediction
        ypred = np.round(np.mean(np.concatenate(predictions, axis=1), axis=1)).astype(int)
        # Return the prediction
        return ypred

## Test

In [31]:
# Convert X_train and X_test to NumPy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)

# Train the Bagging model
bagging = Bagging(n_estimators=10)
bagging.fit(X_train, y_train)

# Make predictions on the test set
y_pred = bagging.predict(X_test)

# Compute accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Bagging Accuracy: {(accuracy*100):.4f}%")
print(f"Bagging Precision: {(precision*100):.4f}%")
print(f"Bagging Recall: {(recall*100):.4f}%")

Bagging Accuracy: 68.7084%
Bagging Precision: 68.5250%
Bagging Recall: 68.7084%


# **2. Boosting**

Resources: [Boosting](https://randomrealizations.com/posts/gradient-boosting-multi-class-classification-from-scratch/)

## Hyperparameters

In [32]:
n_estimators = 10  # Number of boosting iterations
alpha = 0.01  # learning rate
max_depth = 3 # maximum tree depth

In [40]:
class Boosting():
    '''Gradient Boosting Classifier from Scratch.

    Parameters
    ----------
    n_estimators : int
        number of boosting iterations

    learning_rate : float
        learning rate hyperparameter

    max_depth : int
        maximum tree depth
    '''

    def __init__(self, n_estimators, learning_rate, max_depth):
        self.n_estimators=n_estimators;
        self.learning_rate=learning_rate
        self.max_depth=max_depth;

    def fit(self, X, y):
        '''Fit the GBM

        Parameters
        ----------
        X : ndarray of size (number observations, number features)
            design matrix

        y : ndarray of size (number observations,)
            integer-encoded target labels in {0,1,...,k-1}
        '''

        # Flatten y if it is 2D
        if len(y.shape) > 1:
            y = y.ravel()  # Flatten to 1D

        self.n_classes = pd.Series(y).nunique()
        y_ohe = self._one_hot_encode_labels(y)

        raw_predictions = np.zeros(shape=y_ohe.shape)
        probabilities = self._softmax(raw_predictions)
        self.boosters = []
        for m in range(self.n_estimators):
            class_trees = []
            for k in range(self.n_classes):
                negative_gradients = self._negative_gradients(y_ohe[:, k], probabilities[:, k])
                hessians = self._hessians(probabilities[:, k])
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X, negative_gradients)
                self._update_terminal_nodes(tree, X, negative_gradients, hessians)
                raw_predictions[:, k] += self.learning_rate * tree.predict(X)
                probabilities = self._softmax(raw_predictions)
                class_trees.append(tree)
            self.boosters.append(class_trees)


    def _one_hot_encode_labels(self, y):
        if isinstance(y, pd.Series): y = y.values
        ohe = OneHotEncoder()
        y_ohe = ohe.fit_transform(y.reshape(-1, 1)).toarray()
        return y_ohe

    def _negative_gradients(self, y_ohe, probabilities):
        return y_ohe - probabilities

    def _hessians(self, probabilities):
        return probabilities * (1 - probabilities)

    def _softmax(self, raw_predictions):
        numerator = np.exp(raw_predictions)
        denominator = np.sum(np.exp(raw_predictions), axis=1).reshape(-1, 1)
        return numerator / denominator

    def _update_terminal_nodes(self, tree, X, negative_gradients, hessians):
        '''Update the terminal node predicted values'''
        # terminal node id's
        leaf_nodes = np.nonzero(tree.tree_.children_left == -1)[0]
        # compute leaf for each sample in ``X``.
        leaf_node_for_each_sample = tree.apply(X)
        for leaf in leaf_nodes:
            samples_in_this_leaf = np.where(leaf_node_for_each_sample == leaf)[0]
            negative_gradients_in_leaf = negative_gradients.take(samples_in_this_leaf, axis=0)
            hessians_in_leaf = hessians.take(samples_in_this_leaf, axis=0)
            val = np.sum(negative_gradients_in_leaf) / np.sum(hessians_in_leaf)
            tree.tree_.value[leaf, 0, 0] = val

    def predict_proba(self, X):
        '''Generate probability predictions for the given input data.'''
        raw_predictions =  np.zeros(shape=(X.shape[0], self.n_classes))
        for k in range(self.n_classes):
            for booster in self.boosters:
                raw_predictions[:, k] +=self.learning_rate * booster[k].predict(X)
        probabilities = self._softmax(raw_predictions)
        return probabilities

    def predict(self, X):
        '''Generate predicted labels (as 1-d array)'''
        probabilities = self.predict_proba(X)
        return np.argmax(probabilities, axis=1)

## Test

In [41]:
boosting = Boosting(n_estimators, alpha, max_depth)
boosting.fit(X_train, y_train)
y_pred = boosting.predict(X_test)

# Compute accuracy, precision, and recall on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Boosting Accuracy: {(accuracy*100):.4f}%")
print(f"Boosting Precision: {(precision*100):.4f}%")
print(f"Boosting Recall: {(recall*100):.4f}%")

Boosting Accuracy: 69.8010%
Boosting Precision: 69.7791%
Boosting Recall: 69.8010%


## **3. Random Forests**

Resources: [Random Forest](https://insidelearningmachines.com/build-a-random-forest-in-python/)

In [35]:
n_trees = 10 # Number of trees in the forest
max_depth = 3 # Maximum depth of the tree
min_samples_split = 2 # The minimum number of samples required to split an internal node

In [36]:
class RandomForest():
    from sklearn.tree import DecisionTreeClassifier

class RandomForest():
    def __init__(self, n_trees, max_depth, min_samples_split, criterion='gini', balance_class_weights=False):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion  # Update the parameter name to 'criterion'
        self.balance_class_weights = balance_class_weights
        self.trees = []

    # protected function to obtain the right decision tree
    def _make_tree_model(self):
        return DecisionTreeClassifier(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion, class_weight='balanced' if self.balance_class_weights else None)

    # private function to make bootstrap samples
    def __make_bootstraps(self,data):
        # initialize output dictionary & unique value count
        dc = {}
        unip = 0
        # get sample size
        b_size = data.shape[0]
        # get list of row indexes
        idx = [i for i in range(b_size)]
        # loop through the required number of bootstraps
        for b in range(self.n_trees):
            # obtain boostrap samples with replacement
            sidx = np.random.choice(idx,replace=True,size=b_size)
            b_samp = data[sidx,:]
            #compute number of unique values contained in the bootstrap sample
            unip += len(set(sidx))
            # obtain out-of-bag samples for the current b
            oidx = list(set(idx) - set(sidx))
            o_samp = np.array([])
            if oidx:
                o_samp = data[oidx,:]
            #store results
            dc['boot_'+str(b)] = {'boot':b_samp,'test':o_samp}
        #return the bootstrap results
        return(dc)

    # protected function to train the ensemble
    def _train(self,X_train,y_train):
        #package the input data
        training_data = np.concatenate((X_train,y_train.reshape(-1,1)),axis=1)
        #make bootstrap samples
        dcBoot = self.__make_bootstraps(training_data)
        #iterate through each bootstrap sample & fit a model ##
        tree_m = self._make_tree_model()
        dcOob = {}
        for b in dcBoot:
            # make a clone of the model
            model = clone(tree_m)
            # fit a decision tree model to the current sample
            model.fit(dcBoot[b]['boot'][:,:-1],dcBoot[b]['boot'][:,-1].reshape(-1, 1))
            # append the fitted model
            self.trees.append(model)
            # store the out-of-bag test set for the current bootstrap
            if dcBoot[b]['test'].size:
                dcOob[b] = dcBoot[b]['test']
            else:
                dcOob[b] = np.array([])
        #return the oob data set
        return(dcOob)

    # train the ensemble
    def fit(self, X_train, y_train,print_metrics=False):
        # call the protected training method
        dcOob = self._train(X_train,y_train)
        # if selected, compute the standard errors and print them
        if print_metrics:
            # initialise metric arrays
            accs = np.array([])
            pres = np.array([])
            recs = np.array([])
            # loop through each bootstrap sample
            for b,m in zip(dcOob,self.trees):
                # compute the predictions on the out-of-bag test set & compute metrics
                if dcOob[b].size:
                    yp  = m.predict(dcOob[b][:,:-1])
                    acc = accuracy_score(dcOob[b][:,-1],yp)
                    pre = precision_score(dcOob[b][:,-1],yp,average='weighted')
                    rec = recall_score(dcOob[b][:,-1],yp,average='weighted')

    #protected function to predict from the ensemble
    def _predict(self,X):
        #check we've fit the ensemble
        if not self.trees:
            print('You must train the ensemble before making predictions!')
            return(None)
        #loop through each fitted model
        predictions = []
        for m in self.trees:
            #make predictions on the input X
            yp = m.predict(X)
            #append predictions to storage list
            predictions.append(yp.reshape(-1,1))
        #compute the ensemble prediction
        ypred = np.mean(np.concatenate(predictions,axis=1),axis=1)
        #return the prediction
        return(ypred)

    # predict from the ensemble
    def predict(self,X):
        # call the protected prediction method
        ypred = self._predict(X)
        # convert the results into integer values & return
        return(np.round(ypred).astype(int))

## Test

In [37]:
# Convert X_train and X_test to NumPy arrays
# X_train = np.array(X_train)
# X_test = np.array(X_test)
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

random_forest = RandomForest(n_trees ,max_depth, min_samples_split)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Compute accuracy, precision, and recall on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print results
print(f"Random Forest Accuracy: {(accuracy*100):.4f}%")
print(f"Random Forest Precision: {(precision*100):.4f}%")
print(f"Random Forest Recall: {(recall*100):.4f}%")

Random Forest Accuracy: 69.8010%
Random Forest Precision: 69.7791%
Random Forest Recall: 69.8010%


## **4.Hyperparameter Tuning**

Bagging->n_estimators

In [19]:
# Define the range of n_estimators to test
n_estimators_range = range(1, 30)  # From 1 to 20 estimators
accuracies = []

# Test different values of n_estimators
for n_estimators in n_estimators_range:
    # Initialize the Bagging model
    bagging = Bagging(n_estimators=n_estimators)
    
    # Train the model
    bagging.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = bagging.predict(X_test)
    
    # Compute and store accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    print(f"n_estimators: {n_estimators}, Accuracy: {acc:.4f}")




n_estimators: 1, Accuracy: 0.6393
n_estimators: 2, Accuracy: 0.6312
n_estimators: 3, Accuracy: 0.6616
n_estimators: 4, Accuracy: 0.6557
n_estimators: 5, Accuracy: 0.6731
n_estimators: 6, Accuracy: 0.6665
n_estimators: 7, Accuracy: 0.6798
n_estimators: 8, Accuracy: 0.6750
n_estimators: 9, Accuracy: 0.6838
n_estimators: 10, Accuracy: 0.6833
n_estimators: 11, Accuracy: 0.6872
n_estimators: 12, Accuracy: 0.6868
n_estimators: 13, Accuracy: 0.6944
n_estimators: 14, Accuracy: 0.6905
n_estimators: 15, Accuracy: 0.6964
n_estimators: 16, Accuracy: 0.6947
n_estimators: 17, Accuracy: 0.6935
n_estimators: 18, Accuracy: 0.6935
n_estimators: 19, Accuracy: 0.6997
n_estimators: 20, Accuracy: 0.6937
n_estimators: 21, Accuracy: 0.6982
n_estimators: 22, Accuracy: 0.6940
n_estimators: 23, Accuracy: 0.7000
n_estimators: 24, Accuracy: 0.6987
n_estimators: 25, Accuracy: 0.7005
n_estimators: 26, Accuracy: 0.6969
n_estimators: 27, Accuracy: 0.7018
n_estimators: 28, Accuracy: 0.6992
n_estimators: 29, Accuracy: 0

Boosting->n_estimators,alpha,max_depth

In [44]:


# Results storage
results = []

# Tune `n_estimators` (1 to 30) with fixed learning_rate and max_depth
print("Tuning n_estimators...")
for n_estimators in range(1, 31):  # Test values from 1 to 30
    boosting = Boosting(n_estimators=n_estimators, learning_rate=0.01, max_depth=3)
    boosting.fit(X_train, y_train)
    y_pred = boosting.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'hyperparameter': 'n_estimators', 'value': n_estimators, 'accuracy': accuracy})
    print(f"n_estimators: {n_estimators}, Accuracy: {accuracy:.4f}")

print("\nFinished tuning n_estimators.\n")

# Tune `learning_rate` (0.01, 0.05, 0.1, 1) with fixed n_estimators and max_depth
print("Tuning learning_rate...")
for learning_rate in [0.01, 0.05, 0.1, 1]:
    boosting = Boosting(n_estimators=10, learning_rate=learning_rate, max_depth=3)
    boosting.fit(X_train, y_train)
    y_pred = boosting.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'hyperparameter': 'learning_rate', 'value': learning_rate, 'accuracy': accuracy})
    print(f"learning_rate: {learning_rate}, Accuracy: {accuracy:.4f}")

print("\nFinished tuning learning_rate.\n")

# Tune `max_depth` (1 to 5) with fixed n_estimators and learning_rate
print("Tuning max_depth...")
for max_depth in range(1, 6):  # Test values from 1 to 5
    boosting = Boosting(n_estimators=10, learning_rate=0.01, max_depth=max_depth)
    boosting.fit(X_train, y_train)
    y_pred = boosting.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'hyperparameter': 'max_depth', 'value': max_depth, 'accuracy': accuracy})
    print(f"max_depth: {max_depth}, Accuracy: {accuracy:.4f}")

print("\nFinished tuning max_depth.\n")

# Display all results
print("Hyperparameter Tuning Results:")
for result in results:
    print(result)


Tuning n_estimators...
n_estimators: 1, Accuracy: 0.6980
n_estimators: 2, Accuracy: 0.6980
n_estimators: 3, Accuracy: 0.6980
n_estimators: 4, Accuracy: 0.6980
n_estimators: 5, Accuracy: 0.6980
n_estimators: 6, Accuracy: 0.6980
n_estimators: 7, Accuracy: 0.6980
n_estimators: 8, Accuracy: 0.6980
n_estimators: 9, Accuracy: 0.6980
n_estimators: 10, Accuracy: 0.6980
n_estimators: 11, Accuracy: 0.6980
n_estimators: 12, Accuracy: 0.6980
n_estimators: 13, Accuracy: 0.6980
n_estimators: 14, Accuracy: 0.6995
n_estimators: 15, Accuracy: 0.6995
n_estimators: 16, Accuracy: 0.6995
n_estimators: 17, Accuracy: 0.6995
n_estimators: 18, Accuracy: 0.6995
n_estimators: 19, Accuracy: 0.6995
n_estimators: 20, Accuracy: 0.6995
n_estimators: 21, Accuracy: 0.6995
n_estimators: 22, Accuracy: 0.6995
n_estimators: 23, Accuracy: 0.6995
n_estimators: 24, Accuracy: 0.6995
n_estimators: 25, Accuracy: 0.6995
n_estimators: 26, Accuracy: 0.6995
n_estimators: 27, Accuracy: 0.6995
n_estimators: 28, Accuracy: 0.6995
n_esti