1. Ensemble methods
    - Bagging
    - Boosting
    - Random Forests
2. Hyperparameter Tuning
3. Final System

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
# Read the prepared feature table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="output.csv")
df.head(n=5)

Unnamed: 0,systolic,eyesight(left),hearing(right),ALT,relaxation,Cholesterol,AST,hearing(left),smoking,serum creatinine,Gtp,serum creatinine^2,Gtp^2
0,0.981702,-1.257856,1,-0.087326,1.125777,-0.837985,-0.37157,1,1,0.597927,-0.295342,0.357517,0.087227
1,1.845852,-1.009169,2,-0.199983,0.681066,-0.063252,0.1567,2,0,1.155511,0.025124,1.335205,0.000631
2,-0.353802,-1.506543,1,0.250645,-0.208355,-0.626695,0.1567,1,1,-0.517239,0.53787,0.267536,0.289304
3,0.667465,1.229017,1,0.025331,1.236955,-0.556264,-0.582878,1,0,0.597927,-0.199202,0.357517,0.039681
4,-0.118125,1.229017,1,-0.763267,-0.097177,-1.436643,-0.688532,1,1,-0.517239,-0.615808,0.267536,0.379219


In [6]:
# Separate the target column ('smoking') from the remaining feature columns.
y = df.loc[:, 'smoking']
X = df.drop(columns=['smoking'])

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(127404, 12)
(31852, 12)


# **1. Bagging**

# **2. Boosting**

Resources: [Boosting](https://randomrealizations.com/posts/gradient-boosting-multi-class-classification-from-scratch/)

## Hyperparameters

In [10]:
# Settings for the from-scratch gradient boosting model.
(n_estimators, alpha, max_depth) = (
    10,    # number of boosting iterations
    0.01,  # learning rate
    3,     # maximum tree depth
)

In [11]:
class Boosting():
    '''Gradient Boosting Classifier from Scratch.

    In every boosting round one regression tree per class is fitted to the
    negative gradient of the softmax cross-entropy loss (``y_ohe - p``). Each
    leaf value is then replaced by the ratio
    ``sum(negative gradients) / sum(hessians)`` of the samples in that leaf.

    Parameters
    ----------
    n_estimators : int
        number of boosting rounds

    learning_rate : float
        learning rate hyperparameter (multiplies every tree's prediction)

    max_depth : int
        maximum tree depth

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray of size (number classes,)
        sorted unique labels found in ``y``; ``predict`` returns these values

    n_classes : int
        number of distinct labels

    boosters : list of list of DecisionTreeRegressor
        ``boosters[m][k]`` is the tree for class ``k`` in round ``m``
    '''

    def __init__(self, n_estimators, learning_rate=0.1, max_depth=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth

    def fit(self, X, y):
        '''Fit the GBM

        Parameters
        ----------
        X : ndarray of size (number observations, number features)
            design matrix

        y : array-like of size (number observations,)
            target labels; any sortable label values are accepted, they are
            mapped internally to column indices 0..k-1 (see ``classes_``)
        '''

        y_ohe = self._one_hot_encode_labels(y)
        # Derived from the encoding so it always matches the number of columns.
        self.n_classes = y_ohe.shape[1]

        raw_predictions = np.zeros(shape=y_ohe.shape)
        probabilities = self._softmax(raw_predictions)
        self.boosters = []
        for _ in range(self.n_estimators):
            class_trees = []
            for k in range(self.n_classes):
                negative_gradients = self._negative_gradients(y_ohe[:, k], probabilities[:, k])
                hessians = self._hessians(probabilities[:, k])
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X, negative_gradients)
                self._update_terminal_nodes(tree, X, negative_gradients, hessians)
                raw_predictions[:, k] += self.learning_rate * tree.predict(X)
                # Probabilities are refreshed after every class tree, so the
                # next class in the same round already sees this update.
                probabilities = self._softmax(raw_predictions)
                class_trees.append(tree)
            self.boosters.append(class_trees)

    def _one_hot_encode_labels(self, y):
        '''One-hot encode ``y`` and record the label order in ``self.classes_``.

        Accepts lists, ndarrays and pandas Series. Column ``j`` of the returned
        float array corresponds to ``self.classes_[j]`` (labels in sorted order).
        '''
        labels = np.asarray(y).ravel()
        self.classes_, class_index = np.unique(labels, return_inverse=True)
        y_ohe = np.zeros((labels.shape[0], self.classes_.shape[0]))
        y_ohe[np.arange(labels.shape[0]), class_index] = 1.0
        return y_ohe

    def _negative_gradients(self, y_ohe, probabilities):
        '''Negative gradient of the loss w.r.t. the raw score: ``y - p``.'''
        return y_ohe - probabilities

    def _hessians(self, probabilities):
        '''Second derivative of the loss w.r.t. the raw score: ``p * (1 - p)``.'''
        return probabilities * (1 - probabilities)

    def _softmax(self, raw_predictions):
        '''Row-wise softmax of a (number observations, number classes) array.

        The row maximum is subtracted before exponentiating; this leaves the
        result unchanged mathematically but prevents ``exp`` from overflowing
        to ``inf`` (and the ratio from becoming ``nan``) for large scores.
        '''
        shifted = raw_predictions - np.max(raw_predictions, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def _update_terminal_nodes(self, tree, X, negative_gradients, hessians):
        '''Update the terminal node predicted values

        Every leaf value of ``tree`` is overwritten in place with
        ``sum(negative gradients) / sum(hessians)`` over the samples of ``X``
        that fall into that leaf. When the hessian sum is negligible (the
        probabilities in the leaf are saturated at 0 or 1) the leaf is set to
        0.0 instead of dividing by zero.
        '''
        # terminal node id's (a node without a left child is a leaf)
        leaf_nodes = np.nonzero(tree.tree_.children_left == -1)[0]
        # compute leaf for each sample in ``X``.
        leaf_node_for_each_sample = tree.apply(X)
        for leaf in leaf_nodes:
            samples_in_this_leaf = np.where(leaf_node_for_each_sample == leaf)[0]
            gradient_sum = np.sum(negative_gradients.take(samples_in_this_leaf, axis=0))
            hessian_sum = np.sum(hessians.take(samples_in_this_leaf, axis=0))
            if abs(hessian_sum) < 1e-150:
                # A zero denominator would yield inf/nan and poison every
                # later prediction; a zero step leaves the scores untouched.
                val = 0.0
            else:
                val = gradient_sum / hessian_sum
            tree.tree_.value[leaf, 0, 0] = val

    def predict_proba(self, X):
        '''Generate probability predictions for the given input data.

        Returns an array of size (number observations, number classes) whose
        column ``j`` is the probability of ``self.classes_[j]``.
        '''
        raw_predictions = np.zeros(shape=(X.shape[0], self.n_classes))
        for class_trees in self.boosters:
            for k, tree in enumerate(class_trees):
                raw_predictions[:, k] += self.learning_rate * tree.predict(X)
        return self._softmax(raw_predictions)

    def predict(self, X):
        '''Generate predicted labels (as 1-d array)

        The most probable column is mapped back through ``self.classes_`` so
        the original label values are returned; for labels already encoded as
        0..k-1 this equals the column index.
        '''
        most_probable = np.argmax(self.predict_proba(X), axis=1)
        return self.classes_[most_probable]

## Test

In [13]:
# Train the from-scratch booster with the configured hyperparameters,
# then score its label predictions on the held-out split.
model = Boosting(
    n_estimators=n_estimators,
    learning_rate=alpha,
    max_depth=max_depth,
)
model.fit(X_train, y_train)
accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(f"Boosting Accuracy: {(accuracy*100):.4f}%")

Boosting Accuracy: 69.8010%


# **3. Random Forests**

Resources: [Random Forest](https://youtu.be/ErkHne_Mf7g?si=v6MwL9ku8Hh4SSNM)