[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Humboldt-WI/IPML/blob/master/tutorial_notebooks/t11_bagged_trees.ipynb)

# Bagged Trees

## Visualization

<div>
<img src="bagging.gif" width="500"/>
</div>

## Preliminaries

In [126]:
import pandas as pd
import numpy as np
import statistics as st

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [127]:
def bootstrapping(X, y, random_state=None):
    '''Draws a bootstrap sample (same size, with replacement) from features and targets.

    Args:
        X (pd.DataFrame or np.array): Matrix of features
        y (pd.Series or np.array): Vector of targets, aligned row by row with X
        random_state (int, optional): Seed for a private random number generator.
            If None (default), numpy's global random state is used, so
            `np.random.seed` controls reproducibility.

    Returns:
        tuple: Resampled features and resampled targets. Both are selected with
            the same row positions, so they stay aligned.
    '''
    # fall back to the global numpy generator unless a seed was requested
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_obs = len(X)
    # row positions drawn uniformly with replacement
    bootstrap_indices = rng.randint(low=0, high=n_obs, size=n_obs)

    def take_rows(data):
        # pandas objects need positional selection via .iloc;
        # everything else is treated as an array
        if hasattr(data, 'iloc'):
            return data.iloc[bootstrap_indices]
        return np.asarray(data)[bootstrap_indices]

    return take_rows(X), take_rows(y)

## Bagged Trees from Scratch

- Initialization: Prepare a list to hold individual decision trees and another for their predictions.
- Training:
    - For each tree to be created, generate a bootstrap sample from the original dataset.
    - Train a decision tree on this bootstrap sample.
    - Store the trained tree.
- Prediction:
    - For a given input, use each tree in the ensemble to make a prediction.
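    - Aggregate these predictions into a final prediction. The aggregation method depends on whether the task is regression (use the average of predictions) or classification (use majority voting, or, when a probability-like score is needed as in the implementation below, the share of trees that vote for the positive class).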

In [135]:
class bagged_trees():
    '''Bootstrap aggregated (bagged) decision tree ensemble for binary classification.'''

    def __init__(self, n_trees: int):
        '''Stores the ensemble size; no tree is trained at this point.

        Args:
            n_trees (int): number of trees in the ensemble
        '''
        self.n_trees = n_trees

    def fit(self, X, y):
        '''Grows `n_trees` decision trees, each on its own bootstrap sample.

        Args:
            X: Matrix of features, in a form that `bootstrapping` accepts
            y: Vector of targets, in a form that `bootstrapping` accepts

        Returns:
            bagged_trees: the fitted instance itself, which allows method chaining
        '''
        self.trees = []
        for _ in range(self.n_trees):
            # resample the training data with replacement
            sample_X, sample_y = bootstrapping(X, y)
            # grow a tree with default settings on the resampled data
            member = DecisionTreeClassifier()
            member.fit(sample_X, sample_y)
            self.trees.append(member)
        return self

    def predict_proba(self, X):
        '''Scores observations by averaging the label predictions of all trees.

        Args:
            X (pd.DataFrame or np.array): Matrix of features

        Returns:
            np.array: Per observation, the mean of the labels predicted by the
                individual trees. With 0/1 labels this is the share of trees
                that predict 1.

        Raises:
            AttributeError if function `fit` was not run beforehand
        '''
        # one vector of predicted labels per tree
        votes = [member.predict(X) for member in self.trees]
        # average across trees, i.e., along the first axis
        return np.mean(votes, axis=0)

### Run on data set

In [136]:
# Download the data from the web
url = 'https://raw.githubusercontent.com/Humboldt-WI/bads/master/data/hmeq.csv'
df = pd.read_csv(url)  # standard pandas function to load tabular data in CSV format

# Missing value handling
# The imputed column is assigned back to the data frame. Calling
# `df[col].fillna(..., inplace=True)` is chained assignment: recent pandas
# versions warn about it, and with copy-on-write it no longer changes `df`.
for col in df.columns:  # loop through all the columns (i.e., features)
    # decide on the imputation strategy based on the data type; checking for
    # numeric columns also covers text stored in a dedicated string dtype
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())  # median replacement for numeric features
    else:
        df[col] = df[col].fillna(df[col].mode()[0])  # mode replacement for categories

# Dummy coding of the (two) categorical variables
df = pd.get_dummies(data=df, drop_first=True)

# Create default variable names X, y for further analysis
# The target BAD is used for classification below
X = df.copy()
y = X.pop('BAD')

# Train-test split (30% of the data is held out; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [145]:
# Train the hand-made ensemble and score the test set in one chained call
predictions = (
    bagged_trees(n_trees=1000)
    .fit(X_train, y_train)
    .predict_proba(X_test)
)
# Evaluate the scores on the held-out data
print(f'AUC ROC:\t{roc_auc_score(y_test, predictions):.4f}')

AUC ROC:	0.9495


## Sklearn implementation

In [143]:
# Benchmarks from sklearn, all with the same seed.
# Only the second column of predict_proba is kept as the score.
tree_preds = (
    DecisionTreeClassifier(random_state=42)
    .fit(X_train, y_train)
    .predict_proba(X_test)[:, 1]
)
bagging_preds = (
    BaggingClassifier(n_estimators=1000, random_state=42)
    .fit(X_train, y_train)
    .predict_proba(X_test)[:, 1]
)
forest_preds = (
    RandomForestClassifier(n_estimators=1000, random_state=42)
    .fit(X_train, y_train)
    .predict_proba(X_test)[:, 1]
)

In [144]:
# Report the test set AUC of each benchmark model
print(
    f'Single Decision Tree AUC ROC:\t\t{roc_auc_score(y_test, tree_preds):.4f}'
)
print(
    f'Bagged Tree Ensemble AUC ROC:\t\t{roc_auc_score(y_test, bagging_preds):.4f}'
)
print(
    f'Random Forest Ensemble AUC ROC:\t\t{roc_auc_score(y_test, forest_preds):.4f}'
)

Single Decision Tree AUC ROC:		0.7949
Bagged Tree Ensemble AUC ROC:		0.9491
Random Forest Ensemble AUC ROC:		0.9676
