# Bagged Trees

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score

In [None]:
def bootstrapping(X, y):
    """Draw one bootstrap sample from a feature table and its targets.

    Row positions are sampled with replacement, as many times as ``X`` has
    rows, and the same positions are used to select from both ``X`` and
    ``y`` so that each sampled row keeps its own target value.

    Parameters
    ----------
    X : object supporting ``len()`` and positional ``.iloc`` indexing
        (e.g. a pandas DataFrame).
    y : object supporting positional ``.iloc`` indexing (e.g. a pandas
        Series). Assumed to have the same number of rows as ``X``.

    Returns
    -------
    tuple
        ``(X_bootstrapped, y_bootstrapped)``, each with ``len(X)`` rows.
    """
    n_rows = len(X)
    # With replacement: a row may be drawn several times or not at all.
    sampled_positions = np.random.randint(low=0, high=n_rows, size=n_rows)
    return X.iloc[sampled_positions], y.iloc[sampled_positions]

## Bagged Trees Pseudocode

- Initialization: Prepare a list to hold individual decision trees and another for their predictions.
- Training:
    - For each tree to be created, generate a bootstrap sample from the original dataset.
    - Train a decision tree on this bootstrap sample.
    - Store the trained tree.
- Prediction:
    - For a given input, use each tree in the ensemble to make a prediction.
    - Aggregate these predictions into a final prediction. The aggregation method depends on whether the task is regression (use the average of predictions) or classification (use majority voting).

In [None]:
def bagged_trees(X, y, n_trees, max_depth):
    """Train an ensemble of decision trees on bootstrap samples (bagging).

    For each of the ``n_trees`` ensemble members a fresh bootstrap sample is
    drawn from ``(X, y)`` and a ``DecisionTreeClassifier`` limited to
    ``max_depth`` is fitted on it.

    NOTE(review): the original cell was an empty placeholder, so the return
    value is a design choice. This version returns the fitted trees and
    leaves prediction and aggregation (e.g. majority voting or averaging
    predicted probabilities) to the caller.

    Parameters
    ----------
    X : training features, in a form accepted by ``bootstrapping``.
    y : training targets aligned with ``X``.
    n_trees : int
        Number of trees to train. Zero or a negative value yields an
        empty ensemble.
    max_depth : int or None
        Passed through to ``DecisionTreeClassifier(max_depth=...)``.

    Returns
    -------
    list
        The fitted trees, in training order.
    """
    trees = []
    for _ in range(n_trees):
        # Each tree sees its own resampled view of the training data.
        X_sample, y_sample = bootstrapping(X, y)
        tree = DecisionTreeClassifier(max_depth=max_depth)
        tree.fit(X_sample, y_sample)
        trees.append(tree)
    return trees

### Run on data set

In [None]:
# Download the data from the web
url = 'https://raw.githubusercontent.com/Humboldt-WI/bads/master/data/hmeq.csv'
df = pd.read_csv(url)  # standard pandas function to load tabular data in CSV format

# Missing value handling
# The imputed column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on df[col]: that chained in-place form acts on a
# temporary object, triggers a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is active.
for col in df.columns:  # loop through all the columns (i.e., features)
    # Decide on the imputation strategy based on the data type. Testing for
    # "numeric" (instead of dtype == 'O') also routes columns stored with
    # pandas' dedicated string dtype to the mode branch.
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())  # median replacement for numeric features
    else:
        df[col] = df[col].fillna(df[col].mode()[0])  # mode replacement for categories

# Dummy coding of the (two) categorical variables
df = pd.get_dummies(data=df, drop_first=True)

# Create default variable names X, y for further analysis
# 'BAD' is split off as the target; the notebook models it with classifiers
X = df.copy()
y = X.pop('BAD')

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# train and test bagged trees here

### Compare performance to single decision tree

In [None]:
# single decision tree code here