<a href="https://colab.research.google.com/github/MLcmore2023/MLcmore2023/blob/main/day3_pm_afternoon/random-forest-demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest
Random forest is an ensemble machine learning algorithm that combines multiple decision trees to make accurate predictions or classifications. It works by training numerous decision trees on different subsets of the dataset using bootstrapping (random sampling with replacement) and feature randomization. These trees collectively form a "forest," and their predictions are averaged or voted upon to provide a more robust and accurate outcome, reducing overfitting and improving generalization to new data. This approach enhances predictive performance by leveraging the diversity and collective intelligence of the individual trees.

<img src="https://www.tibco.com/sites/tibco/files/media_entity/2021-05/random-forest-diagram.svg">


In [None]:
import random
import pandas as pd
import numpy as np

In [None]:
# Load the Titanic training CSV into a DataFrame.
# NOTE(review): this path looks Kaggle-specific -- presumably it must be
# adjusted when the notebook is run elsewhere (e.g. Colab); confirm.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')

In [None]:
# Fill missing ages with the rounded mean age (the mean ignores missing values).
df['Age'] = df['Age'].fillna(np.round(df['Age'].mean()))
# Fill missing embarkation ports with the most frequent port.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])

In [None]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Shuffle the rows once with a fixed seed, then keep the first 90% for
# training and the remaining 10% for testing.
nb_train = int(np.floor(0.9 * len(df)))
df = df.sample(frac=1, random_state=217)

X_train = df.iloc[:nb_train][features]
y_train = df.iloc[:nb_train]['Survived'].values
X_test = df.iloc[nb_train:][features]
y_test = df.iloc[nb_train:]['Survived'].values

In [None]:
def entropy(p):
    """
    Compute the binary (base-2) entropy of a probability.

    Parameters:
    - p (float): Probability of the positive class

    Returns:
    - float: Entropy in bits; 0 when p is exactly 0 or 1
    """
    # log2(0) is undefined, so the two degenerate cases are answered directly.
    if p in (0, 1):
        return 0
    q = 1 - p
    return - (p * np.log2(p) + q * np.log2(q))

def information_gain(left_child, right_child):
    """
    Calculate the information gain of splitting a parent node into two children.

    The parent is the concatenation of both children. The gain is the parent's
    entropy minus the size-weighted entropies of the children, where each
    entropy is computed from the fraction of labels equal to 1.

    Parameters:
    - left_child (list): Labels of the left child node
    - right_child (list): Labels of the right child node

    Returns:
    - float: Information gain value (0.0 when both children are empty)
    """
    # Copy into plain lists so that `+` always concatenates (it would add
    # element-wise on NumPy arrays) and `.count` is available.
    left_labels = list(left_child)
    right_labels = list(right_child)
    parent = left_labels + right_labels

    n_parent = len(parent)
    if n_parent == 0:
        # Nothing to split: avoid dividing by zero in the weights below.
        return 0.0

    def positive_fraction(labels):
        # An empty child contributes zero entropy (and zero weight).
        return labels.count(1) / len(labels) if len(labels) > 0 else 0

    IG_p = entropy(positive_fraction(parent))
    IG_l = entropy(positive_fraction(left_labels))
    IG_r = entropy(positive_fraction(right_labels))
    return IG_p - len(left_labels) / n_parent * IG_l - len(right_labels) / n_parent * IG_r

In [None]:
def draw_bootstrap(X_train, y_train):
    """
    Create a bootstrap sample from the training data.

    Row positions are drawn with replacement via np.random.choice, so the
    result depends on NumPy's global random state. Rows that were never drawn
    form the out-of-bag (OOB) sample, returned in ascending row order.

    Parameters:
    - X_train (DataFrame): Features of the training data
    - y_train (array): Labels of the training data

    Returns:
    - tuple: (X_bootstrap, y_bootstrap, X_oob, y_oob); the feature parts are
      arrays obtained from DataFrame.values
    """
    n_samples = len(X_train)
    bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True).tolist()

    # Membership tests against a set are O(1); scanning the index list for
    # every row made the OOB computation quadratic in the number of rows.
    in_bag = set(bootstrap_indices)
    oob_indices = [i for i in range(n_samples) if i not in in_bag]

    X_bootstrap = X_train.iloc[bootstrap_indices].values
    y_bootstrap = y_train[bootstrap_indices]
    X_oob = X_train.iloc[oob_indices].values
    y_oob = y_train[oob_indices]
    return X_bootstrap, y_bootstrap, X_oob, y_oob

def oob_score(tree, X_test, y_test):
    """
    Calculate the out-of-bag (OOB) error of a single decision tree.

    Despite the name, the value returned is the misclassification rate
    (lower is better), not an accuracy.

    Parameters:
    - tree (dict): Decision tree
    - X_test (array): Feature rows of the OOB sample; rows are taken by
      iterating over the first axis
    - y_test (array): Labels of the OOB sample

    Returns:
    - float: Fraction of OOB rows the tree misclassifies; 0.0 when the OOB
      sample is empty (there is nothing to misclassify)
    """
    n_samples = len(X_test)
    if n_samples == 0:
        # A bootstrap draw can cover every row, leaving no OOB rows at all;
        # without this guard the division below raises ZeroDivisionError.
        return 0.0

    mis_label = sum(
        1 for row, label in zip(X_test, y_test)
        if predict_tree(tree, row) != label
    )
    return mis_label / n_samples

In [None]:
def find_split_point(X_bootstrap, y_bootstrap, max_features):
    """
    Find the best split point for a given bootstrap sample.

    A random subset of feature columns is drawn, every distinct value of each
    drawn column is tried as a split point, and the split with the highest
    information gain wins (the first one found wins ties).

    Parameters:
    - X_bootstrap (array): Features of the bootstrap sample (2-D, one row per sample)
    - y_bootstrap (array): Labels of the bootstrap sample
    - max_features (int): Maximum number of features to consider

    Returns:
    - dict: Split node information with keys 'information_gain',
      'left_child', 'right_child', 'split_point' and 'feature_idx'; each child
      is a dict holding 'X_bootstrap' (array) and 'y_bootstrap' (list)
    """
    num_features = len(X_bootstrap[0])

    # Draw exactly `max_features` distinct columns, at least one and never
    # more than exist. The previous rejection loop used `<=`, so it drew one
    # feature too many and never terminated once max_features >= num_features.
    n_candidates = min(max(max_features, 1), num_features)
    feature_ls = random.sample(range(num_features), n_candidates)

    best_info_gain = float('-inf')
    node = None
    for feature_idx in feature_ls:
        column = X_bootstrap[:, feature_idx]
        tried = set()
        for split_point in column:
            # Repeated values yield the identical split, so evaluate each
            # (type, value) pair once. The first occurrence is kept, which
            # preserves the original first-wins tie-breaking.
            key = (type(split_point), split_point)
            if key in tried:
                continue
            tried.add(key)

            # Plain Python ints/floats are treated as continuous (<=);
            # everything else is treated as categorical (==).
            # NOTE: NumPy scalar types (e.g. np.float64 coming from a purely
            # numeric array) are not `int`/`float` and therefore take the
            # categorical branch.
            if type(split_point) in (int, float):
                goes_left = [value <= split_point for value in column]
            else:
                goes_left = [value == split_point for value in column]

            left_child = {'X_bootstrap': [], 'y_bootstrap': []}
            right_child = {'X_bootstrap': [], 'y_bootstrap': []}
            for i, is_left in enumerate(goes_left):
                child = left_child if is_left else right_child
                child['X_bootstrap'].append(X_bootstrap[i])
                child['y_bootstrap'].append(y_bootstrap[i])

            split_info_gain = information_gain(left_child['y_bootstrap'], right_child['y_bootstrap'])
            if split_info_gain > best_info_gain:
                best_info_gain = split_info_gain
                # Only the winning candidate's rows are turned into arrays.
                left_child['X_bootstrap'] = np.array(left_child['X_bootstrap'])
                right_child['X_bootstrap'] = np.array(right_child['X_bootstrap'])
                node = {'information_gain': split_info_gain,
                        'left_child': left_child,
                        'right_child': right_child,
                        'split_point': split_point,
                        'feature_idx': feature_idx}

    return node

In [None]:
def terminal_node(node):
    """
    Create a terminal node for the decision tree.

    Parameters:
    - node (dict): Node information; only its 'y_bootstrap' labels are read

    Returns:
    - int: Most frequent label in the node; on a tie, the tied label that
      appears first in 'y_bootstrap'

    Raises:
    - ValueError: If 'y_bootstrap' is empty
    """
    from collections import Counter

    # One counting pass instead of calling list.count once per element.
    # Counter keeps keys in first-seen order and max() returns the first
    # maximum, so ties resolve exactly as with max(labels, key=labels.count).
    counts = Counter(node['y_bootstrap'])
    return max(counts, key=counts.get)


def split_node(node, max_features, min_samples_split, max_depth, depth):
    """
    Recursively grow the tree below a node, modifying the node in place.

    The node's temporary 'left_child' / 'right_child' entries are removed and
    replaced by 'left_split' / 'right_split', each of which is either a
    predicted label (leaf) or a nested split-node dict.

    Parameters:
    - node (dict): Node to split
    - max_features (int): Maximum number of features to consider
    - min_samples_split (int): Minimum number of samples required to split a node
    - max_depth (int): Maximum depth of the decision tree
    - depth (int): Current depth of the node

    Returns:
    - None
    """
    left_child = node.pop('left_child')
    right_child = node.pop('right_child')

    # The split did not separate anything: both sides predict the majority
    # label of all samples that reached this node.
    if len(left_child['y_bootstrap']) == 0 or len(right_child['y_bootstrap']) == 0:
        merged = {'y_bootstrap': left_child['y_bootstrap'] + right_child['y_bootstrap']}
        leaf = terminal_node(merged)
        node['left_split'] = leaf
        node['right_split'] = leaf
        return

    # Depth limit reached: both children become leaves.
    if depth >= max_depth:
        node['left_split'] = terminal_node(left_child)
        node['right_split'] = terminal_node(right_child)
        return

    # Each side is handled independently and only ever writes its own key.
    # Previously the leaf branches assigned BOTH keys, so a small right child
    # overwrote the left subtree that had just been built, and the left
    # recursion received max_depth in place of max_features.
    for side, child in (('left_split', left_child), ('right_split', right_child)):
        if len(child['X_bootstrap']) <= min_samples_split:
            node[side] = terminal_node(child)
        else:
            node[side] = find_split_point(child['X_bootstrap'], child['y_bootstrap'], max_features)
            split_node(node[side], max_features, min_samples_split, max_depth, depth + 1)

In [None]:
def build_tree(X_bootstrap, y_bootstrap, max_depth, min_samples_split, max_features):
    """
    Grow one decision tree from a bootstrap sample.

    Parameters:
    - X_bootstrap (array): Features of the bootstrap sample
    - y_bootstrap (array): Labels of the bootstrap sample
    - max_depth (int): Maximum depth of the decision tree
    - min_samples_split (int): Minimum number of samples required to split a node
    - max_features (int): Maximum number of features to consider

    Returns:
    - dict: Root node of the decision tree
    """
    tree = find_split_point(X_bootstrap, y_bootstrap, max_features)
    # The root sits at depth 1; split_node expands it in place.
    split_node(
        tree,
        max_features,
        min_samples_split,
        max_depth,
        1,
    )
    return tree

def random_forest(X_train, y_train, n_estimators, max_features, max_depth, min_samples_split):
    """
    Build a random forest model.

    Each tree is grown on its own bootstrap sample and scored on the rows left
    out of that sample. The mean out-of-bag error across trees is printed.

    Parameters:
    - X_train (DataFrame): Features of the training data
    - y_train (array): Labels of the training data
    - n_estimators (int): Number of decision trees in the random forest
    - max_features (int): Maximum number of features to consider
    - max_depth (int): Maximum depth of the decision trees
    - min_samples_split (int): Minimum number of samples required to split a node

    Returns:
    - list: List of decision trees in the random forest
    """
    tree_ls = []
    oob_ls = []
    for _ in range(n_estimators):
        X_bootstrap, y_bootstrap, X_oob, y_oob = draw_bootstrap(X_train, y_train)
        # Pass hyperparameters by keyword: build_tree orders them as
        # (max_depth, min_samples_split, max_features), so the previous
        # positional call silently scrambled all three.
        tree = build_tree(
            X_bootstrap,
            y_bootstrap,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            max_features=max_features,
        )
        tree_ls.append(tree)
        # A bootstrap draw may cover every row; such a tree has no OOB rows
        # to be scored on and is left out of the estimate.
        if len(X_oob) > 0:
            oob_ls.append(oob_score(tree, X_oob, y_oob))

    oob_estimate = np.mean(oob_ls) if oob_ls else float('nan')
    print("OOB estimate: {:.2f}".format(oob_estimate))
    return tree_ls

In [None]:
def predict_tree(tree, X_test):
    """
    Predict the label for a given instance using a decision tree.

    Starting at the root, the instance is routed left or right at every split
    node until a non-dict value (the leaf's label) is reached.

    Parameters:
    - tree (dict): Decision tree
    - X_test (array): Feature values of the single instance to predict,
      indexable by feature position

    Returns:
    - int: Predicted label
    """
    node = tree
    while isinstance(node, dict):
        split_point = node['split_point']
        value = X_test[node['feature_idx']]
        # Route with the same rule the splits are built with: plain int/float
        # split points are thresholds (<=), anything else is a category (==).
        # Using <= for categories compared strings alphabetically and sent
        # instances down the wrong branch.
        if type(split_point) in (int, float):
            go_left = value <= split_point
        else:
            go_left = value == split_point
        node = node['left_split'] if go_left else node['right_split']
    return node

In [None]:
def predict_rf(tree_ls, X_test):
    """
    Predict the labels for a set of instances using a random forest.

    Every tree votes on every instance; the most common vote wins, and on a
    tie the tied label voted for by the earliest tree in the list is chosen.

    Parameters:
    - tree_ls (list): List of decision trees in the random forest
    - X_test (DataFrame): Features of the test data

    Returns:
    - array: Predicted labels
    """
    # Materialise the rows once; accessing `.values` inside the loops rebuilt
    # the whole array for every (row, tree) pair.
    rows = X_test.values

    pred_ls = []
    for row in rows:
        ensemble_preds = [predict_tree(tree, row) for tree in tree_ls]
        final_pred = max(ensemble_preds, key=ensemble_preds.count)
        pred_ls.append(final_pred)
    return np.array(pred_ls)

In [None]:
# Forest hyperparameters, defined once and passed through by name so the
# values used for training cannot drift from the ones declared here.
n_estimators = 100
max_features = 3
max_depth = 10
min_samples_split = 2

model = random_forest(
    X_train,
    y_train,
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
)

OOB estimate: 0.31


In [None]:
# Majority-vote predictions of the trained forest for the held-out rows.
preds = predict_rf(tree_ls=model, X_test=X_test)

In [None]:
# Accuracy = share of held-out rows whose predicted label equals the true one.
acc = np.mean(preds == y_test)
print("Testing accuracy: {}".format(np.round(acc, 3)))

Testing accuracy: 0.689


## References
- L. Breiman. Random forests. Machine Learning, 45(1):5–32, Oct. 2001. [[pdf]](https://link.springer.com/content/pdf/10.1023%2FA%3A1010933404324.pdf)
- https://carbonati.github.io/posts/random-forests-from-scratch/
- https://www.tibco.com/reference-center/what-is-a-random-forest