Random Forest
 A Random Forest classifier that combines multiple decision trees for better accuracy.
    Each tree is trained on a random subset of data and features to improve model diversity.

    Parameters:
    -----------
    n_estimators: int
        How many trees to create in the forest.
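    max_features: int or None
        The maximum number of features each tree can use to make its splits.
        If left unset (None), it defaults to the integer part of sqrt(n_features) when the forest is fitted.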
    min_samples_split: int
        Minimum samples required to split an internal node.
    min_gain: float
        Minimum decrease in impurity needed to continue splitting.
    max_depth: int
        Maximum depth of each tree to prevent overfitting.

In [None]:
from __future__ import division, print_function
import numpy as np
import math
import progressbar

# Import helper modules and functions
from mlfromscratch.utils import divide_on_feature, train_test_split, get_random_subsets, normalize
from mlfromscratch.utils import accuracy_score, calculate_entropy
from mlfromscratch.unsupervised_learning import PCA
from mlfromscratch.supervised_learning import ClassificationTree
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.utils import Plot


class RandomForest:
    """Random Forest classifier built from an ensemble of ClassificationTree models.

    Each tree is fitted on its own random subset of the training samples
    (as produced by ``get_random_subsets``) and on a random selection of
    feature columns. Predictions are combined by majority vote.

    Parameters:
    -----------
    n_estimators: int
        How many trees to create in the forest.
    max_features: int or None
        The maximum number of features each tree is trained on. When left
        unset (None or 0) it is set to int(sqrt(n_features)) during ``fit``.
    min_samples_split: int
        Minimum samples required to split an internal node
        (forwarded to every tree as ``min_samples_split``).
    min_gain: float
        Minimum decrease in impurity needed to continue splitting
        (forwarded to every tree as ``min_impurity``).
    max_depth: int
        Maximum depth of each tree (forwarded to every tree as ``max_depth``).
    """

    def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
                 min_gain=0, max_depth=float("inf")):
        # Hyperparameters controlling the forest and its trees
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.min_samples_split = min_samples_split
        self.min_gain = min_gain
        self.max_depth = max_depth
        self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)

        # The trees are created up front and trained later in fit()
        self.trees = [
            ClassificationTree(
                min_samples_split=self.min_samples_split,
                min_impurity=self.min_gain,
                max_depth=self.max_depth
            )
            for _ in range(n_estimators)
        ]

    def fit(self, X, y):
        """Train every tree on a random subset of samples and of feature columns.

        Parameters:
        -----------
        X: array-like, two-dimensional (samples x features)
            Training data; the number of features is read from its second axis.
        y: array-like
            Targets, handed to ``get_random_subsets`` together with X.

        The feature columns chosen for a tree are stored on that tree as
        ``feature_indices`` so ``predict`` can select the same columns.
        """
        n_features = np.shape(X)[1]

        # If max_features is not defined, fall back to sqrt(n_features)
        if not self.max_features:
            self.max_features = int(math.sqrt(n_features))

        # Features are drawn WITHOUT replacement so that a tree never receives
        # the same column twice; the count is clamped because one cannot draw
        # more distinct columns than the data has.
        n_selected = min(self.max_features, n_features)

        # One random subset of the data per tree
        subsets = get_random_subsets(X, y, self.n_estimators)

        for i in self.progressbar(range(self.n_estimators)):
            X_subset, y_subset = subsets[i]
            tree = self.trees[i]

            # Distinct random feature columns for this tree
            feature_indices = np.random.choice(n_features, size=n_selected, replace=False)
            tree.feature_indices = feature_indices

            # Train the tree on the selected columns only
            tree.fit(X_subset[:, feature_indices], y_subset)

    def predict(self, X):
        """Predict a label for every row of X by majority vote over the trees.

        Parameters:
        -----------
        X: array-like, two-dimensional (samples x features)
            Samples to classify; it is converted with ``np.asarray`` so plain
            nested lists are accepted as well as arrays.

        Returns:
        --------
        list
            One predicted label per sample. Votes are cast to int and counted
            with ``np.bincount``, so labels must be non-negative integers;
            ties resolve to the smallest label.
        """
        X = np.asarray(X)

        # Column i holds the predictions of tree i for all samples
        y_preds = np.empty((X.shape[0], len(self.trees)))
        for i, tree in enumerate(self.trees):
            # Use only the features this tree was trained on
            y_preds[:, i] = tree.predict(X[:, tree.feature_indices])

        # Majority vote per sample
        votes = y_preds.astype(int)
        return [np.bincount(sample_votes).argmax() for sample_votes in votes]
