### Random Forests

Random Forests are an ensemble learning method that builds multiple decision trees and merges their results to improve prediction accuracy and control over-fitting.

**Key Steps:**

1. **Tree Construction:**
   - **Create as many trees as you want:** Each tree is built using a different subset of data, which is randomly sampled.
   - **Random Subsets:** For each tree, a random subset of features is used to create splits, introducing diversity in the ensemble.
   - **Majority Vote Principle:** The final prediction is based on combining the outcomes of all trees.

2. **Prediction:**
   - **Get the predictions from each tree.**
   - **If**
     - **Classification:** The final class label is determined by a majority vote (the class that gets the most votes from the individual trees is selected).
     - **Regression:** The final output is the average of all the predictions from the trees.

In [1]:
import numpy as np
from collections import Counter

from utils.decision_tree import DecisionTree

In [2]:
class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, n_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
             tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, n_features=self.n_features)

        X_sample, y_sample = self._bootstrap_samples(X,y)
        tree.fit(X_sample, y_sample)
        self.trees.append(tree)

    def predict(self,X):
        # [[1,0,1], [0,0,1] ----> predictions for the same sample from different trees in the same inner list]
        predictions = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self._most_common_label_2(pred) for pred in tree_preds])
        return predictions


    # Helper Function
    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        idxs =  np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]
    
    def _most_common_label_2(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

In [3]:
# Train & Test

from sklearn import datasets
from sklearn.model_selection import train_test_split

data = datasets.load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

clf = RandomForest(n_trees=20)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

acc = accuracy(y_test, predictions)
print(acc)

0.9210526315789473
