### Random Forest Implementation

Random forest is a bagging algorithm built on top of decision trees. The name "bagging" comes from the two steps described below: **B**ootstrap and **Agg**regat**ing**.

#### B- Bootstrap
This means that the dataset is sampled (with replacement) and the sampled data is then fed to the model. Each model is trained on a different set of sampled data.

#### Agg- Aggregate
This means that each model in the ensemble is trained on a different set of data, and at inference time the results of all models are aggregated into a single prediction.


#### Components

##### Decision Tree Class:
Constructs individual decision trees based on specified parameters like minimum samples and maximum depth.

##### Random Forest Function:
Builds a random forest using multiple decision trees trained on bootstrap samples.

##### Prediction Function:
Aggregates predictions from all decision trees in the forest to provide the final prediction using a majority vote approach.

In [1]:
import numpy as np
import pandas as pd

In [2]:
class DecisionTree:
    """Binary classification tree grown greedily by impurity reduction.

    Training data is handled as a single 2-D array whose last column holds
    the class label and whose remaining columns hold the features.
    """

    # constructor of the class DecisionTree
    def __init__(self, min_samples_left=2, max_depth=2, criterion="gini"):
        """Configure the stopping rules and the split criterion.

        Args:
            min_samples_left: Minimum number of rows a node must hold to be
                considered for splitting.
            max_depth: Deepest level (the root is level 0) at which a node
                may still be split.
            criterion: ``"gini"`` scores splits with Gini impurity; any
                other value scores them with entropy.
        """
        self.root = None
        self.min_samples_left = min_samples_left
        self.max_depth = max_depth
        self.criterion = criterion

    ## function to build the decision tree
    def BuildTree(self, dataset, curr_depth=0):
        """Recursively grow a subtree from ``dataset`` and return its root.

        Args:
            dataset: 2-D array; the last column is the label.
            curr_depth: Depth of the node being built (0 for the root).

        Returns:
            A decision ``Node`` when a split with positive gain exists and
            the stopping rules allow it, otherwise a leaf ``Node`` holding
            the most frequent label of ``dataset``.
        """
        X, y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = X.shape

        # splitting on the best feature
        # NOTE: the comparison is inclusive, so nodes at depth == max_depth
        # are still split and leaves may sit at depth max_depth + 1.
        if num_samples >= self.min_samples_left and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # get_best_split returns {} when no threshold separates the rows
            # (e.g. every feature is constant); treat that as "no gain".
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.BuildTree(best_split["dataset_left"], curr_depth + 1)
                right_subtree = self.BuildTree(best_split["dataset_right"], curr_depth + 1)
                return Node(
                    best_split["feature_index"],
                    best_split["threshold"],
                    left_subtree,
                    right_subtree,
                    best_split["info_gain"],
                )

        return Node(value=self.calculate_leaf_value(y))

    # function to get the best split
    def get_best_split(self, dataset, num_samples, num_features):
        """Search every feature/threshold pair for the highest gain.

        Args:
            dataset: 2-D array; the last column is the label.
            num_samples: Number of rows (kept for interface compatibility;
                not used by the search).
            num_features: Number of feature columns to scan.

        Returns:
            A dict with keys ``feature_index``, ``threshold``,
            ``dataset_left``, ``dataset_right`` and ``info_gain`` for the
            best split, or an empty dict when no threshold leaves rows on
            both sides.
        """
        best_split = {}
        max_info_gain = -float("inf")
        y = dataset[:, -1]

        for feature_index in range(num_features):
            # every distinct value of the feature is a candidate threshold
            for threshold in np.unique(dataset[:, feature_index]):
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue
                curr_info_gain = self.information_gain(
                    y, dataset_left[:, -1], dataset_right[:, -1], self.criterion
                )
                if curr_info_gain > max_info_gain:
                    best_split = {
                        "feature_index": feature_index,
                        "threshold": threshold,
                        "dataset_left": dataset_left,
                        "dataset_right": dataset_right,
                        "info_gain": curr_info_gain,
                    }
                    max_info_gain = curr_info_gain

        return best_split

    # split function to divide the data on a basis of a threshold of a feature
    def split(self, dataset, feature_index, threshold):
        """Partition rows into ``<= threshold`` (left) and ``> threshold`` (right)."""
        column = dataset[:, feature_index]
        # boolean masks replace a per-row Python loop and keep the 2-D shape
        # even when one side is empty
        return dataset[column <= threshold], dataset[column > threshold]

    # information gain
    # 1- entropy
    # 2- gini impurity

    def information_gain(self, y, left_y, right_y, mode="entropy"):
        """Return parent impurity minus the size-weighted child impurity.

        ``mode == "gini"`` uses Gini impurity; any other value uses entropy.
        """
        weight_l = len(left_y) / len(y)
        weight_r = len(right_y) / len(y)
        impurity = self.gini_impurity if mode == "gini" else self.entropy
        return impurity(y) - (weight_l * impurity(left_y) + weight_r * impurity(right_y))

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def gini_impurity(self, y):
        """Return the Gini impurity ``1 - sum(p_class ** 2)`` of ``y``."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return 1 - np.sum(p ** 2)

    def calculate_leaf_value(self, y):
        """Return the most frequent label in ``y``.

        Ties go to the label that appears first in ``y``.
        """
        # local import: the file's import cell does not provide Counter
        from collections import Counter

        # single counting pass instead of calling list.count once per element
        return Counter(list(y)).most_common(1)[0][0]

    # model training
    def fit(self, X, y):
        """Grow the tree from features ``X`` (2-D) and labels ``y`` (1-D)."""
        # np.asarray lets lists and pandas objects be passed as well as arrays
        data = np.concatenate((np.asarray(X), np.asarray(y).reshape(-1, 1)), axis=1)
        self.root = self.BuildTree(data)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        Raises:
            RuntimeError: If the tree has no root yet.
        """
        if self.root is None:
            raise RuntimeError("DecisionTree has no root; call fit() before predict()")
        return [self.make_prediction(x, self.root) for x in np.asarray(X)]

    # making model inference
    def make_prediction(self, x, tree):
        """Walk from node ``tree`` down to a leaf for sample ``x``; return its value."""
        node = tree
        # only leaf nodes carry a value; decision nodes route by threshold
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value


class Node:
    """A single node of a decision tree.

    Decision nodes carry ``feature_index``, ``threshold``, both children and
    the ``info_gain`` of their split; leaf nodes carry only ``value``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None,
                 right=None, info_gain=None, value=None):
        # decision-node fields
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        # leaf-node field; None marks a decision node
        self.value = value

In [3]:
def Bootstrap_sample(data, bootstrap_size):
    """Draw ``bootstrap_size`` rows from ``data`` uniformly, with replacement.

    Row positions come from NumPy's global random state, so seeding
    ``np.random`` makes the sample reproducible. The rows are returned via
    ``data.iloc``, so they keep their original index labels.
    """
    n_rows = data.shape[0]
    picks = np.random.randint(low=0, high=n_rows, size=bootstrap_size)
    return data.iloc[picks]

In [4]:
def Random_Forest(data, bootstrap_size, random_attributes, random_splits, forest_size=20, tree_depth=1000):
    """Train ``forest_size`` decision trees, each on its own bootstrap sample.

    Args:
        data: pandas DataFrame; the last column is used as the label because
            ``DecisionTree.BuildTree`` reads the label from the last column.
        bootstrap_size: Number of rows drawn for each tree.
        random_attributes: Accepted for interface compatibility; currently
            unused.
        random_splits: Accepted for interface compatibility; currently
            unused.
        forest_size: Number of trees to train.
        tree_depth: ``max_depth`` handed to every ``DecisionTree``.

    Returns:
        A list of ``DecisionTree`` objects whose ``root`` has been built, so
        ``predict`` can be called on every element.
    """
    forest = []
    for _ in range(forest_size):
        bootstrap_data = Bootstrap_sample(data, bootstrap_size)

        # Create an instance of the DecisionTree with the required parameters
        dtree = DecisionTree(min_samples_left=2, max_depth=tree_depth)
        # Keep the tree object, not just the root node BuildTree returns:
        # only DecisionTree exposes predict().
        dtree.root = dtree.BuildTree(bootstrap_data.values, curr_depth=0)
        forest.append(dtree)
    return forest

In [5]:
def RandomForestPredictions(dataFrame, randomForest):
    """Predict a label for every row of ``dataFrame`` by majority vote.

    Args:
        dataFrame: pandas DataFrame of samples; feature columns must be in
            the positions the trees were trained on, because nodes address
            features by column position.
        randomForest: Iterable of trees. Each element is either an object
            with a ``predict(rows)`` method returning one label per row, or
            a bare root node with ``value`` / ``feature_index`` /
            ``threshold`` / ``left`` / ``right`` attributes.

    Returns:
        A pandas Series (first column of the row-wise mode) with one voted
        label per row.

    Raises:
        ValueError: If ``randomForest`` is empty.
    """
    trees = list(randomForest)
    if not trees:
        raise ValueError("randomForest must contain at least one tree")

    # Predict on the whole value matrix once per tree; handing predict() a
    # single row would make it iterate over that row's scalar values.
    rows = dataFrame.values
    predictions = {
        f"decision tree {i}": _predict_with_tree(tree, rows)
        for i, tree in enumerate(trees)
    }
    predictions_df = pd.DataFrame(predictions)
    # mode(axis=1) can return several columns on ties; column 0 is the vote
    return predictions_df.mode(axis=1)[0]


def _predict_with_tree(tree, rows):
    """Return one prediction per row from a tree object or a bare root node."""
    if hasattr(tree, "predict"):
        return list(tree.predict(rows))

    # Bare root node: walk it directly (leaves are the nodes with a value).
    results = []
    for x in rows:
        node = tree
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        results.append(node.value)
    return results
