# In [None]:
import numpy as np

class DecisionTree:
    """Binary decision-tree classifier built from axis-aligned threshold splits.

    The fitted tree is stored in ``self.model`` as nested dicts. A leaf is
    ``{'label': value}``; an internal node is ``{'feature': column_index,
    'threshold': value, 'children': {'left': node, 'right': node}}``. Rows
    with ``row[feature] <= threshold`` go left and all other rows go right.
    """

    def __init__(self, error_function='gini'):
        """Create an untrained tree.

        Args:
            error_function: Default impurity criterion used by ``train``,
                either ``'gini'`` or ``'entropy'``.
        """
        self.error_function = error_function
        self.model = {}

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities**2)

    def entropy(self, y):
        """Return the Shannon entropy, in bits, of the labels ``y``."""
        # Imported lazily so scipy is only needed when this criterion is used;
        # aliased so the import does not shadow this method's own name.
        from scipy.stats import entropy as shannon_entropy
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return shannon_entropy(probabilities, base=2)

    def train(self, X, y, hyperparameters=None):
        """Fit the tree to ``X`` / ``y`` and store it in ``self.model``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.
            hyperparameters: Optional dict. Recognised keys:
                ``'error_function'`` (``'gini'`` or ``'entropy'``; defaults to
                the value given to the constructor) and ``'max_depth'``
                (maximum number of split levels; defaults to 1, ``None``
                grows the tree until no useful split remains).

        Raises:
            ValueError: If the dataset is empty or the impurity criterion is
                not recognised.
        """
        hyperparameters = hyperparameters or {}
        criterion = hyperparameters.get('error_function', self.error_function)
        impurity = self._impurity_function(criterion)
        max_depth = hyperparameters.get('max_depth', 1)

        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot train a decision tree on an empty dataset.")

        self.model = self._build_node(X, y, impurity, max_depth)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``.

        Raises:
            ValueError: If the tree has not been trained yet.
        """
        if not self.model:
            raise ValueError("The tree has not been trained yet.")
        predictions = []
        for row in X:
            node = self.model
            # Descend until a leaf (a node without children) is reached.
            while 'children' in node:
                if row[node['feature']] <= node['threshold']:
                    node = node['children']['left']
                else:
                    node = node['children']['right']
            predictions.append(node['label'])
        return predictions

    def _impurity_function(self, name):
        """Map an impurity criterion name to the bound method implementing it."""
        criteria = {'gini': self.gini, 'entropy': self.entropy}
        if name not in criteria:
            raise ValueError(
                f"Unknown error_function {name!r}; expected one of {sorted(criteria)}."
            )
        return criteria[name]

    def _build_node(self, X, y, impurity, max_depth, depth=0):
        """Recursively build the node dict for the samples ``X`` / ``y``."""
        classes, counts = np.unique(y, return_counts=True)
        # np.unique returns sorted classes, so ties resolve to the smallest label.
        leaf = {'label': classes[np.argmax(counts)]}

        if len(classes) == 1:
            return leaf
        if max_depth is not None and depth >= max_depth:
            return leaf

        split = self._best_split(X, y, impurity)
        if split is None:
            # No threshold separates the rows (e.g. all features constant).
            return leaf

        feature, threshold, goes_left = split
        goes_right = ~goes_left
        return {
            'feature': feature,
            'threshold': threshold,
            'children': {
                'left': self._build_node(
                    X[goes_left], y[goes_left], impurity, max_depth, depth + 1
                ),
                'right': self._build_node(
                    X[goes_right], y[goes_right], impurity, max_depth, depth + 1
                ),
            },
        }

    def _best_split(self, X, y, impurity):
        """Return ``(feature, threshold, left_mask)`` of the lowest-impurity split.

        Returns ``None`` when no candidate threshold puts at least one row on
        each side.
        """
        n_samples = len(y)
        best_score = float('inf')
        best = None

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                goes_left = column <= threshold
                n_left = np.count_nonzero(goes_left)
                n_right = n_samples - n_left
                # A split with an empty side separates nothing and would leave
                # a child without any labels to vote on.
                if n_left == 0 or n_right == 0:
                    continue

                # Rows that are not "<= threshold" go right, mirroring predict().
                score = (
                    (n_left / n_samples) * impurity(y[goes_left])
                    + (n_right / n_samples) * impurity(y[~goes_left])
                )
                # Strict comparison keeps the first split found among ties.
                if score < best_score:
                    best_score = score
                    best = (feature, threshold, goes_left)

        return best

class RandomForest:
    """Ensemble of ``DecisionTree`` classifiers combined by majority vote."""

    def __init__(self):
        """Create an untrained forest."""
        self.trees = []
        self.num_trees = 0

    def train(self, X, y, hyperparameters=None):
        """Train ``num_trees`` trees, each on a random 80% subset of the data.

        Every subset is drawn without replacement by shuffling the row indices
        with ``np.random.shuffle`` and keeping the first 80% (at least one row).

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.
            hyperparameters: Optional dict. ``'num_trees'`` (default 10) sets
                the ensemble size and ``'error_function'`` (default ``'gini'``)
                the impurity criterion. The whole dict is also forwarded to
                each tree's ``train`` call.

        Raises:
            ValueError: If the training set is empty.
        """
        hyperparameters = dict(hyperparameters or {})
        num_trees = hyperparameters.get('num_trees', 10)
        error_function = hyperparameters.get('error_function', 'gini')

        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("Cannot train a random forest on an empty dataset.")

        # int(0.8 * n) is 0 for a single row; always keep at least one sample.
        subset_size = max(1, int(0.8 * n_samples))
        tree_params = {**hyperparameters, 'error_function': error_function}

        self.num_trees = num_trees
        self.trees = []
        for _ in range(num_trees):
            # A fresh index array per tree keeps the draws independent.
            indices = np.arange(n_samples)
            np.random.shuffle(indices)
            chosen = indices[:subset_size]

            tree = DecisionTree(error_function=error_function)
            tree.train(X[chosen], y[chosen], dict(tree_params))
            self.trees.append(tree)

    def predict(self, X):
        """Return a list with the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label.

        Raises:
            ValueError: If the forest has not been trained yet.
        """
        if not self.trees:
            raise ValueError("The forest has not been trained yet.")

        # One row of votes per tree, one column per sample.
        votes = np.array([np.asarray(tree.predict(X)) for tree in self.trees])

        final_predictions = []
        for sample_votes in votes.T:
            # Counting with np.unique keeps the labels' own dtype, so negative,
            # float and string labels are counted without being cast to int.
            labels, counts = np.unique(sample_votes, return_counts=True)
            final_predictions.append(labels[np.argmax(counts)])
        return final_predictions


