1. scratch

In [6]:
import numpy as np
from collections import Counter
from copy import deepcopy

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(np.array(X), np.array(y), depth=0)

    def _build_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        # print(X.shape)
        unique_labels = np.unique(y)

        # Stopping conditions
        if len(unique_labels) == 1:
            return unique_labels[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return Counter(y).most_common(1)[0][0]
        if num_samples == 0:
            return None

        # Find best split
        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return Counter(y).most_common(1)[0][0]

        # Split dataset
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = X[:, best_feature] > best_threshold

        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            "feature": best_feature,
            "threshold": best_threshold,
            "left": left_subtree,
            "right": right_subtree
        }

    def _best_split(self, X, y):
        best_gain = 0
        best_feature = None
        best_threshold = None
        current_entropy = self._entropy(y)

        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_mask = X[:, feature_index] <= threshold
                right_mask = X[:, feature_index] > threshold
                if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
                    continue

                gain = self._information_gain(y, y[left_mask], y[right_mask], current_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    def _entropy(self, y):
        counts = np.bincount(y)
        probs = counts / len(y)
        return -np.sum([p * np.log2(p) for p in probs if p > 0])

    def _information_gain(self, parent, left, right, parent_entropy):
        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        return parent_entropy - (weight_left * self._entropy(left) + weight_right * self._entropy(right))

    def predict(self, X):
        return [self._predict_point(x, self.tree) for x in np.array(X)]

    def _predict_point(self, x, tree):
        if not isinstance(tree, dict):
            return tree

        feature = tree["feature"]
        # print('f', feature)
        threshold = tree["threshold"]
        # print('t', threshold)

        if x[feature] <= threshold:
            return self._predict_point(x, tree["left"])
        else:
            return self._predict_point(x, tree["right"])



class RandomForest:
    def __init__(self, n_estimators=10, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)

        for _ in range(self.n_estimators):
            # Bootstrap sampling
            indices = np.random.choice(len(X), size=len(X), replace=True)
            X_sample = X[indices]
            y_sample = y[indices]

            # Train a Decision Tree
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        # Gather predictions from all trees
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Transpose to shape (num_samples, num_trees)
        tree_preds = tree_preds.T

        # Majority vote
        for row in tree_preds:
            print(Counter(row))
            print(Counter(row).most_common(1))

        final_preds = [Counter(row).most_common(1)[0][0] for row in tree_preds]
        return final_preds

# Load dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train Random Forest
rf = RandomForest(n_estimators=5, max_depth=3)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))


Counter({np.int64(0): 5})
[(np.int64(0), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(0): 5})
[(np.int64(0), 5)]
Counter({np.int64(2): 5})
[(np.int64(2), 5)]
Counter({np.int64(1): 4, np.int64(2): 1})
[(np.int64(1), 4)]
Counter({np.int64(2): 5})
[(np.int64(2), 5)]
Counter({np.int64(0): 4, np.int64(1): 1})
[(np.int64(0), 4)]
Counter({np.int64(0): 5})
[(np.int64(0), 5)]
Counter({np.int64(2): 5})
[(np.int64(2), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(0): 5})
[(np.int64(0), 5)]
Counter({np.int64(2): 5})
[(np.int64(2), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(0): 5})
[(np.int64(0), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(0): 5})
[(np.int64(0), 5)]
Counter({np.int64(0): 5})
[(np.int64(0), 5)]
Counter({np.int64(1): 5})
[(np.int64(1), 5)]
Counter({np.int64(1): 5

2. sklearn

In [None]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 加载数据
iris = load_iris()
X, y = iris.data, iris.target

# 拆分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# 初始化并训练 Random Forest 分类器
clf = RandomForestClassifier(n_estimators=5, max_depth=3, random_state=1)
clf.fit(X_train, y_train)

# 预测
y_pred = clf.predict(X_test)

# 输出准确率
print("Sklearn Random Forest Accuracy:", accuracy_score(y_test, y_pred))
