# In [6]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor as SKGB

# =========================
# Decision Tree Regressor
# =========================
class DecisionTreeRegressorScratch:
    """Binary regression tree grown greedily by exhaustive split search.

    Every unique value of every feature is tried as a ``<=`` threshold and
    the split with the smallest total squared error of the two children is
    kept.  The fitted tree lives in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``; anything
    that is not a tuple is a leaf holding the mean target of its samples.

    Parameters
    ----------
    max_depth : int, default=3
        Nodes at this depth are turned into leaves (the root has depth 0).
    min_samples_split : int, default=2
        Nodes holding fewer samples than this are turned into leaves.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None  # populated by fit()

    @staticmethod
    def _check_X(X):
        """Return ``X`` as a 2-D ndarray, raising ``ValueError`` otherwise."""
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D (n_samples, n_features), got {}-D".format(X.ndim)
            )
        return X

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples, n_features) and targets ``y``.

        Array-likes such as nested lists are accepted.  Returns ``self`` so
        calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, the data set is empty, or ``X`` and ``y``
            disagree on the number of samples.
        """
        X = self._check_X(X)
        y = np.asarray(y).ravel()
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X has {} samples but y has {}".format(X.shape[0], y.shape[0])
            )
        self.tree = self._build_tree(X, y, depth=0)
        return self

    def predict(self, X):
        """Return one prediction per row of ``X`` as a 1-D array.

        Raises
        ------
        RuntimeError
            If called before ``fit`` (there is no tree to walk yet).
        ValueError
            If ``X`` is not 2-D.
        """
        if self.tree is None:
            raise RuntimeError("predict() called before fit()")
        X = self._check_X(X)
        return np.array([self._predict_row(row, self.tree) for row in X])

    def _mse(self, y):
        """Return the sum of squared deviations of ``y`` from its mean.

        Despite the name this is a total (variance times sample count), not
        a mean, so adding the values of two children weights each child by
        its size.
        """
        return np.var(y) * len(y)

    def _best_split(self, X, y):
        """Find the ``(feature, threshold)`` pair with the lowest split error.

        Returns ``(None, None)`` when no threshold separates the samples
        into two non-empty groups.  Ties keep the first candidate found.
        """
        best_feature, best_thresh = None, None
        best_error = float("inf")

        for feature in range(X.shape[1]):
            column = X[:, feature]  # sliced once per feature, not per threshold
            for thresh in np.unique(column):
                left_mask = column <= thresh
                right_mask = column > thresh

                # A split that leaves one side empty is not a split.
                if not left_mask.any() or not right_mask.any():
                    continue

                error = self._mse(y[left_mask]) + self._mse(y[right_mask])

                if error < best_error:
                    best_error = error
                    best_feature = feature
                    best_thresh = thresh

        return best_feature, best_thresh

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            return np.mean(y)

        feature, thresh = self._best_split(X, y)

        if feature is None:
            return np.mean(y)

        left_mask = X[:, feature] <= thresh
        right_mask = X[:, feature] > thresh

        left_sub = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_sub = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return (feature, thresh, left_sub, right_sub)

    def _predict_row(self, x, node):
        """Walk from ``node`` down to a leaf for the single sample ``x``."""
        # Internal nodes are tuples; the first non-tuple reached is the leaf.
        while isinstance(node, tuple):
            feature, thresh, left, right = node
            node = left if x[feature] <= thresh else right
        return node

# =========================
# Gradient Boosting Regressor
# =========================
class GradientBoostingRegressorScratch:
    """Gradient boosting for squared-error regression on scratch-built trees.

    The model starts from the mean of the targets and then repeatedly fits a
    ``DecisionTreeRegressorScratch`` to the current residuals, adding each
    tree's prediction scaled by ``learning_rate``.

    Parameters
    ----------
    n_estimators : int, default=50
        Number of boosting rounds (one tree per round).
    learning_rate : float, default=0.1
        Factor applied to every tree's contribution.
    max_depth : int, default=3
        ``max_depth`` passed to each tree.
    """

    def __init__(self, n_estimators=50, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []  # fitted trees, in boosting order
        self.init_prediction = None  # mean of the training targets, set by fit()

    def fit(self, X, y):
        """Fit the ensemble on ``X`` (n_samples, n_features) and targets ``y``.

        Array-likes are accepted and ``y`` is flattened to 1-D.  Any trees
        from an earlier ``fit`` call are discarded.  Returns ``self``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, the data set is empty, or ``X`` and ``y``
            disagree on the number of samples.
        """
        X = np.asarray(X)
        # Flatten so a column-shaped y cannot broadcast the residual to (n, n).
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D (n_samples, n_features), got {}-D".format(X.ndim)
            )
        if y.size == 0:
            raise ValueError("cannot fit on an empty data set")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X has {} samples but y has {}".format(X.shape[0], y.shape[0])
            )

        # Start from a clean ensemble: without this a second fit() would
        # stack its trees on top of the ones left over from the first.
        self.trees = []

        # initial prediction = mean
        self.init_prediction = np.mean(y)
        y_pred = np.full(len(y), self.init_prediction, dtype=float)

        for _ in range(self.n_estimators):
            residual = y - y_pred  # negative gradient of MSE

            tree = DecisionTreeRegressorScratch(max_depth=self.max_depth)
            tree.fit(X, residual)

            y_pred += self.learning_rate * tree.predict(X)

            self.trees.append(tree)

        return self

    def predict(self, X):
        """Return the boosted prediction for every row of ``X`` as a 1-D array.

        Raises
        ------
        RuntimeError
            If called before ``fit`` (there is no baseline prediction yet).
        """
        if self.init_prediction is None:
            raise RuntimeError("predict() called before fit()")

        X = np.asarray(X)
        y_pred = np.full(X.shape[0], self.init_prediction, dtype=float)

        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)

        return y_pred

# =========================
# Synthetic Dataset
# =========================
# Reproducible toy problem: 500 samples of a single feature scaled to
# [0, 10), with a sine target plus Gaussian noise (std 0.3).
np.random.seed(42)
X = 10 * np.random.rand(500, 1)
y = np.sin(X).reshape(-1) + np.random.normal(loc=0, scale=0.3, size=500)

# -------------------------
# From-scratch booster
# -------------------------
custom_model = GradientBoostingRegressorScratch(n_estimators=60, learning_rate=0.1, max_depth=3)
custom_model.fit(X, y)
# Predictions are made on the same X used for fitting, so the MSE printed
# below is a training error, not a held-out score.
pred_custom = custom_model.predict(X)

# -------------------------
# scikit-learn reference
# -------------------------
# NOTE(review): SKGB() runs with the library's default hyperparameters,
# which are not necessarily the ones given to custom_model above.
sk_model = SKGB()
sk_model.fit(X, y)
pred_sk = sk_model.predict(X)

# -------------------------
# Report
# -------------------------
print("=== RESULTS ===")
print("Custom GBM MSE:", mean_squared_error(y, pred_custom))
print("Sklearn GBM MSE:", mean_squared_error(y, pred_sk))
print("Trees built:", len(custom_model.trees))
print("Project: SUCCESS RUN")


# Output of the cell above:
# === RESULTS ===
# Custom GBM MSE: 0.06663056430334145
# Sklearn GBM MSE: 0.05896065028915962
# Trees built: 60
# Project: SUCCESS RUN
