# Gradient Boosting Regression
- This notebook implements the Gradient Boosting algorithm for regression from scratch, using scikit-learn decision trees as the base learners

# Importing Libraries

In [2]:
# data manipulation
import numpy as np
import pandas as pd

# tree
from sklearn.tree import DecisionTreeRegressor

# metrics
from sklearn.metrics import mean_squared_error, r2_score

# dataset
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

## Custom Class

In [10]:
class GBRegressor:
  """
  Gradient Boosting regressor built on shallow regression trees.

  The ensemble starts from the mean of the training targets and then adds
  `n_estimators` trees one after another. Each tree is fitted to the residuals
  left by the current ensemble and its output is scaled by `learning_rate`
  before being added.

  Attributes set by `fit`:
  ------------------------

  null_model_: float
               Mean of the training targets, i.e. the prediction before any tree is added

  base_estimators_: list
                    The fitted trees, in the order they were trained
  """

  def __init__(self,
               n_estimators=10,
               learning_rate=1.0,
               random_state=None,
               max_depth=3):
    """
    Parameters:
    -----------

    n_estimators: int
                  The total no. of base estimators to include in the ensemble

    learning_rate: float
                   Used to regularize the model and prevent over-fitting

    random_state: int
                  For controlling the randomness and reproducibility of results across multiple runs.
                  It is handed to every base estimator, so `0` is a valid seed

    max_depth: int
               Maximum depth of each base estimator (regression tree)
    """
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.random_state = random_state
    self.max_depth = max_depth


  def fit(self, X, y):
    """
    This method will train a Gradient Boosting regressor model

    Parameters:
    -----------

    X: array-like of shape (n_samples, n_features)
       Training features

    y: array-like of shape (n_samples,)
       Training targets

    Returns:
    --------

    self: the fitted model, so calls can be chained
    """
    y = np.asarray(y, dtype=float)

    self.base_estimators_ = []
    self.null_model_ = np.mean(y)

    # every sample starts at the mean of y; a dedicated array keeps the running
    # predictions separate from the stored `null_model_`
    predictions = np.full(y.shape[0], self.null_model_)

    for _ in range(self.n_estimators):
      # for squared-error loss the negative gradient (pseudo-residual) is simply
      # the gap between the target and the current prediction
      residuals = y - predictions

      # seeding the tree itself (rather than calling np.random.seed) keeps the
      # process-wide NumPy RNG untouched and also honours random_state=0
      tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                   random_state=self.random_state)
      tree.fit(X, residuals)
      self.base_estimators_.append(tree)

      predictions += self.learning_rate * tree.predict(X)

    print("> Succesfully trained a Gradient Boosting Regressor model\n")
    return self


  def predict(self, X):
    """
    Predict targets for `X` as the null model plus the scaled output of every tree

    Parameters:
    -----------

    X: array-like of shape (n_samples, n_features)

    Returns:
    --------

    result: ndarray of shape (n_samples,)
    """
    result = np.full(np.shape(X)[0], self.null_model_, dtype=float)
    for tree in self.base_estimators_:
      result += self.learning_rate * tree.predict(X)
    return result


  def score(self, X, y):
    """
    Print the RMSE and R-squared of the model's predictions for `X` against `y`

    Nothing is returned; the two metrics are only printed
    """
    y_pred = self.predict(X)

    # take the root ourselves: the `squared=False` keyword of mean_squared_error
    # is not accepted by recent scikit-learn releases
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    print(f"{'RMSE':>10}: {rmse}")
    print(f"{'R-squared':>10}: {r2_score(y, y_pred)}")

# Getting the Data

In [11]:
# California housing data as plain (features, target) arrays
X, y = fetch_california_housing(return_X_y=True)

# hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# sanity check: row counts of features and targets must agree within each split
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

(16512, 8) (16512,)
(4128, 8) (4128,)


# Training the Model

In [12]:
# Fit an ensemble with 50 boosting rounds on the training split; the remaining
# constructor arguments keep their defaults
gbr = GBRegressor(n_estimators=50)
gbr.fit(X_train, y_train)

> Succesfully trained a Gradient Boosting Regressor model



<__main__.GBRegressor at 0x78525c049cf0>

# Evaluating the Model

In [13]:
# Print RMSE and R-squared on the rows the model was trained on
gbr.score(X_train, y_train)

      RMSE: 0.4525762746022756
 R-squared: 0.8457318112373442


In [15]:
# Print RMSE and R-squared on the held-out test split
gbr.score(X_test, y_test)

      RMSE: 0.5239203841420708
 R-squared: 0.7959188080933933


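- The model performs well on this dataset: R-squared is about 0.80 (RMSE about 0.52) on the test split, against about 0.85 (RMSE about 0.45) on the training split
- Performance can be improved further by tuning the parameters:
  - `n_estimators`
  - `learning_rate`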