# Gradient Boost

## Thuật toán
- Hàm loss tại 1 điểm:  $ L(y_i, F(x_i)) = (y_i - F(x_i))^2 $

### Bước 1: Khởi tạo mô hình
- Khởi tạo giá trị dự đoán $F_0(x) = \arg \min_{F(x)} \sum_{i = 1}^N L(y_i, F(x)) =\frac{1}{N} \sum_{i = 1}^N y_i$

### Bước 2: Xây dựng M mô hình: 
- Tính sai số huấn luyện cho mỗi điểm dữ liệu: 
$$ r_i^{(m)} = - \frac{\partial L(y_i, F_{m-1}(x_i))}{\partial F_{m - 1}(x_i)} $$
- Huấn luyện các mô hình yếu cho tập các điểm dữ liệu $(x_i, r_i^{(m)})$: Gọi mô hình là $h_m(x)$
- Tính hệ số học tối ưu thỏa hàm loss đạt giá tri cực tiểu:
$$
\gamma_m = \arg\min_{\gamma} \sum_{i=1}^{n} L(y_i, F_{m-1}(x_i) + \gamma h_m(x_i))
$$

- Cập nhật giá trị: 
$$
F_m(x) = F_{m-1}(x) + \gamma_m h_m(x)
$$
 



In [3]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

In [4]:
class GradientBoost():
    def __init__(self, max_depth=8, min_samples_split=5, min_samples_leaf=5, max_features=3, lr=0.1, num_iter=50):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.lr = lr
        self.num_iter = num_iter
        self.y_mean = 0
        self.list_tree = []

    def __calculate_loss(self,y, y_pred):
        loss = (1/len(y)) * 0.5 * np.sum(np.square(y-y_pred))
        return loss

    def __take_gradient(self, y, y_pred):
        # grad = -(y-y_pred) # cho dự đoán = pred - alpha * residual
        grad = (y - y_pred)  # cho dự đoán = pred + alpha * residual
        return grad

    def __create_base_model(self, X, y):
        base = DecisionTreeRegressor(max_depth=self.max_depth,
                                    min_samples_split=self.min_samples_split,
                                    min_samples_leaf=self.min_samples_leaf,
                                    max_features=self.max_features)
        base.fit(X,y)
        return base

    def predict(self,X):
        pred_0 = np.array([self.y_mean] * len(X))
        pred = pred_0.reshape(len(pred_0),1)

        for i in range(len(self.list_tree)):
            temp = (self.list_tree[i].predict(X)).reshape(len(X),1)
            # pred -= self.lr * temp #cho dự đoán = pred - alpha * residual
            pred += self.lr * temp #cho dự đoán = pred + alpha * residual

        return pred

    def fit(self, X, y):
        losses = []
        self.y_mean = np.mean(y)
        pred_0 = np.array([np.mean(y)] * len(y))
        pred = pred_0.reshape(len(pred_0),1)

        for epoch in range(self.num_iter):
            loss = self.__calculate_loss(y, pred)
            # print("loss: epoch", epoch, "=", loss)
            losses.append(loss)
            grads = self.__take_gradient(y, pred)
            
            base = self.__create_base_model(X, grads)
            h_m = (base.predict(X)).reshape(len(X),1)
            
            pred += self.lr * h_m
            self.list_tree.append(base)


In [12]:

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

#READ DATA
data = pd.read_csv("advertising.csv")
data.fillna(0,inplace=True)

data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


In [None]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=100)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
y_train = np.array(y_train).reshape(X_train.shape[0],1)
y_test = np.array(y_test).reshape(X_test.shape[0],1)

G = GradientBoost()
G.fit(X_train,y_train)

In [13]:
pred = G.predict(X_test)
print(mean_squared_error(pred, y_test))
r2_score(pred, y_test)

3.3110071248738846


0.8935842037894236

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
GB = GradientBoostingRegressor(max_depth=8, min_samples_split=5, min_samples_leaf=5, max_features=3, learning_rate=0.1, n_estimators=50)
GB.fit(X_train, y_train)
pred = GB.predict(X_test)
print(mean_squared_error(pred, y_test))
r2_score(pred, y_test)

3.310103704436709


  y = column_or_1d(y, warn=True)


0.8936078533697203