# Ensemble Learning

## 1. Gradient Boosting Tree

### 1.1. GBT Regressor

In [1]:
# # Load regression dataset https://archive.ics.uci.edu/dataset/186/wine+quality
# from ucimlrepo import fetch_ucirepo
# dataset = fetch_ucirepo(id=186)
# X = dataset.data.features
# y = dataset.data.targets
# y = y.values.ravel() # flatten to 1D array
# I don't know why, but fetch_ucirepo doesn't work over my internet connection :( so the CSV is read from disk below instead.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
import pandas as pd

# Load the red-wine quality table (the CSV is semicolon-separated) and
# display it as the cell output.
df = pd.read_csv(
    "../data/wine+quality/winequality-red.csv",
    sep=";",
    encoding="utf-8",
    index_col=False,
)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [4]:
# TODO: Split into train and test sets

In [5]:
# Target is the "quality" column; every remaining column is a feature.
X, y = df.drop(columns="quality"), df["quality"]

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# TODO: Create the GBT Regressor class

In [8]:
class MyGBTRegressor:
    """Minimal gradient-boosted regression trees for squared-error loss.

    The model starts from the mean of the training targets and then adds
    ``n_estimators`` shallow regression trees, each fitted to the residuals
    of the current ensemble and scaled by the learning rate.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of boosting rounds (one tree per round).
    learning_rate : float, default=0.1
        Shrinkage applied to every tree's contribution.
    max_depth : int, default=3
        Maximum depth passed to each ``DecisionTreeRegressor``.
    random_state : int or None, default=None
        Forwarded to each ``DecisionTreeRegressor``. ``None`` keeps the
        tree's default behaviour.
    """

    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3, random_state=None):
        self.n_estimators = n_estimators
        self.lr = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self.initial_prediction = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and numeric targets ``y``.

        Any previously fitted trees are discarded, so calling ``fit`` again
        retrains from scratch instead of stacking new trees on old ones.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        # Work on a plain float array so index alignment of a pandas Series
        # can never interfere with the element-wise residual computation.
        y = np.asarray(y, dtype=float).ravel()
        if y.size == 0:
            raise ValueError("Cannot fit MyGBTRegressor on an empty target array.")

        # Reset state left over from an earlier fit.
        self.trees = []
        self.initial_prediction = float(np.mean(y))
        current_predictions = np.full(len(y), self.initial_prediction)

        for _ in range(self.n_estimators):
            # Residuals of the current ensemble are what the next tree learns.
            residuals = y - current_predictions

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth, random_state=self.random_state
            )
            tree.fit(X, residuals)

            current_predictions += self.lr * tree.predict(X)
            self.trees.append(tree)

    def predict(self, X):
        """Return the ensemble prediction for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.initial_prediction is None:
            raise RuntimeError("MyGBTRegressor must be fitted before calling predict().")

        y_pred = np.full(len(X), self.initial_prediction, dtype=float)

        for tree in self.trees:
            y_pred += self.lr * tree.predict(X)
        return y_pred

In [9]:
# TODO: Train the GBT Regressor and evaluate its performance

In [10]:
# Custom regressor: 50 boosting rounds of depth-2 trees with shrinkage 0.1.
my_gbt = MyGBTRegressor(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=2,
)

In [11]:
# Train the custom regressor on the training split.
my_gbt.fit(X=X_train, y=y_train)

In [12]:
# Predictions of the custom regressor on the held-out rows.
y_pred_my = my_gbt.predict(X=X_test)

In [13]:
# Report test-set error and explained variance for the custom regressor.
print(
    "--- My GBT Regressor ---",
    f"MSE: {mean_squared_error(y_test, y_pred_my):.4f}",
    f"R2: {r2_score(y_test, y_pred_my):.4f}",
    sep="\n",
)

--- My GBT Regressor ---
MSE: 0.3896
R2: 0.4038


In [14]:
# TODO: Compare with sklearn's GradientBoostingRegressor

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

In [16]:
# Reference model: scikit-learn's gradient boosting regressor, configured
# with the same hyperparameters as the custom `my_gbt` model.
sk_gbt = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=2,
)

In [17]:
# Train the scikit-learn regressor on the same training split.
sk_gbt.fit(X=X_train, y=y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,50
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,2
,min_impurity_decrease,0.0


In [18]:
# Predictions of the scikit-learn regressor on the held-out rows.
y_pred_sk = sk_gbt.predict(X=X_test)

In [None]:
# Same metrics as for the custom model, for a side-by-side comparison.
print(
    "\n--- Sklearn GBT Regressor ---",
    f"MSE: {mean_squared_error(y_test, y_pred_sk):.4f}",
    f"R2: {r2_score(y_test, y_pred_sk):.4f}",
    sep="\n",
)


--- Sklearn GBT Regressor ---
MSE: 0.3896
R2: 0.4038


### 1.2. GBT Classifier

In [20]:
# Load classification dataset https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic
# dataset = fetch_ucirepo(id=17)
# X = dataset.data.features
# y = dataset.data.targets
# y = y.values.ravel() # flatten to 1D array
# y = (y == 'M').astype(int)  # Convert labels to 0 and 1

In [21]:
# Load the classification table and display it as the cell output.
# NOTE(review): the file is named "heart_disease_dataset.csv", but the columns
# shown in the output (radius1, texture1, ..., Diagnosis) look like the
# breast-cancer Wisconsin diagnostic data referenced above -- confirm the
# file name.
df = pd.read_csv(
    "../data/heart_disease_dataset.csv",
    encoding="utf-8",
    index_col=False,
)
df

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,M
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,M
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,M
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,M
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,M
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,M
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,M
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,M


In [22]:
# Drop rows with a missing label from the whole frame *before* separating
# features and target. Dropping them from `y` alone would leave `X` with more
# rows than `y` and break the row alignment needed by train_test_split.
labeled_df = df.dropna(subset=["Diagnosis"])
y = labeled_df["Diagnosis"]
X = labeled_df.drop("Diagnosis", axis=1)

In [23]:
# Display the label column as the cell output.
y

0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: Diagnosis, Length: 569, dtype: object

In [24]:
# Flatten the labels to a 1-D NumPy array. np.asarray accepts both a pandas
# Series and an ndarray, so the cell can be re-run safely (`y.values` raises
# AttributeError once `y` has already been converted to an ndarray).
y = np.asarray(y).ravel()

In [None]:
# Encode the diagnosis as integers: "M" -> 1, anything else -> 0.
# The labels are derived from the dataframe instead of from `y` itself so the
# cell is idempotent: re-running `y = (y == "M")` on the already-encoded
# integer array would silently turn every label into 0.
y = (df["Diagnosis"].dropna().to_numpy() == "M").astype(int)
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,

In [26]:
# TODO: Split into train and test sets

In [27]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [28]:
# TODO: Create the GBT Classifier class

In [29]:
class MyGBTClassifier:
    """Minimal gradient-boosted trees for binary classification (log-loss).

    The model keeps a running log-odds score per sample. It starts from the
    log-odds of the positive-class frequency and, in every round, fits a
    regression tree to the residuals ``y - sigmoid(score)`` and adds the
    tree's output scaled by the learning rate. Each tree's leaf value is used
    as-is; no additional per-leaf step-size optimisation is performed.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of boosting rounds (one tree per round).
    learning_rate : float, default=0.1
        Shrinkage applied to every tree's contribution.
    max_depth : int, default=3
        Maximum depth passed to each ``DecisionTreeRegressor``.
    random_state : int or None, default=None
        Forwarded to each ``DecisionTreeRegressor``. ``None`` keeps the
        tree's default behaviour.
    """

    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3, random_state=None):
        self.n_estimators = n_estimators
        self.lr = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self.initial_log_odds = None

    def _sigmoid(self, x):
        """Numerically stable logistic function ``1 / (1 + exp(-x))``."""
        x = np.asarray(x, dtype=float)
        # exp(-|x|) lies in (0, 1], so it can never overflow. For x >= 0 this
        # is exactly the textbook formula; for x < 0 the algebraically
        # equivalent form exp(x) / (1 + exp(x)) is used.
        z = np.exp(-np.abs(x))
        return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and labels ``y`` in ``[0, 1]``.

        Any previously fitted trees are discarded, so calling ``fit`` again
        retrains from scratch instead of stacking new trees on old ones.

        Raises
        ------
        ValueError
            If ``y`` is empty or contains values outside ``[0, 1]``.
        """
        y = np.asarray(y, dtype=float).ravel()
        if y.size == 0:
            raise ValueError("Cannot fit MyGBTClassifier on an empty label array.")
        if np.any((y < 0) | (y > 1)):
            raise ValueError("MyGBTClassifier expects labels encoded as 0 and 1.")

        # Reset state left over from an earlier fit.
        self.trees = []

        # Initial score is log(p / (1 - p)). Clip p away from 0 and 1 so that
        # a single-class training set yields a large finite score, not +/-inf.
        eps = np.finfo(float).eps
        p_mean = np.clip(np.mean(y), eps, 1.0 - eps)
        self.initial_log_odds = float(np.log(p_mean / (1.0 - p_mean)))

        current_log_odds = np.full(len(y), self.initial_log_odds)

        for _ in range(self.n_estimators):
            probs = self._sigmoid(current_log_odds)

            # Residual between the label and the current probability is the
            # regression target of the next tree.
            residuals = y - probs

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth, random_state=self.random_state
            )
            tree.fit(X, residuals)

            current_log_odds += self.lr * tree.predict(X)
            self.trees.append(tree)

    def predict_proba(self, X):
        """Return a 1-D array with the positive-class probability per row.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.initial_log_odds is None:
            raise RuntimeError(
                "MyGBTClassifier must be fitted before calling predict_proba()."
            )

        log_odds = np.full(len(X), self.initial_log_odds, dtype=float)
        for tree in self.trees:
            log_odds += self.lr * tree.predict(X)
        return self._sigmoid(log_odds)

    def predict(self, X):
        """Return hard 0/1 labels: 1 where the probability exceeds 0.5."""
        return (self.predict_proba(X) > 0.5).astype(int)

In [30]:
# TODO: Train the GBT Classifier and evaluate its performance

In [None]:
# Custom classifier: 50 boosting rounds of depth-3 trees with shrinkage 0.1.
my_gbc = MyGBTClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
)

In [32]:
# Train the custom classifier on the training split.
my_gbc.fit(X=X_train, y=y_train)

In [33]:
# Hard 0/1 predictions of the custom classifier on the held-out rows.
y_pred_my = my_gbc.predict(X=X_test)

In [34]:
from sklearn.metrics import accuracy_score, classification_report

In [35]:
# Report test-set accuracy for the custom classifier.
print(
    "--- My GBT Classifier ---",
    f"Accuracy: {accuracy_score(y_test, y_pred_my):.4f}",
    sep="\n",
)

--- My GBT Classifier ---
Accuracy: 0.9561


In [36]:
# TODO: Compare with sklearn's GradientBoostingClassifier

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
# Reference model: scikit-learn's gradient boosting classifier, configured
# with the same hyperparameters as the custom `my_gbc` model.
sk_gbc = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
)

In [39]:
# Train the scikit-learn classifier on the same training split.
sk_gbc.fit(X=X_train, y=y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,50
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [40]:
# Hard label predictions of the scikit-learn classifier on the held-out rows.
y_pred_sk = sk_gbc.predict(X=X_test)

In [41]:
# Same metric as for the custom model, for a side-by-side comparison.
print(
    "\n--- Sklearn GBT Classifier ---",
    f"Accuracy: {accuracy_score(y_test, y_pred_sk):.4f}",
    sep="\n",
)


--- Sklearn GBT Classifier ---
Accuracy: 0.9561


## 2. XGBoost

In [42]:
# TODO: Create XGBoost Regressor and Classifier models and compare their performance with your implementations

### 2.1. XGBoost Regressor

In [None]:
import xgboost as xgb

In [None]:
# XGBoost regressor using the same number of trees, depth and learning rate
# as the gradient-boosting regressors above; seeded for reproducibility.
xgb_reg = xgb.XGBRegressor(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=2,
    random_state=42,
)

In [None]:
xgb_reg.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
y_pred_xgb = xgb_reg.predict(X_test)

In [None]:
print("--- XGBoost Regressor ---")
print(f"MSE: {mean_squared_error(y_test, y_pred_xgb):.4f}")
print(f"R2: {r2_score(y_test, y_pred_xgb):.4f}")

--- XGBoost Regressor ---
MSE: 0.0373
R2: 0.8410


### 2.2. XGBoost Classifier

In [None]:
# XGBoost classifier using the same number of trees, depth and learning rate
# as the gradient-boosting classifiers above; seeded for reproducibility.
xgb_clf = xgb.XGBClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

In [None]:
# Train the XGBoost classifier on the classification training split.
xgb_clf.fit(X=X_train, y=y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# Hard label predictions of the XGBoost classifier on the held-out rows.
y_pred_xgb_clf = xgb_clf.predict(X=X_test)

In [51]:
# Report test-set accuracy for the XGBoost classifier.
print(
    "--- XGBoost Classifier ---",
    f"Accuracy: {accuracy_score(y_test, y_pred_xgb_clf):.4f}",
    sep="\n",
)

--- XGBoost Classifier ---
Accuracy: 0.9561
