# Table of Contents
1. [Imports and config](#Imports-and-Config)
2. [Util functions](#Util-functions)
3. [Data prep](#Data-prep)
4. [GD with stumps](#Gradient-boosting-with-stumps)
5. [Results](#Results)

**Note:**
References may not work in Google Colab

# Imports and Config

In [1]:
import os
import kagglehub
from tqdm import tqdm

import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, \
    mean_absolute_error, mean_squared_error

from scipy.optimize import minimize_scalar

In [2]:
N_ESTIMATORS = 1000
MAX_DEPTH = 1
LR = 0.1

# Util functions

In [3]:
def eval_regression(y_true, y_pred, dataset=None):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    res = {"r2": r2, "mae": mae, "mse": mse}

    if dataset:
        res["dataset"] = dataset

    return res

def estimate_generalization(eval_train, eval_test):
    eval_all = pd.DataFrame([eval_train, eval_test]).set_index("dataset").T
    eval_all["% worse"] = abs(eval_all["train"] - eval_all["test"]) / eval_all["train"] * 100

    eval_all.rename({"dataset":"metric"}, inplace=True)
    return eval_all

# Data prep

In [4]:
path = kagglehub.dataset_download("kumarajarshi/life-expectancy-who")

df = pd.read_csv(os.path.join(path, "Life Expectancy Data.csv"))

Downloading from https://www.kaggle.com/api/v1/datasets/download/kumarajarshi/life-expectancy-who?dataset_version_number=1...


100%|██████████| 119k/119k [00:00<00:00, 38.3MB/s]

Extracting files...





In [5]:
COLS_TO_KEEP = ["Year", "Life expectancy ", "Adult Mortality", " BMI "]
df = df[COLS_TO_KEEP]

df.rename(columns={"Life expectancy ": "Life_expectancy",
                     " BMI ": "BMI"}, inplace=True)
df.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"Life expectancy ": "Life_expectancy",
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


## Train test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["Life_expectancy"]),
    df["Life_expectancy"],
    test_size=0.2,
    random_state=509
)

# Gradient boosting with stumps

In [7]:
class GradientBoostingWithStumps:
    def __init__(self, max_depth=1, n_estimators=100,
                 learning_rate=0.1, line_search=False,
                 random_state=509):
        self.max_depth = max_depth
        self.n_estimators = n_estimators

        self.learning_rate = learning_rate
        self.line_search = line_search

        self.random_state = random_state

        self.models = []
        self.results = []

        if self.line_search:
            self.alphas = []

    def get_best_alpha(self, f_m_minus_1, residual):
        def obj(lr):
            y_pred_new = f_m_minus_1 + lr * residual
            return mean_squared_error(self.y, y_pred_new)

        res = minimize_scalar(obj, bounds=(-5, 5), method="bounded")
        return res.x

    def fit(self, X, y):
        self.y = y

        y_pred = np.full(y.shape, y.mean())
        for i in tqdm(range(self.n_estimators)):
            residuals = y - y_pred

            tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                         random_state=self.random_state)
            tree.fit(X, residuals)

            residual_pred = tree.predict(X)

            if self.line_search:
                lr_to_use = self.get_best_alpha(y_pred, residual_pred)
                self.alphas.append(lr_to_use)
            else:
                lr_to_use = self.learning_rate

            y_pred += lr_to_use * residual_pred

            eval_res = eval_regression(y, y_pred)
            eval_res["iter"] = i
            self.results.append(eval_res)
            self.models.append(tree)

    def predict(self, X):
        y_pred = np.full(X.shape[0], self.y.mean())
        for i in range(self.n_estimators):
            if self.line_search:
                lr = self.alphas[i]
            else:
                lr = self.learning_rate

            y_pred += lr * self.models[i].predict(X)

        return y_pred

In [9]:
def do_all(use_line_search):
    gb = GradientBoostingWithStumps(max_depth=MAX_DEPTH,
                                    n_estimators=N_ESTIMATORS,
                                    learning_rate=LR,
                                    line_search=use_line_search)
    gb.fit(X_train, y_train)

    y_pred_train = gb.predict(X_train)
    y_pred_test = gb.predict(X_test)

    eval_train = eval_regression(y_train, y_pred_train, "train")
    eval_test = eval_regression(y_test, y_pred_test, "test")

    eval_all = estimate_generalization(eval_train, eval_test)


    return gb, eval_all

- ls = line_search

In [10]:
gb_ls, eval_all_ls = do_all(use_line_search=True)
gb_no_ls, eval_all_no_ls = do_all(use_line_search=False)

100%|██████████| 1000/1000 [00:09<00:00, 106.54it/s]
100%|██████████| 1000/1000 [00:05<00:00, 172.84it/s]


# Results

In [None]:
merged_eval_all = pd.concat([eval_all_ls, eval_all_no_ls],
                            keys=['Line Search', 'No Line Search'],
                            names=['Method', 'Metric'])


# barplot of the % worse
fig = px.bar(merged_eval_all.reset_index(), x='Metric', y='% worse',
             color='Method', barmode='group',
             title='% worsened with and without line search')


fig.show()
# save image
# fig.write_image("line_search_vs_no_line_search.png")


In [11]:
gb_no_ls_res = pd.DataFrame(gb_no_ls.results)
gb_ls_res = pd.DataFrame(gb_ls.results)

# merge the results
gb_res = pd.concat([gb_no_ls_res, gb_ls_res], keys=['No Line Search', 'Line Search'], names=['Method'])
gb_res = gb_res.reset_index().drop("level_1", axis=1)


In [12]:
gb_res_melted = gb_res.melt(id_vars=['Method', 'iter'],
                            value_vars=['r2', 'mae', 'mse'],
                            var_name='metric', value_name='value')

# plot the metrics for each method
fig = px.line(gb_res_melted, x='iter', y='value', color='Method',
              facet_col='metric', facet_col_spacing=0.1)

# make each facet have its own y-axis
for axis in fig.layout:
    if axis.startswith('yaxis'):
        fig.layout[axis].update(matches=None)

fig.update_layout(title_text="Metrics over iterations with and without line search")
fig.show()