# Seminar 9

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

## First ensemble

Data similar to the data from the lecture

In [None]:
# Fix the seed so the three clouds are identical on every run.
np.random.seed(0)

# The clouds share one covariance matrix and differ only in their x1 centre.
shared_cov = [[1.0, 0.0], [0.0, 0.5]]
cloud1, cloud2, cloud3 = (
    np.random.multivariate_normal(mean=[center, 0.0], cov=shared_cov, size=50)
    for center in (0.0, -15.0, 15.0)
)

# Stack the clouds row-wise: rows 0-49 are cloud1, 50-99 cloud2, 100-149 cloud3.
data = np.concatenate([cloud1, cloud2, cloud3])
data.shape

Target (class)

In [None]:
# Binary target: the first 50 rows (cloud1, centred at x1 = 0) are class 1,
# every other row is class -1.
y = np.full(len(data), -1, dtype='int32')
y[:50] = 1

In [None]:
y

Let's draw a scatterplot

In [None]:
# Scatter x1 against x2 and colour every point by its class label.
plt.scatter(*data.T, c=y)

# Fix the vertical (x2) range so the flat clouds are not stretched.
plt.ylim(-6, 6)

The data is not linearly separable: it is impossible to draw a single straight line that separates the yellow class from the purple one.

Let's try to build a model similar to the one from the lecture.

### Model 1

Let's try to separate right cloud from the middle and left one using linear model:

if $x_1w_1 + w_2x_2 + w_0 > 0$ - class 1

if $x_1w_1 + w_2x_2 + w_0 < 0$ - class -1

In [None]:
# Placeholder weights of Model 1 (bias w_0, coefficients w_1 and w_2):
# change them until the line separates the clouds as intended.
w_0, w_1, w_2 = 1.0, 1.0, 1.0

Depict the model:

In [None]:
# x1 grid spanning the data, with a small margin on the right.
x1_low, x1_high = data[:,0].min(), data[:,0].max()
x_1 = np.arange(x1_low, x1_high + 0.2, 0.05)

# Decision boundary w_0 + w_1*x1 + w_2*x2 = 0, solved for x2.
x_2_model_1 = -(w_0 + x_1 * w_1) / w_2
plt.plot(x_1, x_2_model_1, label='Model 1')

# Data on top of the boundary, with the same x2 range as the first scatter plot.
plt.scatter(*data.T, c=y)
plt.ylim([-6, 6])
plt.legend()

Let's check quality:

In [None]:
linear_combination1 = w_1 * data[:,0] + w_2 * data[:,1] + w_0
linear_combination1

In [None]:
prediction1 = np.sign(linear_combination1).astype('int')
prediction1

Let's check the quality (using accuracy)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y, prediction1)

The accuracy is not ideal. Let's check whether our model is better than a classifier that always predicts the most popular class.

In [None]:
#Looking at the classes in y and the number of samples
np.unique(y, return_counts=True)

In [None]:
# -1 is the most popular class, so the baseline always predicts -1;
# its accuracy is the bar our model has to beat.
baseline_prediction = np.full(len(y), -1.0)
accuracy_score(y, baseline_prediction)

The accuracy is the same. So, our model was not so great.

### Model 2

Now, let's conduct the same experiment with the left cloud (separate it from the rest).

In [None]:
# Placeholder weights of Model 2 (bias v_0, coefficients v_1 and v_2):
# change them until the line separates the left cloud from the rest.
v_0, v_1, v_2 = 1.0, 1.0, 1.0

In [None]:
# x1 grid spanning the data, with a small margin on the right.
x_1 = np.arange(data[:,0].min(), data[:,0].max()+0.2, 0.05)
# Decision boundary of Model 2: v_0 + v_1*x1 + v_2*x2 = 0, solved for x2.
x_2_model_2 = -(v_0 + x_1 * v_1) / v_2
# Legend label fixed: this line belongs to Model 2 (it was labelled 'Model 1').
plt.plot(x_1, x_2_model_2, label='Model 2')


plt.scatter(data[:,0], data[:,1],c=y)
plt.ylim([-6, 6])
plt.legend()

Let's use the model and measure accuracy:

In [None]:
# Model 2: the sign of the linear score is the predicted class (-1 / 1).
x1_values, x2_values = data[:,0], data[:,1]
linear_combination2 = v_1 * x1_values + v_2 * x2_values + v_0
prediction2 = np.sign(linear_combination2).astype('int')
prediction2

In [None]:
accuracy_score(y, prediction2)

We got same results

### Model 1 and Model 2

Now, let's understand where the models make mistakes and how to correct them.

In [None]:
# Tab-separated header; `sep` sets the separator between the printed values.
header = ('Target', 'Model 1', 'Model 2')
print(*header, sep='\t')

# One row for every fifth sample: true class next to both models' predictions.
for target, pred1, pred2 in zip(y[::5], prediction1[::5], prediction2[::5]):
  print(target, pred1, pred2, sep='\t')

We can conclude:



*   If both models predict 1, we should predict 1
*   If models contradict, we should predict -1
*   Models do not predict -1 together

How can we get the best possible prediction based on the prediction of both models? - For example, we can multiply them (1 * 1 = 1, -1 * 1 = -1)



Now, we can build an ensemble - we know how to merge models:

In [None]:
def ensemble_model_prediction(data, w=(1.0, 1.0, 1.0), v=(1.0, 1.0, 1.0)):
  """Predict classes with the two-model ensemble (product of the two signs).

  Parameters
  ----------
  data : numpy array of shape (n_samples, 2)
      Columns are x1 and x2.
  w : (w_0, w_1, w_2), optional
      Bias and coefficients of Model 1. The defaults are the placeholder
      weights; pass (or paste) the weights found in the experiments.
  v : (v_0, v_1, v_2), optional
      Bias and coefficients of Model 2, same layout as `w`.

  Returns
  -------
  numpy int array of shape (n_samples,)
      1 where both models agree, -1 where they disagree, and 0 for a point
      lying exactly on one of the two lines.
  """
  w_0, w_1, w_2 = w
  v_0, v_1, v_2 = v

  #Model 1
  linear_combination1 = w_1 * data[:,0] + w_2 * data[:,1] + w_0
  prediction1 = np.sign(linear_combination1).astype('int')

  #Model 2
  linear_combination2 = v_1 * data[:,0] + v_2 * data[:,1] + v_0
  prediction2 = np.sign(linear_combination2).astype('int')

  #ensemble prediction: 1 * 1 = 1, -1 * 1 = -1
  return prediction1 * prediction2

Let's test it

In [None]:
prediction = ensemble_model_prediction(data)
accuracy_score(y, prediction)

Now, the quality is ideal

We may want to depict the decision regions using the `plot_decision_regions` function, but we need to add some code, because the function is designed to work with models that return 0/1 as predictions.

In [None]:
from mlxtend.plotting import plot_decision_regions

class SimpleEnsemble:
  """Adapter that exposes the ensemble through a `predict` method returning 0/1."""

  def predict(self, X):
    # Shift the -1/1 ensemble output to 0/1: -1 -> 0, 1 -> 1.
    signed_prediction = ensemble_model_prediction(X)
    return (signed_prediction + 1) // 2

plot_decision_regions(data, y, SimpleEnsemble())

## Voting classifier

In the previous task we could multiply predictions, but this is not a common tactic (most likely, it will not work on another dataset).

What may work? - voting

Idea: build a lot of different classifiers (logistic regression, decision tree, ...) and choose the most popular answer among them.

Let's work with Iris again

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


# Keep the dataset under its own name instead of rebinding `data`, which
# already holds the point clouds used by the first-ensemble cells.
iris = load_iris()
X = pd.DataFrame(iris['data'], columns = iris['feature_names'])
# Binary task: class 1 against all other classes.
y = iris['target'].copy()
y[y!=1] = 0
# Optional: uncomment to keep only two features.
#X = X[['sepal length (cm)', 'sepal width (cm)']]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)

Let's build several classification models (see the [sklearn documentation](https://scikit-learn.org/1.5/supervised_learning.html) to find new classification models).

All models from sklearn have `fit` and `predict` methods, so it is easy to work with new models.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Baseline: a single KNN with a hand-picked number of neighbours.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(accuracy_score(y_test,  knn.predict(X_test)))

# Candidate numbers of neighbours, compared by 3-fold cross-validated accuracy.
params = {'n_neighbors' : [1, 3, 5, 10, 20, 25, 30, 50]}
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    cv=3,
    scoring='accuracy',
)
knn.fit(X_train, y_train)

In [None]:
knn.best_score_

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier


titles = ['Logistic Regression', 'Decision Tree', 'KNN']

# One estimator per title, in the same order.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(max_depth=4),
    KNeighborsClassifier(n_neighbors=5),
]

# Fit every model on the train split and report its accuracy on the test split.
for title, model in zip(titles, models):
  model.fit(X_train, y_train)
  test_accuracy = accuracy_score(y_test, model.predict(X_test))
  print(test_accuracy, title)

Now, the models can be used for voting

There are 2 types of voting:


*   Hard voting - predict the class with the majority of votes (if 3 classifiers predict class 1 and one predicts class 0, the result will be class 1)
*   Soft voting - the probabilities predicted by the classifiers are averaged (if the probabilities for class 1 were [0.8, 0.7, 0.9, 0.1] and for class 0 - [0.2, 0.3, 0.1, 0.9], the probability for class 1 will be 0.625 and for class 0 it will be 0.375)



Hard voting:

In [None]:
def hard_voting(predictions):
  """Return the majority-vote class for every sample.

  `predictions` is a sequence with one entry per sample; each entry holds the
  classes predicted for that sample by every model. On a tie the smallest
  class label wins, because `np.unique` returns the labels sorted and
  `argmax` picks the first maximum.
  """
  def most_voted(sample_votes):
    labels, counts = np.unique(sample_votes, return_counts=True)
    return labels[np.argmax(counts)]

  return np.array([most_voted(sample_votes) for sample_votes in predictions])

Example:

In [None]:
hard_voting([[1, 1, 2, 1, 2, 3],
             [2, 2, 2, 2, 2, 2],
             [2, 1, 1, 1, 1, 3]])

Let's apply to our data

In [None]:
# One column per model, one row per test sample.
predictions = np.column_stack([model.predict(X_test) for model in models])

predictions.shape

In [None]:
predictions_ensemble = hard_voting(predictions)

In [None]:
accuracy_score(y_test, predictions_ensemble)

Soft voting:

Example:

In [None]:
X_test.iloc[0:1]

In [None]:
models[0].predict_proba(X_test.iloc[0:1])

In [None]:
models[1].predict_proba(X_test.iloc[0:1])

In [None]:
models[2].predict_proba(X_test.iloc[0:1])

So, lets compute mean probability of class 0:

In [None]:
# Mean class-0 probability of the first test sample over the fitted models.
# It is read from the models themselves (instead of numbers pasted from an
# earlier run), so it always matches the outputs of the three cells above.
np.mean([model.predict_proba(X_test.iloc[0:1])[0, 0] for model in models])

And for class 1:


In [None]:
# Mean class-1 probability of the first test sample over the fitted models,
# read from the models themselves rather than pasted from an earlier run.
np.mean([model.predict_proba(X_test.iloc[0:1])[0, 1] for model in models])

So, the mean probability of class 0 is $\approx 0.9$ and the mean probability of class 1 is $\approx 0.1$, so the sample will be classified as class 0.

In [None]:
def soft_voting(samples, models):
  """Classify samples by averaging the class probabilities of several models.

  Parameters
  ----------
  samples : pandas DataFrame or numpy array of shape (n_samples, n_features)
      Samples to classify.
  models : non-empty list of fitted models with a `predict_proba` method
      All models must return their probability columns in the same class order.

  Returns
  -------
  numpy array of shape (n_samples,)
      For every sample, the class with the largest mean probability. When the
      first model has a `classes_` attribute the labels are taken from it;
      otherwise the index of the probability column is returned.

  Raises
  ------
  ValueError
      If `models` is empty.
  """
  if len(models) == 0:
    raise ValueError("soft_voting needs at least one fitted model")
  if samples.shape[0] == 0:
    return np.array([])

  #One predict_proba call per model for all samples at once
  #(instead of one call per sample per model)
  proba_sum = 0
  for model in models:
    proba_sum = proba_sum + np.asarray(model.predict_proba(samples))

  #Mean probability of every class for every sample
  proba_mean = proba_sum / len(models)

  #Choose the class with the largest mean probability in every row
  best_column = np.argmax(proba_mean, axis=1)
  classes = getattr(models[0], "classes_", None)
  if classes is None:
    return best_column
  return np.asarray(classes)[best_column]

In [None]:
soft_voting(X_test.iloc[0:1], models)

Let's apply to our data

In [None]:
predictions_ensemble = soft_voting(X_test, models)
predictions_ensemble

In [None]:
accuracy_score(y_test, predictions_ensemble)

We implemented the voting system ourselves, but we also can use the `VotingClassifier` from sklearn:

In [None]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs in the format VotingClassifier expects.
models_names = list(zip(
    ['Logistic Regression', 'Decision Tree', 'KNN'],
    [
        LogisticRegression(),
        DecisionTreeClassifier(max_depth=4),
        KNeighborsClassifier(n_neighbors=5),
    ],
))

# Hard voting: the class predicted by most of the models wins.
voting = VotingClassifier(estimators=models_names, voting='hard')
voting.fit(X_train, y_train)

accuracy_score(y_test, voting.predict(X_test))

In [None]:
# Soft voting: the predicted class probabilities are used instead of the votes.
voting = VotingClassifier(estimators=models_names, voting='soft')
voting.fit(X_train, y_train)

soft_test_prediction = voting.predict(X_test)
accuracy_score(y_test, soft_test_prediction)

## Bagging

Idea: generate new samples from our data using bootstrap (the samples are close enough that the models will extract similar real dependencies, but different enough for the overfitted models to differ).

Let's work with some bigger dataset for regression

In [None]:
from sklearn.datasets import fetch_california_housing

# Features go into a DataFrame with named columns; the target stays an array.
california = fetch_california_housing()
california_X = pd.DataFrame(california.data, columns=california.feature_names)
california_Y = california.target
print(f"X shape: {california_X.shape}, Y shape: {california_Y.shape}")

# 70/30 train/test split with a fixed seed.
split_options = dict(test_size=0.3, random_state=123, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(
    california_X, california_Y, **split_options
)

Let's fit a decision tree

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# A tree without any constraints on depth or leaf size.
tree = DecisionTreeRegressor().fit(X_train, y_train)

train_mse = mean_squared_error(y_train, tree.predict(X_train))
test_mse = mean_squared_error(y_test, tree.predict(X_test))
print(f"MSE on train set: {train_mse:.2f}")
print(f"MSE on test set: {test_mse:.2f}")

MSE on train data is 0.0, but on test data it is larger. That looks like overfitting. (A decision tree without constraints can reach ideal quality on non-contradicting data.)

Let's try to use bagging. First of all, let's implement bootstrap.

### Bootstrap

Idea: from our data choose random elements and get new sample with the same number of samples

In [None]:
example = np.array([1,2,3,4,5])

Choosing random indices. We need to choose from 0, 1, ..., `len(example)-1`, and we allow the method to choose the same element several times, so `replace=True`.

In [None]:
#generate indices of elems
idx = np.random.choice(len(example), replace=True, size=len(example))
idx

Now let's choose the elements of the initial array

In [None]:
example[idx]

In [None]:
def bootstrap(samples):
  """Return one bootstrap resample of `samples`.

  Draws `len(samples)` indices with replacement from numpy's global random
  generator and returns a new array holding the selected elements (rows), so
  the result has the same length as the input.
  """
  n_items = len(samples)
  picked = np.random.choice(n_items, replace=True, size=n_items)
  resampled = samples[picked]
  return resampled.copy()

In [None]:
bootstrap(np.array([1,2,3,4,5]))

In [None]:
def generate_bootstrap(samples, N):
  """Draw `N` independent bootstrap resamples of `samples`.

  Returns a numpy array whose first axis indexes the resamples; each resample
  is produced by `bootstrap` and has the same shape as `samples`.
  """
  return np.array([bootstrap(samples) for _ in range(N)])

In [None]:
generate_bootstrap(np.array([1,2,3,4,5]), 3)

Let's generate several bootstrap samples and investigate features.

In [None]:
N = 8

# X and y go into one table so that every resample picks the same rows of both.
X_y_train = X_train.assign(y=y_train)

# Last column of every resample is the target, the others are the features.
bootstrap_samples = generate_bootstrap(X_y_train.values, N)
bootstrap_samples.shape

In [None]:
# Two rows of panels, one panel per bootstrap sample. The column count is
# derived from the number of samples (rounded up for an odd count) instead of
# the hard-coded 4, and it no longer depends on the global `N`.
n_panels = len(bootstrap_samples)
n_cols = (n_panels + 1) // 2
fig, axs = plt.subplots(ncols=n_cols, nrows=2, figsize=(20,10), sharex=True, sharey=True)

feature = 2
# axs.flat walks the grid row by row, i.e. in the same order as the samples.
for ax, sample in zip(axs.flat, bootstrap_samples):
  ax.scatter(sample[:,feature], sample[:,-1])

fig.suptitle('Feature ' + X_y_train.columns[feature] + ' and target', fontsize=16)
plt.show();

### Use bagging

Now, let's fit separate decision tree with every sample we have

In [None]:
# One unconstrained tree per bootstrap sample; in every sample the last column
# is the target and the remaining columns are the features.
models = [
    DecisionTreeRegressor().fit(sample[:,:-1], sample[:,-1])
    for sample in bootstrap_samples
]

How does every tree perform?

In [None]:
# The trees were fitted on plain arrays, so they are evaluated on arrays too.
X_test_values = X_test.values
for tree in models:
  tree_mse = mean_squared_error(y_test, tree.predict(X_test_values))
  print(f"MSE on test set: {tree_mse:.2f}")

Now, let's take mean prediction for every sample

In [None]:
# One row per tree, one column per test sample.
preds_trees = np.array([tree.predict(X_test.values) for tree in models])

preds_trees.shape

In [None]:
print(f"MSE on test set: {mean_squared_error(y_test, np.mean(preds_trees, axis=0)):.2f}")

The MSE on the test set decreased! Bagging seems to be working.

Let's now use the `BaggingRegressor` from sklearn. It will perform bootstrap, fit several models and take mean as prediction

In [None]:
from sklearn.ensemble import BaggingRegressor

base_tree = DecisionTreeRegressor()

# Bagging with 8 trees: BaggingRegressor does the bootstrap, fits the trees and
# averages their predictions.
bag = BaggingRegressor(base_tree,
                       n_estimators=8
                       )
bag.fit(X_train, y_train)


print(f"MSE on train set: {mean_squared_error(y_train, bag.predict(X_train)):.2f}")
# Label fixed: this line reports the error on the test split, not on the train split.
print(f"MSE on test set: {mean_squared_error(y_test, bag.predict(X_test)):.2f}")

MSE on train set increases, because not every tree is fitted to predict every sample, so particular overfitted trees may be wrong on train data. MSE on test set is better.

### How many models should we take?

Let's fit bagging several times with different number of models

In [None]:
n_trees = range(1, 100, 4)
train_loss, test_loss = [], []

# Refit bagging for every ensemble size and record the train and test MSE.
for i in n_trees:
    print(i)
    # n_jobs=4 speeds up training a bit.
    bagging = BaggingRegressor(base_tree, n_estimators=i, n_jobs=4)
    bagging.fit(X_train, y_train)
    train_prediction = bagging.predict(X_train)
    test_prediction = bagging.predict(X_test)
    train_loss.append(mean_squared_error(y_train, train_prediction))
    test_loss.append(mean_squared_error(y_test, test_prediction))

plt.figure(figsize=(10, 7))
plt.title("Dependency of MSE and number of models for Bagging")
plt.grid()
for curve, curve_label in ((train_loss, "MSE_train"), (test_loss, "MSE_test")):
    plt.plot(n_trees, curve, label=curve_label)
plt.ylabel("MSE")
plt.xlabel("Number of models")
plt.legend();

In [None]:
min(test_loss)

When we had only 1 tree, the MSE was big; as we started adding trees, the error began decreasing. However, after ~20 trees it stopped decreasing and became almost constant.

Why? In the beginning every new tree brings useful information and new predictions, but once we have added a lot of trees, the next one is not as useful: it mostly repeats the answers of the previous ones, so its impact is smaller.

## Random Forest

Bagging can be used with any base model (linear regression, decision tree).

Using trees is the most popular variant (they can learn non-linear dependencies, but easily overfit). It is possible to apply an additional modification to make the trees more diverse: limit the number of features that can be used while splitting nodes in the trees. Trees will still be able to discover complex dependencies, but they will differ significantly from each other.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 8 trees, the same ensemble size as in the bagging example.
rf = RandomForestRegressor(n_estimators=8, n_jobs=4)
rf.fit(X_train, y_train)

print(f"MSE on train set: {mean_squared_error(y_train, rf.predict(X_train)):.2f}")
# Label fixed: this line reports the error on the test split, not on the train split.
print(f"MSE on test set: {mean_squared_error(y_test, rf.predict(X_test)):.2f}")

The results are similar to the ones of bagging. Let's investigate number of trees:

In [None]:
n_trees = range(1, 100, 4)
train_loss, test_loss = [], []

# Refit the forest for every number of trees and record the train and test MSE.
for i in n_trees:
    print(i)
    rf = RandomForestRegressor(n_estimators=i, n_jobs=4)
    rf.fit(X_train, y_train)
    train_prediction = rf.predict(X_train)
    test_prediction = rf.predict(X_test)
    train_loss.append(mean_squared_error(y_train, train_prediction))
    test_loss.append(mean_squared_error(y_test, test_prediction))

plt.figure(figsize=(10, 7))
plt.title("Dependency of MSE and number of trees for Random Forest")
plt.grid()
for curve, curve_label in ((train_loss, "MSE_train"), (test_loss, "MSE_test")):
    plt.plot(n_trees, curve, label=curve_label)
plt.ylabel("MSE")
plt.xlabel("Number of trees")
plt.legend();

In [None]:
min(test_loss)

## Boosting

Idea: we fitted one model and it made some mistakes, so let's teach the next model to fix them.

Example: the item cost 15 rub., and model 1 predicted that the item will cost 25 rub. So model 2 should try to predict 15 rub. - 25 rub. = -10 rub.

Predictions of model 2 cannot be used without predictions of model 1 (item cannot cost -10rub.), so, we will need to sum the predictions to get the final prediction: 25rub. + (-10rub.) = 15rub.


$$a(x) = \sum_{n=1}^N b_n(x)$$

$a(x)$ - the prediction of the ensemble

$b_n(x)$ - the prediction of the $n$-th base model

Model 1 and 2 (base models) are usually trees, but significantly underfitted (not deep, not many leaves, a lot of samples in each leaf). That prevents the overfitting of boosting (if an overfitted model will be fixing the mistakes of overfitted model, the ensemble will be significantly overfitted)

### First boosting

Let's investigate this idea using synthetic data:

In [None]:
# Synthetic regression data: sin(x) on [0, 1] plus Gaussian noise (std 0.1).
np.random.seed(123)
N = 100
X = np.linspace(0, 1, N).reshape(-1, 1)
noise = np.random.normal(0, 0.1, size=N)
y = np.sin(X).ravel() + noise


#Function for visualization
def plot_sample_model(
    X, y, plot_predictions=False, y_pred=None, y_pred_label=None
):
    """Scatter the sample and, optionally, draw a model's predictions over it.

    With `plot_predictions=True` the predictions `y_pred` are drawn as a red
    line labelled `y_pred_label`, and the figure title shows their MSE
    against `y`.
    """
    plt.figure(figsize=(10, 7))
    plt.scatter(X, y, alpha=0.7, label="Train")

    if plot_predictions:
        plt.plot(X, y_pred, c="r", label=y_pred_label)
        mse_value = mean_squared_error(y, y_pred)
        plt.title("MSE: " + str(mse_value))

    plt.xlabel("X")
    plt.ylabel("y")
    plt.legend()


plot_sample_model(X, y, plot_predictions=False)

*Step 0.* At the beginning, we have no base models; the ensemble is empty.

In [None]:
#The prediction of the ensemble
a = 0

*Step 1* Let's fit first model $b_1(x)$. Let's use decision trees of depth 1 (decision stump)

In [None]:
from sklearn.tree import DecisionTreeRegressor


# b_1: a decision stump fitted directly on the target.
b = DecisionTreeRegressor(max_depth=1)
b.fit(X, y)

# With a single base model the ensemble prediction is just b_1(x).
a = b.predict(X)

plot_sample_model(X, y, plot_predictions=True, y_pred=a, y_pred_label="a = b_1")

*Step 2* Let's compute the mistakes (residuals) of the ensemble

In [None]:
s = y - a

*Step 3* Fit next model ($b_2(x)$). This model tries to predict the residuals. The ensemble will be $a(x) = b_1(x) + b_2(x)$

In [None]:
# b_2: a decision stump fitted on the residuals of the current ensemble.
b = DecisionTreeRegressor(max_depth=1)
b.fit(X, s)

#Add the prediction of the model to the ensemble
a = a + b.predict(X)

plot_sample_model(X, y, plot_predictions=True, y_pred=a, y_pred_label="a = b_1 + b_2")

The MSE is better and the model is better

*Step 4 - ...* Repeat Steps 2,3 (compute residuals, fit next model, add the new predictions)

In [None]:
# Residuals of the current two-model ensemble.
s = y - a

# b_3: a decision stump fitted on those residuals.
b = DecisionTreeRegressor(max_depth=1)
b.fit(X, s)

#Add the prediction of the model to the ensemble
a = a + b.predict(X)

plot_sample_model(X, y, plot_predictions=True, y_pred=a, y_pred_label="a = b_1 + b_2 + b_3")

The more steps we take, the more complex the model becomes and the more precise its predictions on the train data are.

And what will happen on test data? - Most likely, at first the MSE will decrease (we will be fixing serious mistakes), but after a lot of models we will start overfitting (fixing minor mistakes on the train data) and the error on the test data will start increasing.

So, the boosting may overfit if we have a lot of models (random forest did not overfit)

### Boosting for various models

Why do we use only trees? Why not linear regression?

In [None]:
#This function will help us visualize the process of fitting boosting
#(we do not want to copy-paste code again)
def plot_boosting_results(b, n_estimators, X, y):
    """Run `n_estimators` boosting steps with base model `b` and plot each step.

    Every step refits `b` on the current residuals, adds its prediction to the
    ensemble and fills one row of three panels: the ensemble prediction over
    the data, the base model over the residuals it was fitted on, and the
    residuals left after the step.

    Parameters
    ----------
    b : model with `fit(X, y)` and `predict(X)`
        Base model; the same object is refitted on every step.
    n_estimators : int
        Number of boosting steps (rows of the figure).
    X, y :
        Training inputs and targets; `y` is also used in `y - a`, so it must
        support array arithmetic.
    """
    # squeeze=False keeps `ax` two-dimensional even when n_estimators == 1;
    # without it plt.subplots returns a 1-D array and `ax[i, 0]` fails.
    fig, ax = plt.subplots(
        n_estimators, 3, figsize=(20, n_estimators * 5), squeeze=False
    )

    #Residuals (the first base model is fitted on the target itself)
    resid = []
    resid.append(y)

    #Model predictions
    y_pred = []

    for i in range(n_estimators):
        #Training base model
        b.fit(X, resid[-1])

        #Predicting with base models
        y_pred.append(b.predict(X))

        #Predicting using ensemble: the sum of all base predictions so far
        a = np.sum(y_pred, axis=0)

        #Compute residual
        resid.append(y - a)

        #Prediction of the ensemble (a)
        ax[i, 0].scatter(X, y, label="Train", alpha=0.7)
        ax[i, 0].plot(X, a, c="red", lw=3, label="Number of models = " + str(i + 1))
        ax[i, 0].set_title("MSE: " + str(mean_squared_error(y, a)))

        #Prediction of base model (b); resid[-2] is what it was fitted on
        ax[i, 1].scatter(X, resid[-2], label="Train", alpha=0.7)
        ax[i, 1].plot(X, y_pred[-1], c="red", lw=3)
        ax[i, 1].set_title("Prediction of b " + str(i + 1))

        #Residuals left after this step
        ax[i, 2].scatter(X, resid[-1], alpha=0.7, c="orange")
        ax[i, 2].set_title("Residuals")

        ax[i, 0].legend()

If we want to repeat the first experiment, we may use the function:

In [None]:
plot_boosting_results(DecisionTreeRegressor(max_depth=1), n_estimators=3, X=X, y=y)

Let's try using linear regression as base model

In [None]:
from sklearn.linear_model import LinearRegression

plot_boosting_results(LinearRegression(), n_estimators=3, X=X, y=y)

Strange... We did not get better model by adding second, third base models. They seem to be constant.


Why? - Because boosting sums the base models: a sum of trees is something complex (it is hard to say how the trees will merge), but a sum of linear models is still a linear model (just with different coefficients).

The process of building a linear model guarantees that the first base model is already the best possible one (in terms of MSE). The mistakes of the first base model cannot be fixed by another linear model.


All in all, boosting with trees is the most well-known.

### Boosting from sklearn

AdaBoost discussed during lecture:

In [None]:
from sklearn.ensemble import AdaBoostRegressor


# AdaBoost with three base models, fitted on the synthetic sample.
ada = AdaBoostRegressor(n_estimators=3).fit(X, y)

ada_prediction = ada.predict(X)
plot_sample_model(X, y, plot_predictions=True, y_pred=ada_prediction, y_pred_label="AdaBoost")

Gradient boosting (not discussed, more complex approach, well-known)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor


# Gradient boosting with only three base models.
gb = GradientBoostingRegressor(n_estimators=3).fit(X, y)

gb_prediction = gb.predict(X)
plot_sample_model(X, y, plot_predictions=True, y_pred=gb_prediction, y_pred_label="Gradient boosting")

It seems to give strange bad results... This approach is powerful, but it requires a lot of base models and hyperparameter tuning

In [None]:
# The same model with more base models (20 instead of 3).
gb = GradientBoostingRegressor(n_estimators=20).fit(X, y)

gb_prediction = gb.predict(X)
plot_sample_model(X, y, plot_predictions=True, y_pred=gb_prediction, y_pred_label="Gradient boosting")

### Overfitting

In [None]:
np.random.seed(123)
X = np.linspace(0, 1, 300).reshape(-1, 1)

# Step function (0 up to x = 0.5, 1 above it) plus Gaussian noise with std 0.1.
step = X > 0.5
noise = np.random.normal(size=X.shape) * 0.1
y = (step + noise)[:, 0]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

plt.figure(figsize=(10, 7))
for split_X, split_y, split_label in ((X_train, y_train, "Train"), (X_test, y_test, "Test")):
    plt.scatter(split_X, split_y, label=split_label)
plt.xlabel("X")
plt.ylabel("Y")
plt.legend();

In [None]:
trees = [1, 2, 5, 20, 100, 500, 1000]


fig, ax = plt.subplots(len(trees), 2, figsize=(30, 40))

loss_rf_train = []
loss_gb_train = []
loss_rf_test = []
loss_gb_test = []

# The sorted test inputs are the same for every panel, so sort them once.
X_test_sorted = np.sort(X_test, axis=0)


def _fit_and_plot_panel(model, panel, title_prefix, n_trees):
    """Fit `model` on the train split, draw its curve on `panel`, return (train MSE, test MSE)."""
    model.fit(X_train, y_train)
    train_mse = mean_squared_error(y_train, model.predict(X_train))
    test_mse = mean_squared_error(y_test, model.predict(X_test))

    panel.scatter(X_train, y_train, label="Train")
    panel.scatter(X_test, y_test, label="Test")
    # Predictions are drawn over the sorted inputs so the line goes left to right.
    panel.plot(
        X_test_sorted,
        model.predict(X_test_sorted),
        lw=3,
        c="red",
        label="Prediction on test data",
    )
    panel.set_xlabel("X")
    panel.set_ylabel("Y")
    panel.set_title(
        title_prefix
        + ", n trees = "
        + str(n_trees)
        + ", MSE = "
        + str(test_mse)
    )
    panel.legend()
    return train_mse, test_mse


for i, ts in enumerate(trees):
    rf = RandomForestRegressor(n_estimators=ts, max_depth=3, random_state=123)

    # AdaBoost is the boosting model compared here (kept under the name `gb`);
    # GradientBoostingRegressor(max_depth=3, n_estimators=ts) can be tried instead.
    gb = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=3),
        n_estimators=ts,
        learning_rate=0.1,
        random_state=123,
    )

    # Left column: random forest.
    rf_train_mse, rf_test_mse = _fit_and_plot_panel(rf, ax[i, 0], "Random Forest", ts)
    loss_rf_train.append(rf_train_mse)
    loss_rf_test.append(rf_test_mse)

    # Right column: boosting.
    gb_train_mse, gb_test_mse = _fit_and_plot_panel(gb, ax[i, 1], "Boosting", ts)
    loss_gb_train.append(gb_train_mse)
    loss_gb_test.append(gb_test_mse)

AdaBoost does not overfit, Gradient boosting may overfit.