From 824705c8f90b80c8f61ebfa9d6f0a7dbe730a66e Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 14 Mar 2022 10:37:30 +0100 Subject: [PATCH 01/15] Fix learning curve to show overlapped error bars --- python_scripts/cross_validation_sol_01.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index 99c84784d..a357c768a 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -121,9 +121,9 @@ import matplotlib.pyplot as plt plt.errorbar(gammas, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), label='Training score') + yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') plt.errorbar(gammas, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), label='Testing score') + yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') plt.legend() plt.xscale("log") @@ -156,9 +156,9 @@ # %% tags=["solution"] plt.errorbar(train_size, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), label='Training score') + yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') plt.errorbar(train_size, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), label='Testing score') + yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") plt.xlabel("Number of samples in the training set") From e28e879e4640e19dfe87c305464b96cbbce93c0e Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 14 Mar 2022 10:41:32 +0100 Subject: [PATCH 02/15] Formatting --- python_scripts/cross_validation_sol_01.py | 94 ++++++++++++++--------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index a357c768a..c969c64b4 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -10,12 +10,11 @@ # # The aim of this exercise is to make the following experiments: # -# * train and test a support vector machine classifier through -# cross-validation; +# * train and test a support vector machine classifier through cross-validation; # * study the effect of the parameter gamma of this classifier using a # validation curve; -# * use a learning curve to determine the usefulness of adding new -# samples in the dataset when building a classifier. +# * use a learning curve to determine the usefulness of adding new samples in +# the dataset when building a classifier. # # To make these experiments we will first load the blood transfusion dataset. @@ -34,14 +33,14 @@ # %% [markdown] # We will use a support vector machine classifier (SVM). In its most simple -# form, a SVM classifier is a linear classifier behaving similarly to a -# logistic regression. Indeed, the optimization used to find the optimal -# weights of the linear model are different but we don't need to know these -# details for the exercise. +# form, a SVM classifier is a linear classifier behaving similarly to a logistic +# regression. Indeed, the optimization used to find the optimal weights of the +# linear model are different but we don't need to know these details for the +# exercise. # -# Also, this classifier can become more flexible/expressive by using a -# so-called kernel that makes the model become non-linear. Again, no requirement -# regarding the mathematics is required to accomplish this exercise. 
+# Also, this classifier can become more flexible/expressive by using a so-called +# kernel that makes the model become non-linear. Again, no requirement regarding +# the mathematics is required to accomplish this exercise. # # We will use an RBF kernel where a parameter `gamma` allows to tune the # flexibility of the model. @@ -63,12 +62,13 @@ model = make_pipeline(StandardScaler(), SVC()) # %% [markdown] -# Evaluate the generalization performance of your model by cross-validation with a -# `ShuffleSplit` scheme. Thus, you can use +# Evaluate the generalization performance of your model by cross-validation with +# a `ShuffleSplit` scheme. Thus, you can use # [`sklearn.model_selection.cross_validate`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html) -# and pass a [`sklearn.model_selection.ShuffleSplit`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html) -# to the `cv` parameter. Only fix the `random_state=0` in the `ShuffleSplit` -# and let the other parameters to the default. +# and pass a +# [`sklearn.model_selection.ShuffleSplit`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html) +# to the `cv` parameter. Only fix the `random_state=0` in the `ShuffleSplit` and +# let the other parameters to the default. # %% # solution @@ -91,11 +91,11 @@ # controlling under/over-fitting in support vector machine with an RBF kernel. # # Evaluate the effect of the parameter `gamma` by using the -# [`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html) function. -# You can leave the default `scoring=None` which is equivalent to -# `scoring="accuracy"` for classification problems. You can vary `gamma` -# between `10e-3` and `10e2` by generating samples on a logarithmic scale -# with the help of `np.logspace(-3, 2, num=30)`. +# [`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html) +# function. You can leave the default `scoring=None` which is equivalent to +# `scoring="accuracy"` for classification problems. You can vary `gamma` between +# `10e-3` and `10e2` by generating samples on a logarithmic scale with the help +# of `np.logspace(-3, 2, num=30)`. # # Since we are manipulating a `Pipeline` the parameter name will be set to # `svc__gamma` instead of only `gamma`. You can retrieve the parameter name @@ -120,10 +120,20 @@ # solution import matplotlib.pyplot as plt -plt.errorbar(gammas, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') -plt.errorbar(gammas, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') +plt.errorbar( + gammas, + train_scores.mean(axis=1), + yerr=train_scores.std(axis=1), + alpha=0.95, + label="Training score", +) +plt.errorbar( + gammas, + test_scores.mean(axis=1), + yerr=test_scores.std(axis=1), + alpha=0.5, + label="Testing score", +) plt.legend() plt.xscale("log") @@ -132,11 +142,10 @@ _ = plt.title("Validation score of support vector machine") # %% [markdown] tags=["solution"] -# Looking at the curve, we can clearly identify the over-fitting regime of -# the SVC classifier when `gamma > 1`. -# The best setting is around `gamma = 1` while for `gamma < 1`, -# it is not very clear if the classifier is under-fitting but the -# testing score is worse than for `gamma = 1`. 
+# Looking at the curve, we can clearly identify the over-fitting regime of the +# SVC classifier when `gamma > 1`. The best setting is around `gamma = 1` while +# for `gamma < 1`, it is not very clear if the classifier is under-fitting but +# the testing score is worse than for `gamma = 1`. # %% [markdown] # Now, you can perform an analysis to check whether adding new samples to the @@ -155,10 +164,20 @@ train_size, train_scores, test_scores = results[:3] # %% tags=["solution"] -plt.errorbar(train_size, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') -plt.errorbar(train_size, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') +plt.errorbar( + train_size, + train_scores.mean(axis=1), + yerr=train_scores.std(axis=1), + alpha=0.95, + label="Training score", +) +plt.errorbar( + train_size, + test_scores.mean(axis=1), + yerr=test_scores.std(axis=1), + alpha=0.5, + label="Testing score", +) plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") plt.xlabel("Number of samples in the training set") @@ -166,7 +185,6 @@ _ = plt.title("Learning curve for support vector machine") # %% [markdown] tags=["solution"] -# We observe that adding new samples in the dataset does not improve the -# testing score. We can only conclude that the standard deviation of -# the training error is decreasing when adding more samples which is not a -# surprise. +# We observe that adding new samples in the dataset does not improve the testing +# score. We can only conclude that the standard deviation of the training error +# is decreasing when adding more samples which is not a surprise. From 18e880b6b588254f04a5d869fe9836943885d67f Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 14 Mar 2022 18:51:39 +0100 Subject: [PATCH 03/15] Update python_scripts/cross_validation_sol_01.py Co-authored-by: Olivier Grisel --- python_scripts/cross_validation_sol_01.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index c969c64b4..13776da00 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -185,6 +185,23 @@ _ = plt.title("Learning curve for support vector machine") # %% [markdown] tags=["solution"] -# We observe that adding new samples in the dataset does not improve the testing -# score. We can only conclude that the standard deviation of the training error -# is decreasing when adding more samples which is not a surprise. +# We observe that adding new samples to the training dataset does +# not seem to improve the training and testing scores. +# +# In particular, 76% accuracy is the score of a model that +# always predicts the majority class `"not donated"`. This can +# mean that our small pipeline is not able to use input features +# to improve upon that simplistic baseline, and increasing the +# training set size does not help either. +# +# It could be the case that the input features are fundamentally +# not very informative and the classification problem is +# fundamentally impossible to solve to a high accuracy. But it +# could also be the case that our choice of using the default +# hyperparameter value of the `SVC` class was a bad idea, or that +# the choice of the `SVC` class is itself sub-optimal. 
+# +# Later in this MOOC we will see how to better tune the +# hyper-parameters of a model and explore how to compare the +# predictive performance of different model classes in a more +# systematic way. From b0dbbfab925ce704b868adb9621918d5e1b938f3 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Tue, 15 Mar 2022 10:14:04 +0100 Subject: [PATCH 04/15] Make interpretation more detailed --- python_scripts/cross_validation_sol_01.py | 34 +++++++++++------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index 13776da00..30b00790c 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -185,23 +185,21 @@ _ = plt.title("Learning curve for support vector machine") # %% [markdown] tags=["solution"] -# We observe that adding new samples to the training dataset does -# not seem to improve the training and testing scores. +# We observe that adding new samples to the training dataset does not seem to +# improve the training and testing scores. In particular, the testing score +# oscillates around 76% accuracy. Indeed, ~76% of the samples belong to the +# class `"not donated"``. Notice then that a classifier that always predicts the +# `"not donated"`` class would achieve an accuracy of 76% without using any +# information from the data itself. This can mean that our small pipeline is not +# able to use the input features to improve upon that simplistic baseline, and +# increasing the training set size does not help either. # -# In particular, 76% accuracy is the score of a model that -# always predicts the majority class `"not donated"`. This can -# mean that our small pipeline is not able to use input features -# to improve upon that simplistic baseline, and increasing the -# training set size does not help either. +# It could be the case that the input features are fundamentally not very +# informative and the classification problem is fundamentally impossible to +# solve to a high accuracy. But it could also be the case that our choice of +# using the default hyperparameter value of the `SVC` class was a bad idea, or +# that the choice of the `SVC` class is itself sub-optimal. # -# It could be the case that the input features are fundamentally -# not very informative and the classification problem is -# fundamentally impossible to solve to a high accuracy. But it -# could also be the case that our choice of using the default -# hyperparameter value of the `SVC` class was a bad idea, or that -# the choice of the `SVC` class is itself sub-optimal. -# -# Later in this MOOC we will see how to better tune the -# hyper-parameters of a model and explore how to compare the -# predictive performance of different model classes in a more -# systematic way. +# Later in this MOOC we will see how to better tune the hyperparameters of a +# model and explore how to compare the predictive performance of different model +# classes in a more systematic way. 
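
The interpretation added in the patches above leans on a ~76% majority-class baseline for the blood transfusion dataset. A quick way to check that figure is to score a `DummyClassifier` with the same `ShuffleSplit` scheme used in the exercise. The sketch below is illustrative and not part of the patched script; the CSV path and the `"Class"` column name are assumptions based on the exercise's narrative.

```python
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import ShuffleSplit, cross_validate

# Assumed loading step: the exercise loads the blood transfusion dataset into
# `data` (features) and `target` ("donated"/"not donated" labels).
blood_transfusion = pd.read_csv("../datasets/blood_transfusion.csv")
data = blood_transfusion.drop(columns="Class")
target = blood_transfusion["Class"]

# Always predicting the majority class ("not donated") gives the baseline
# accuracy the solution refers to (close to 76% on this dataset).
baseline = DummyClassifier(strategy="most_frequent")
cv = ShuffleSplit(random_state=0)
cv_results = cross_validate(baseline, data, target, cv=cv)
print(f"Baseline accuracy: {cv_results['test_score'].mean():.3f}")
```

If this baseline and the cross-validated SVC land at essentially the same accuracy, the pipeline is not extracting usable signal from the features, which is the point the rewritten conclusion makes.
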
From a6bc6f7345a7c208ff38f7f733f2f4e47d1deaed Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 8 Sep 2022 14:59:57 +0200 Subject: [PATCH 05/15] Emphasize n_estimators should not be tuned --- python_scripts/ensemble_hyperparameters.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 7cd32e40e..a743e1527 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -24,12 +24,18 @@ # # ## Random forest # -# The main parameter to tune for random forest is the `n_estimators` parameter. +# The main parameter to select in random forest is the `n_estimators` parameter. # In general, the more trees in the forest, the better the generalization # performance will be. However, it will slow down the fitting and prediction # time. The goal is to balance computing time and generalization performance when # setting the number of estimators when putting such learner in production. # +# ```{caution} +# Here, we tune the `n_estimators` but doing so is likely to be a loss of +# resources. Be aware that using early-stopping as in the previous exercise will +# be better. +# ``` +# # Then, we could also tune a parameter that controls the depth of each tree in # the forest. Two parameters are important for this: `max_depth` and # `max_leaf_nodes`. They differ in the way they control the tree structure. @@ -142,12 +148,6 @@ cv_results[columns].sort_values(by="mean_test_error") # %% [markdown] -# -# ```{caution} -# Here, we tune the `n_estimators` but be aware that using early-stopping as -# in the previous exercise will be better. -# ``` -# # In this search, we see that the `learning_rate` is required to be large # enough, i.e. > 0.1. We also observe that for the best ranked models, having a # smaller `learning_rate`, will require more trees or a larger number of From 121696d556e62c48425b7ce8a6df2267d46057e1 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 8 Sep 2022 15:00:26 +0200 Subject: [PATCH 06/15] Unrelated wording fix --- python_scripts/ensemble_hyperparameters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index a743e1527..9257cb24f 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -19,7 +19,8 @@ # ```{caution} # For the sake of clarity, no cross-validation will be used to estimate the # testing error. We are only showing the effect of the parameters -# on the validation set of what should be the inner cross-validation. +# on the validation set of what should be the inner loop of a nested +# cross-validation. 
# ``` # # ## Random forest From 7a52064664d7c018d2e1ff051c010fcd42327f33 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 14:55:35 +0200 Subject: [PATCH 07/15] Revert removal of caution message --- python_scripts/ensemble_hyperparameters.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 9257cb24f..91ce86b0a 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -149,6 +149,12 @@ cv_results[columns].sort_values(by="mean_test_error") # %% [markdown] +# +# ```{caution} +# Here, we tune the `n_estimators` but be aware that using early-stopping as +# in the previous exercise will be better. +# ``` +# # In this search, we see that the `learning_rate` is required to be large # enough, i.e. > 0.1. We also observe that for the best ranked models, having a # smaller `learning_rate`, will require more trees or a larger number of From 202153e095f816e7f98ee58ac38a390e912a4802 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 16:58:04 +0200 Subject: [PATCH 08/15] Improve general wording --- python_scripts/ensemble_hyperparameters.py | 41 +++++++++++----------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 91ce86b0a..402c50813 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -18,8 +18,8 @@ # # ```{caution} # For the sake of clarity, no cross-validation will be used to estimate the -# testing error. We are only showing the effect of the parameters -# on the validation set of what should be the inner loop of a nested +# variability of the testing error. We are only showing the effect of the +# parameters on the validation set of what should be the inner loop of a nested # cross-validation. # ``` # @@ -28,13 +28,13 @@ # The main parameter to select in random forest is the `n_estimators` parameter. # In general, the more trees in the forest, the better the generalization # performance will be. However, it will slow down the fitting and prediction -# time. The goal is to balance computing time and generalization performance when -# setting the number of estimators when putting such learner in production. +# time. The goal is to balance computing time and generalization performance +# when setting the number of estimators when putting such learner in production. +# Here, we fix `n_estimators=100`, which is already the default value. # # ```{caution} -# Here, we tune the `n_estimators` but doing so is likely to be a loss of -# resources. Be aware that using early-stopping as in the previous exercise will -# be better. +# Tuning the `n_estimators` for random forests leads to overfitting and can +# result in a waste of computer power. # ``` # # Then, we could also tune a parameter that controls the depth of each tree in @@ -80,15 +80,14 @@ cv_results[columns].sort_values(by="mean_test_error") # %% [markdown] -# We can observe in our search that we are required to have a large -# number of leaves and thus deep trees. This parameter seems particularly -# impactful in comparison to the number of trees for this particular dataset: -# with at least 50 trees, the generalization performance will be driven by the -# number of leaves. +# We can observe in our search that we are required to have a large number of +# leaves and thus deep trees. 
This parameter seems particularly impactful in +# comparison to the number of features for this particular dataset. # -# Now we will estimate the generalization performance of the best model by -# refitting it with the full training set and using the test set for scoring on -# unseen data. This is done by default when calling the `.fit` method. +# Once the `RandomizedSearchCV` has found the best set of hyperparameters, it +# uses them to refit the model using the full training set. To estimate the +# generalization performance of the best model it suffices to call `.score` on +# the unseen data. # %% error = -search_cv.score(data_test, target_test) @@ -151,8 +150,8 @@ # %% [markdown] # # ```{caution} -# Here, we tune the `n_estimators` but be aware that using early-stopping as -# in the previous exercise will be better. +# Here, we tune the `n_estimators` but be aware that is better to use +# `early_stopping` as done in the Exercise M6.04. # ``` # # In this search, we see that the `learning_rate` is required to be large @@ -163,8 +162,8 @@ # on the other hyperparameter values. # %% [markdown] -# Now we estimate the generalization performance of the best model -# using the test set. +# Now we estimate the generalization performance of the best model using the +# test set. # %% error = -search_cv.score(data_test, target_test) @@ -173,5 +172,5 @@ # %% [markdown] # The mean test score in the held-out test set is slightly better than the score # of the best model. The reason is that the final model is refitted on the whole -# training set and therefore, on more data than the inner cross-validated models -# of the grid search procedure. +# training set and therefore, on more data than the cross-validated models of +# the grid search procedure. From bdbc3926ab712d58c9222f4c49bff3836f25bc1b Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 16:58:59 +0200 Subject: [PATCH 09/15] Refactor narrative --- python_scripts/ensemble_hyperparameters.py | 41 ++++++++++++++-------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 402c50813..848868a25 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -23,6 +23,18 @@ # cross-validation. # ``` # +# We will start by loading the california housing dataset. + +# %% +from sklearn.datasets import fetch_california_housing +from sklearn.model_selection import train_test_split + +data, target = fetch_california_housing(return_X_y=True, as_frame=True) +target *= 100 # rescale the target in k$ +data_train, data_test, target_train, target_test = train_test_split( + data, target, random_state=0) + +# %% [markdown] # ## Random forest # # The main parameter to select in random forest is the `n_estimators` parameter. @@ -37,25 +49,24 @@ # result in a waste of computer power. # ``` # -# Then, we could also tune a parameter that controls the depth of each tree in -# the forest. Two parameters are important for this: `max_depth` and +# Instead, we can tune the hyperparameter `max_features`, which controls the +# number of features to consider when looking for the best split. If set to +# `None`, then `max_features=n_features`. + +# %% +print(f"In this case, n_features={len(data.columns)}") + +# %% [markdown] +# We can also tune the different parameters that control the depth of each tree +# in the forest. Two parameters are important for this: `max_depth` and # `max_leaf_nodes`. 
They differ in the way they control the tree structure. # Indeed, `max_depth` will enforce to have a more symmetric tree, while # `max_leaf_nodes` does not impose such constraint. # -# Be aware that with random forest, trees are generally deep since we are -# seeking to overfit each tree on each bootstrap sample because this will be -# mitigated by combining them altogether. Assembling underfitted trees (i.e. -# shallow trees) might also lead to an underfitted forest. - -# %% -from sklearn.datasets import fetch_california_housing -from sklearn.model_selection import train_test_split - -data, target = fetch_california_housing(return_X_y=True, as_frame=True) -target *= 100 # rescale the target in k$ -data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0) +# Be aware that with random forest, trees are expected to be deep since we are +# seeking to overfit each tree on each bootstrap sample. Overfitting is +# mitigated when combining the trees altogether, whereas assembling underfitted +# trees (i.e. shallow trees) might also lead to an underfitted forest. # %% import pandas as pd From a18e9fd4bf215e612c85f71aa17c8cc1037ada81 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 17:00:12 +0200 Subject: [PATCH 10/15] Avoid tunning n_estimators and tune max_features instead --- python_scripts/ensemble_hyperparameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 848868a25..e7b51b2de 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -74,7 +74,7 @@ from sklearn.ensemble import RandomForestRegressor param_distributions = { - "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500], + "max_features": [1, 2, 3, 5, None], "max_leaf_nodes": [2, 5, 10, 20, 50, 100], } search_cv = RandomizedSearchCV( From 826f3897a48a6b4123f89b6902b5af4dd7b529b3 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 17:07:07 +0200 Subject: [PATCH 11/15] Simplify paragraph --- python_scripts/ensemble_hyperparameters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index e7b51b2de..0c4b0621e 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -41,8 +41,8 @@ # In general, the more trees in the forest, the better the generalization # performance will be. However, it will slow down the fitting and prediction # time. The goal is to balance computing time and generalization performance -# when setting the number of estimators when putting such learner in production. -# Here, we fix `n_estimators=100`, which is already the default value. +# when setting the number of estimators. Here, we fix `n_estimators=100`, which +# is already the default value. 
# # ```{caution} # Tuning the `n_estimators` for random forests leads to overfitting and can From 274f397bbc145784039f28ad6013fb00cddb4925 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 12 Oct 2022 17:04:13 +0200 Subject: [PATCH 12/15] Apply suggestions from code review Co-authored-by: Olivier Grisel --- python_scripts/ensemble_hyperparameters.py | 26 +++++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 0c4b0621e..1207c6aa0 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -45,13 +45,21 @@ # is already the default value. # # ```{caution} -# Tuning the `n_estimators` for random forests leads to overfitting and can -# result in a waste of computer power. +# Tuning the `n_estimators` for random forests generally result in a waste of +# computer power. We just need to ensure that it is large enough so that doubling +# its value does not lead to a significant improvement of the validation error. # ``` # # Instead, we can tune the hyperparameter `max_features`, which controls the -# number of features to consider when looking for the best split. If set to -# `None`, then `max_features=n_features`. +# size of the random subset of features to consider when looking for the best +# split when growing the trees: smaller values for `max_features` will lead to +# more random trees with hopefully more uncorrelated prediction errors. However +# if `max_features` is too small, predictions can be too random, even after +# averaging with the tree in the ensemble. +# +# If `max_features` is set to `None`, then this is equivalent to setting +# `max_features=n_features` which means that the +# only source of randomness in the random forest is the bagging procedure. # %% print(f"In this case, n_features={len(data.columns)}") @@ -75,7 +83,8 @@ param_distributions = { "max_features": [1, 2, 3, 5, None], - "max_leaf_nodes": [2, 5, 10, 20, 50, 100], + "max_leaf_nodes": [10, 100, 1000, None], + "min_samples_leaf": [1, 2, 5, 10, 20, 50, 100], } search_cv = RandomizedSearchCV( RandomForestRegressor(n_jobs=2), param_distributions=param_distributions, @@ -93,7 +102,12 @@ # %% [markdown] # We can observe in our search that we are required to have a large number of # leaves and thus deep trees. This parameter seems particularly impactful in -# comparison to the number of features for this particular dataset. +# the other tuning parameters but more iterations of random search would be +# necessary to precisely assert the role of each parameters. +# +# Using `n_iter=10` is good enough to quickly find a hyper-parameter combination +# that yields a model that works well enough without wasting too much +# computational resources. # # Once the `RandomizedSearchCV` has found the best set of hyperparameters, it # uses them to refit the model using the full training set. 
To estimate the From 697240d86dffc26bbefd13b694aabe661ea6d7f8 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 12 Oct 2022 17:20:22 +0200 Subject: [PATCH 13/15] tweaks --- python_scripts/ensemble_hyperparameters.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 1207c6aa0..37c13b41c 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -55,11 +55,11 @@ # split when growing the trees: smaller values for `max_features` will lead to # more random trees with hopefully more uncorrelated prediction errors. However # if `max_features` is too small, predictions can be too random, even after -# averaging with the tree in the ensemble. +# averaging with the trees in the ensemble. # # If `max_features` is set to `None`, then this is equivalent to setting -# `max_features=n_features` which means that the -# only source of randomness in the random forest is the bagging procedure. +# `max_features=n_features` which means that the only source of randomness in +# the random forest is the bagging procedure. # %% print(f"In this case, n_features={len(data.columns)}") @@ -105,7 +105,7 @@ # the other tuning parameters but more iterations of random search would be # necessary to precisely assert the role of each parameters. # -# Using `n_iter=10` is good enough to quickly find a hyper-parameter combination +# Using `n_iter=10` is good enough to quickly find a hyperparameter combination # that yields a model that works well enough without wasting too much # computational resources. # From 8ed1d1ea90b7367536a4241858effc3c845716f1 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 12 Oct 2022 17:21:05 +0200 Subject: [PATCH 14/15] Add description of min_samples_leaf hyperparameter --- python_scripts/ensemble_hyperparameters.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 37c13b41c..af00a461d 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -71,6 +71,13 @@ # Indeed, `max_depth` will enforce to have a more symmetric tree, while # `max_leaf_nodes` does not impose such constraint. # +# The hyperparameter `min_samples_leaf` controls the minimum number of samples +# required to be at a leaf node. This means that a split point (at any depth) is +# only done if it leaves at least `min_samples_leaf` training samples in each of +# the left and right branches. A small value for `min_samples_leaf` means that +# some samples can become isolated when a tree is deep, promoting overfitting. A +# large value would prevent deep trees, which can lead to underfitting. +# # Be aware that with random forest, trees are expected to be deep since we are # seeking to overfit each tree on each bootstrap sample. 
Overfitting is # mitigated when combining the trees altogether, whereas assembling underfitted From 43509861636d80597fa8bb52d88b69a164f4966d Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 12 Oct 2022 17:34:39 +0200 Subject: [PATCH 15/15] Improve wording --- python_scripts/ensemble_hyperparameters.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index af00a461d..6c90bc940 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -69,7 +69,8 @@ # in the forest. Two parameters are important for this: `max_depth` and # `max_leaf_nodes`. They differ in the way they control the tree structure. # Indeed, `max_depth` will enforce to have a more symmetric tree, while -# `max_leaf_nodes` does not impose such constraint. +# `max_leaf_nodes` does not impose such constraint. If `max_leaf_nodes=None` +# then the number of leaf nodes is unlimited. # # The hyperparameter `min_samples_leaf` controls the minimum number of samples # required to be at a leaf node. This means that a split point (at any depth) is @@ -108,13 +109,15 @@ # %% [markdown] # We can observe in our search that we are required to have a large number of -# leaves and thus deep trees. This parameter seems particularly impactful in -# the other tuning parameters but more iterations of random search would be -# necessary to precisely assert the role of each parameters. -# -# Using `n_iter=10` is good enough to quickly find a hyperparameter combination -# that yields a model that works well enough without wasting too much -# computational resources. +# `max_leaf_nodes` and thus deep trees. This parameter seems particularly +# impactful with respect to the other tuning parameters, but large values of +# `min_samples_leaf` seem to reduce the performance of the model. +# +# In practice, more iterations of random search would be necessary to precisely +# assert the role of each parameters. Using `n_iter=10` is good enough to +# quickly inspect the hyperparameter combinations that yield models that work +# well enough without spending too much computational resources. Feel free to +# try more interations on your own. # # Once the `RandomizedSearchCV` has found the best set of hyperparameters, it # uses them to refit the model using the full training set. To estimate the
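
The caution messages kept throughout this series recommend early stopping (`early_stopping`, as in Exercise M6.04) over tuning `n_estimators` for gradient boosting, but none of the hunks show what that looks like. Below is a minimal sketch, assuming scikit-learn >= 1.0 and the same California housing split used in the patched script; it is illustrative, not the Exercise M6.04 solution the text refers to.

```python
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split

data, target = fetch_california_housing(return_X_y=True, as_frame=True)
target *= 100  # rescale the target in k$, as in the patched script
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=0)

# Instead of searching over the number of trees, set a generous upper bound
# (`max_iter`) and let the model stop once the held-out validation loss stops
# improving for `n_iter_no_change` consecutive iterations.
hgbr = HistGradientBoostingRegressor(
    max_iter=1_000,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    random_state=0,
)
hgbr.fit(data_train, target_train)

print(f"Iterations actually used: {hgbr.n_iter_}")
print(f"R2 on the test set: {hgbr.score(data_test, target_test):.3f}")
```

With this setup the ensemble size is chosen from the data rather than by the search, which is why the patches argue that putting `n_estimators` in `param_distributions` mostly wastes compute.
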