From 824705c8f90b80c8f61ebfa9d6f0a7dbe730a66e Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 14 Mar 2022 10:37:30 +0100 Subject: [PATCH 01/15] Fix learning curve to show overlapped error bars --- python_scripts/cross_validation_sol_01.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index 99c84784d..a357c768a 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -121,9 +121,9 @@ import matplotlib.pyplot as plt plt.errorbar(gammas, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), label='Training score') + yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') plt.errorbar(gammas, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), label='Testing score') + yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') plt.legend() plt.xscale("log") @@ -156,9 +156,9 @@ # %% tags=["solution"] plt.errorbar(train_size, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), label='Training score') + yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') plt.errorbar(train_size, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), label='Testing score') + yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") plt.xlabel("Number of samples in the training set") From e28e879e4640e19dfe87c305464b96cbbce93c0e Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 14 Mar 2022 10:41:32 +0100 Subject: [PATCH 02/15] Formatting --- python_scripts/cross_validation_sol_01.py | 94 ++++++++++++++--------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index a357c768a..c969c64b4 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -10,12 +10,11 @@ # # The aim of this exercise is to make the following experiments: # -# * train and test a support vector machine classifier through -# cross-validation; +# * train and test a support vector machine classifier through cross-validation; # * study the effect of the parameter gamma of this classifier using a # validation curve; -# * use a learning curve to determine the usefulness of adding new -# samples in the dataset when building a classifier. +# * use a learning curve to determine the usefulness of adding new samples in +# the dataset when building a classifier. # # To make these experiments we will first load the blood transfusion dataset. @@ -34,14 +33,14 @@ # %% [markdown] # We will use a support vector machine classifier (SVM). In its most simple -# form, a SVM classifier is a linear classifier behaving similarly to a -# logistic regression. Indeed, the optimization used to find the optimal -# weights of the linear model are different but we don't need to know these -# details for the exercise. +# form, a SVM classifier is a linear classifier behaving similarly to a logistic +# regression. Indeed, the optimization used to find the optimal weights of the +# linear model are different but we don't need to know these details for the +# exercise. # -# Also, this classifier can become more flexible/expressive by using a -# so-called kernel that makes the model become non-linear. Again, no requirement -# regarding the mathematics is required to accomplish this exercise. 
+# Also, this classifier can become more flexible/expressive by using a so-called +# kernel that makes the model become non-linear. Again, no requirement regarding +# the mathematics is required to accomplish this exercise. # # We will use an RBF kernel where a parameter `gamma` allows to tune the # flexibility of the model. @@ -63,12 +62,13 @@ model = make_pipeline(StandardScaler(), SVC()) # %% [markdown] -# Evaluate the generalization performance of your model by cross-validation with a -# `ShuffleSplit` scheme. Thus, you can use +# Evaluate the generalization performance of your model by cross-validation with +# a `ShuffleSplit` scheme. Thus, you can use # [`sklearn.model_selection.cross_validate`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html) -# and pass a [`sklearn.model_selection.ShuffleSplit`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html) -# to the `cv` parameter. Only fix the `random_state=0` in the `ShuffleSplit` -# and let the other parameters to the default. +# and pass a +# [`sklearn.model_selection.ShuffleSplit`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html) +# to the `cv` parameter. Only fix the `random_state=0` in the `ShuffleSplit` and +# let the other parameters to the default. # %% # solution @@ -91,11 +91,11 @@ # controlling under/over-fitting in support vector machine with an RBF kernel. # # Evaluate the effect of the parameter `gamma` by using the -# [`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html) function. -# You can leave the default `scoring=None` which is equivalent to -# `scoring="accuracy"` for classification problems. You can vary `gamma` -# between `10e-3` and `10e2` by generating samples on a logarithmic scale -# with the help of `np.logspace(-3, 2, num=30)`. +# [`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html) +# function. You can leave the default `scoring=None` which is equivalent to +# `scoring="accuracy"` for classification problems. You can vary `gamma` between +# `10e-3` and `10e2` by generating samples on a logarithmic scale with the help +# of `np.logspace(-3, 2, num=30)`. # # Since we are manipulating a `Pipeline` the parameter name will be set to # `svc__gamma` instead of only `gamma`. You can retrieve the parameter name @@ -120,10 +120,20 @@ # solution import matplotlib.pyplot as plt -plt.errorbar(gammas, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') -plt.errorbar(gammas, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') +plt.errorbar( + gammas, + train_scores.mean(axis=1), + yerr=train_scores.std(axis=1), + alpha=0.95, + label="Training score", +) +plt.errorbar( + gammas, + test_scores.mean(axis=1), + yerr=test_scores.std(axis=1), + alpha=0.5, + label="Testing score", +) plt.legend() plt.xscale("log") @@ -132,11 +142,10 @@ _ = plt.title("Validation score of support vector machine") # %% [markdown] tags=["solution"] -# Looking at the curve, we can clearly identify the over-fitting regime of -# the SVC classifier when `gamma > 1`. -# The best setting is around `gamma = 1` while for `gamma < 1`, -# it is not very clear if the classifier is under-fitting but the -# testing score is worse than for `gamma = 1`. 
+# Looking at the curve, we can clearly identify the over-fitting regime of the +# SVC classifier when `gamma > 1`. The best setting is around `gamma = 1` while +# for `gamma < 1`, it is not very clear if the classifier is under-fitting but +# the testing score is worse than for `gamma = 1`. # %% [markdown] # Now, you can perform an analysis to check whether adding new samples to the @@ -155,10 +164,20 @@ train_size, train_scores, test_scores = results[:3] # %% tags=["solution"] -plt.errorbar(train_size, train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), alpha = 0.95, label='Training score') -plt.errorbar(train_size, test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), alpha = 0.5, label='Testing score') +plt.errorbar( + train_size, + train_scores.mean(axis=1), + yerr=train_scores.std(axis=1), + alpha=0.95, + label="Training score", +) +plt.errorbar( + train_size, + test_scores.mean(axis=1), + yerr=test_scores.std(axis=1), + alpha=0.5, + label="Testing score", +) plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") plt.xlabel("Number of samples in the training set") @@ -166,7 +185,6 @@ _ = plt.title("Learning curve for support vector machine") # %% [markdown] tags=["solution"] -# We observe that adding new samples in the dataset does not improve the -# testing score. We can only conclude that the standard deviation of -# the training error is decreasing when adding more samples which is not a -# surprise. +# We observe that adding new samples in the dataset does not improve the testing +# score. We can only conclude that the standard deviation of the training error +# is decreasing when adding more samples which is not a surprise. From 18e880b6b588254f04a5d869fe9836943885d67f Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 14 Mar 2022 18:51:39 +0100 Subject: [PATCH 03/15] Update python_scripts/cross_validation_sol_01.py Co-authored-by: Olivier Grisel --- python_scripts/cross_validation_sol_01.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index c969c64b4..13776da00 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -185,6 +185,23 @@ _ = plt.title("Learning curve for support vector machine") # %% [markdown] tags=["solution"] -# We observe that adding new samples in the dataset does not improve the testing -# score. We can only conclude that the standard deviation of the training error -# is decreasing when adding more samples which is not a surprise. +# We observe that adding new samples to the training dataset does +# not seem to improve the training and testing scores. +# +# In particular, 76% accuracy is the score of a model that +# always predicts the majority class `"not donated"`. This can +# mean that our small pipeline is not able to use input features +# to improve upon that simplistic baseline, and increasing the +# training set size does not help either. +# +# It could be the case that the input features are fundamentally +# not very informative and the classification problem is +# fundamentally impossible to solve to a high accuracy. But it +# could also be the case that our choice of using the default +# hyperparameter value of the `SVC` class was a bad idea, or that +# the choice of the `SVC` class is itself sub-optimal. 
+# +# Later in this MOOC we will see how to better tune the +# hyper-parameters of a model and explore how to compare the +# predictive performance of different model classes in a more +# systematic way. From b0dbbfab925ce704b868adb9621918d5e1b938f3 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Tue, 15 Mar 2022 10:14:04 +0100 Subject: [PATCH 04/15] Make interpretation more detailed --- python_scripts/cross_validation_sol_01.py | 34 +++++++++++------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index 13776da00..30b00790c 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -185,23 +185,21 @@ _ = plt.title("Learning curve for support vector machine") # %% [markdown] tags=["solution"] -# We observe that adding new samples to the training dataset does -# not seem to improve the training and testing scores. +# We observe that adding new samples to the training dataset does not seem to +# improve the training and testing scores. In particular, the testing score +# oscillates around 76% accuracy. Indeed, ~76% of the samples belong to the +# class `"not donated"``. Notice then that a classifier that always predicts the +# `"not donated"`` class would achieve an accuracy of 76% without using any +# information from the data itself. This can mean that our small pipeline is not +# able to use the input features to improve upon that simplistic baseline, and +# increasing the training set size does not help either. # -# In particular, 76% accuracy is the score of a model that -# always predicts the majority class `"not donated"`. This can -# mean that our small pipeline is not able to use input features -# to improve upon that simplistic baseline, and increasing the -# training set size does not help either. +# It could be the case that the input features are fundamentally not very +# informative and the classification problem is fundamentally impossible to +# solve to a high accuracy. But it could also be the case that our choice of +# using the default hyperparameter value of the `SVC` class was a bad idea, or +# that the choice of the `SVC` class is itself sub-optimal. # -# It could be the case that the input features are fundamentally -# not very informative and the classification problem is -# fundamentally impossible to solve to a high accuracy. But it -# could also be the case that our choice of using the default -# hyperparameter value of the `SVC` class was a bad idea, or that -# the choice of the `SVC` class is itself sub-optimal. -# -# Later in this MOOC we will see how to better tune the -# hyper-parameters of a model and explore how to compare the -# predictive performance of different model classes in a more -# systematic way. +# Later in this MOOC we will see how to better tune the hyperparameters of a +# model and explore how to compare the predictive performance of different model +# classes in a more systematic way. 
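
The interpretation added in the patches above leans on a ~76% majority-class baseline for the blood transfusion dataset. A quick way to check that figure is to score a `DummyClassifier` with the same `ShuffleSplit` scheme used in the exercise. The sketch below is illustrative and not part of the patched script; the CSV path and the `"Class"` column name are assumptions based on the exercise's narrative.

```python
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import ShuffleSplit, cross_validate

# Assumed loading step: the exercise loads the blood transfusion dataset into
# `data` (features) and `target` ("donated"/"not donated" labels).
blood_transfusion = pd.read_csv("../datasets/blood_transfusion.csv")
data = blood_transfusion.drop(columns="Class")
target = blood_transfusion["Class"]

# Always predicting the majority class ("not donated") gives the baseline
# accuracy the solution refers to (close to 76% on this dataset).
baseline = DummyClassifier(strategy="most_frequent")
cv = ShuffleSplit(random_state=0)
cv_results = cross_validate(baseline, data, target, cv=cv)
print(f"Baseline accuracy: {cv_results['test_score'].mean():.3f}")
```

If this baseline and the cross-validated SVC land at essentially the same accuracy, the pipeline is not extracting usable signal from the features, which is the point the rewritten conclusion makes.
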
From a6bc6f7345a7c208ff38f7f733f2f4e47d1deaed Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 8 Sep 2022 14:59:57 +0200 Subject: [PATCH 05/15] Emphasize n_estimators should not be tuned --- python_scripts/ensemble_hyperparameters.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 7cd32e40e..a743e1527 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -24,12 +24,18 @@ # # ## Random forest # -# The main parameter to tune for random forest is the `n_estimators` parameter. +# The main parameter to select in random forest is the `n_estimators` parameter. # In general, the more trees in the forest, the better the generalization # performance will be. However, it will slow down the fitting and prediction # time. The goal is to balance computing time and generalization performance when # setting the number of estimators when putting such learner in production. # +# ```{caution} +# Here, we tune the `n_estimators` but doing so is likely to be a loss of +# resources. Be aware that using early-stopping as in the previous exercise will +# be better. +# ``` +# # Then, we could also tune a parameter that controls the depth of each tree in # the forest. Two parameters are important for this: `max_depth` and # `max_leaf_nodes`. They differ in the way they control the tree structure. @@ -142,12 +148,6 @@ cv_results[columns].sort_values(by="mean_test_error") # %% [markdown] -# -# ```{caution} -# Here, we tune the `n_estimators` but be aware that using early-stopping as -# in the previous exercise will be better. -# ``` -# # In this search, we see that the `learning_rate` is required to be large # enough, i.e. > 0.1. We also observe that for the best ranked models, having a # smaller `learning_rate`, will require more trees or a larger number of From 121696d556e62c48425b7ce8a6df2267d46057e1 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 8 Sep 2022 15:00:26 +0200 Subject: [PATCH 06/15] Unrelated wording fix --- python_scripts/ensemble_hyperparameters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index a743e1527..9257cb24f 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -19,7 +19,8 @@ # ```{caution} # For the sake of clarity, no cross-validation will be used to estimate the # testing error. We are only showing the effect of the parameters -# on the validation set of what should be the inner cross-validation. +# on the validation set of what should be the inner loop of a nested +# cross-validation. 
# ``` # # ## Random forest From 7a52064664d7c018d2e1ff051c010fcd42327f33 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 14:55:35 +0200 Subject: [PATCH 07/15] Revert removal of caution message --- python_scripts/ensemble_hyperparameters.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 9257cb24f..91ce86b0a 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -149,6 +149,12 @@ cv_results[columns].sort_values(by="mean_test_error") # %% [markdown] +# +# ```{caution} +# Here, we tune the `n_estimators` but be aware that using early-stopping as +# in the previous exercise will be better. +# ``` +# # In this search, we see that the `learning_rate` is required to be large # enough, i.e. > 0.1. We also observe that for the best ranked models, having a # smaller `learning_rate`, will require more trees or a larger number of From 202153e095f816e7f98ee58ac38a390e912a4802 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 16:58:04 +0200 Subject: [PATCH 08/15] Improve general wording --- python_scripts/ensemble_hyperparameters.py | 41 +++++++++++----------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 91ce86b0a..402c50813 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -18,8 +18,8 @@ # # ```{caution} # For the sake of clarity, no cross-validation will be used to estimate the -# testing error. We are only showing the effect of the parameters -# on the validation set of what should be the inner loop of a nested +# variability of the testing error. We are only showing the effect of the +# parameters on the validation set of what should be the inner loop of a nested # cross-validation. # ``` # @@ -28,13 +28,13 @@ # The main parameter to select in random forest is the `n_estimators` parameter. # In general, the more trees in the forest, the better the generalization # performance will be. However, it will slow down the fitting and prediction -# time. The goal is to balance computing time and generalization performance when -# setting the number of estimators when putting such learner in production. +# time. The goal is to balance computing time and generalization performance +# when setting the number of estimators when putting such learner in production. +# Here, we fix `n_estimators=100`, which is already the default value. # # ```{caution} -# Here, we tune the `n_estimators` but doing so is likely to be a loss of -# resources. Be aware that using early-stopping as in the previous exercise will -# be better. +# Tuning the `n_estimators` for random forests leads to overfitting and can +# result in a waste of computer power. # ``` # # Then, we could also tune a parameter that controls the depth of each tree in @@ -80,15 +80,14 @@ cv_results[columns].sort_values(by="mean_test_error") # %% [markdown] -# We can observe in our search that we are required to have a large -# number of leaves and thus deep trees. This parameter seems particularly -# impactful in comparison to the number of trees for this particular dataset: -# with at least 50 trees, the generalization performance will be driven by the -# number of leaves. +# We can observe in our search that we are required to have a large number of +# leaves and thus deep trees. 
This parameter seems particularly impactful in +# comparison to the number of features for this particular dataset. # -# Now we will estimate the generalization performance of the best model by -# refitting it with the full training set and using the test set for scoring on -# unseen data. This is done by default when calling the `.fit` method. +# Once the `RandomizedSearchCV` has found the best set of hyperparameters, it +# uses them to refit the model using the full training set. To estimate the +# generalization performance of the best model it suffices to call `.score` on +# the unseen data. # %% error = -search_cv.score(data_test, target_test) @@ -151,8 +150,8 @@ # %% [markdown] # # ```{caution} -# Here, we tune the `n_estimators` but be aware that using early-stopping as -# in the previous exercise will be better. +# Here, we tune the `n_estimators` but be aware that is better to use +# `early_stopping` as done in the Exercise M6.04. # ``` # # In this search, we see that the `learning_rate` is required to be large @@ -163,8 +162,8 @@ # on the other hyperparameter values. # %% [markdown] -# Now we estimate the generalization performance of the best model -# using the test set. +# Now we estimate the generalization performance of the best model using the +# test set. # %% error = -search_cv.score(data_test, target_test) @@ -173,5 +172,5 @@ # %% [markdown] # The mean test score in the held-out test set is slightly better than the score # of the best model. The reason is that the final model is refitted on the whole -# training set and therefore, on more data than the inner cross-validated models -# of the grid search procedure. +# training set and therefore, on more data than the cross-validated models of +# the grid search procedure. From bdbc3926ab712d58c9222f4c49bff3836f25bc1b Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 16:58:59 +0200 Subject: [PATCH 09/15] Refactor narrative --- python_scripts/ensemble_hyperparameters.py | 41 ++++++++++++++-------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 402c50813..848868a25 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -23,6 +23,18 @@ # cross-validation. # ``` # +# We will start by loading the california housing dataset. + +# %% +from sklearn.datasets import fetch_california_housing +from sklearn.model_selection import train_test_split + +data, target = fetch_california_housing(return_X_y=True, as_frame=True) +target *= 100 # rescale the target in k$ +data_train, data_test, target_train, target_test = train_test_split( + data, target, random_state=0) + +# %% [markdown] # ## Random forest # # The main parameter to select in random forest is the `n_estimators` parameter. @@ -37,25 +49,24 @@ # result in a waste of computer power. # ``` # -# Then, we could also tune a parameter that controls the depth of each tree in -# the forest. Two parameters are important for this: `max_depth` and +# Instead, we can tune the hyperparameter `max_features`, which controls the +# number of features to consider when looking for the best split. If set to +# `None`, then `max_features=n_features`. + +# %% +print(f"In this case, n_features={len(data.columns)}") + +# %% [markdown] +# We can also tune the different parameters that control the depth of each tree +# in the forest. Two parameters are important for this: `max_depth` and # `max_leaf_nodes`. 
They differ in the way they control the tree structure. # Indeed, `max_depth` will enforce to have a more symmetric tree, while # `max_leaf_nodes` does not impose such constraint. # -# Be aware that with random forest, trees are generally deep since we are -# seeking to overfit each tree on each bootstrap sample because this will be -# mitigated by combining them altogether. Assembling underfitted trees (i.e. -# shallow trees) might also lead to an underfitted forest. - -# %% -from sklearn.datasets import fetch_california_housing -from sklearn.model_selection import train_test_split - -data, target = fetch_california_housing(return_X_y=True, as_frame=True) -target *= 100 # rescale the target in k$ -data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0) +# Be aware that with random forest, trees are expected to be deep since we are +# seeking to overfit each tree on each bootstrap sample. Overfitting is +# mitigated when combining the trees altogether, whereas assembling underfitted +# trees (i.e. shallow trees) might also lead to an underfitted forest. # %% import pandas as pd From a18e9fd4bf215e612c85f71aa17c8cc1037ada81 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 17:00:12 +0200 Subject: [PATCH 10/15] Avoid tunning n_estimators and tune max_features instead --- python_scripts/ensemble_hyperparameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 848868a25..e7b51b2de 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -74,7 +74,7 @@ from sklearn.ensemble import RandomForestRegressor param_distributions = { - "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500], + "max_features": [1, 2, 3, 5, None], "max_leaf_nodes": [2, 5, 10, 20, 50, 100], } search_cv = RandomizedSearchCV( From 826f3897a48a6b4123f89b6902b5af4dd7b529b3 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 10 Oct 2022 17:07:07 +0200 Subject: [PATCH 11/15] Simplify paragraph --- python_scripts/ensemble_hyperparameters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index e7b51b2de..0c4b0621e 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -41,8 +41,8 @@ # In general, the more trees in the forest, the better the generalization # performance will be. However, it will slow down the fitting and prediction # time. The goal is to balance computing time and generalization performance -# when setting the number of estimators when putting such learner in production. -# Here, we fix `n_estimators=100`, which is already the default value. +# when setting the number of estimators. Here, we fix `n_estimators=100`, which +# is already the default value. 
# # ```{caution} # Tuning the `n_estimators` for random forests leads to overfitting and can From 274f397bbc145784039f28ad6013fb00cddb4925 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 12 Oct 2022 17:04:13 +0200 Subject: [PATCH 12/15] Apply suggestions from code review Co-authored-by: Olivier Grisel --- python_scripts/ensemble_hyperparameters.py | 26 +++++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 0c4b0621e..1207c6aa0 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -45,13 +45,21 @@ # is already the default value. # # ```{caution} -# Tuning the `n_estimators` for random forests leads to overfitting and can -# result in a waste of computer power. +# Tuning the `n_estimators` for random forests generally result in a waste of +# computer power. We just need to ensure that it is large enough so that doubling +# its value does not lead to a significant improvement of the validation error. # ``` # # Instead, we can tune the hyperparameter `max_features`, which controls the -# number of features to consider when looking for the best split. If set to -# `None`, then `max_features=n_features`. +# size of the random subset of features to consider when looking for the best +# split when growing the trees: smaller values for `max_features` will lead to +# more random trees with hopefully more uncorrelated prediction errors. However +# if `max_features` is too small, predictions can be too random, even after +# averaging with the tree in the ensemble. +# +# If `max_features` is set to `None`, then this is equivalent to setting +# `max_features=n_features` which means that the +# only source of randomness in the random forest is the bagging procedure. # %% print(f"In this case, n_features={len(data.columns)}") @@ -75,7 +83,8 @@ param_distributions = { "max_features": [1, 2, 3, 5, None], - "max_leaf_nodes": [2, 5, 10, 20, 50, 100], + "max_leaf_nodes": [10, 100, 1000, None], + "min_samples_leaf": [1, 2, 5, 10, 20, 50, 100], } search_cv = RandomizedSearchCV( RandomForestRegressor(n_jobs=2), param_distributions=param_distributions, @@ -93,7 +102,12 @@ # %% [markdown] # We can observe in our search that we are required to have a large number of # leaves and thus deep trees. This parameter seems particularly impactful in -# comparison to the number of features for this particular dataset. +# the other tuning parameters but more iterations of random search would be +# necessary to precisely assert the role of each parameters. +# +# Using `n_iter=10` is good enough to quickly find a hyper-parameter combination +# that yields a model that works well enough without wasting too much +# computational resources. # # Once the `RandomizedSearchCV` has found the best set of hyperparameters, it # uses them to refit the model using the full training set. 
To estimate the From 697240d86dffc26bbefd13b694aabe661ea6d7f8 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 12 Oct 2022 17:20:22 +0200 Subject: [PATCH 13/15] tweaks --- python_scripts/ensemble_hyperparameters.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 1207c6aa0..37c13b41c 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -55,11 +55,11 @@ # split when growing the trees: smaller values for `max_features` will lead to # more random trees with hopefully more uncorrelated prediction errors. However # if `max_features` is too small, predictions can be too random, even after -# averaging with the tree in the ensemble. +# averaging with the trees in the ensemble. # # If `max_features` is set to `None`, then this is equivalent to setting -# `max_features=n_features` which means that the -# only source of randomness in the random forest is the bagging procedure. +# `max_features=n_features` which means that the only source of randomness in +# the random forest is the bagging procedure. # %% print(f"In this case, n_features={len(data.columns)}") @@ -105,7 +105,7 @@ # the other tuning parameters but more iterations of random search would be # necessary to precisely assert the role of each parameters. # -# Using `n_iter=10` is good enough to quickly find a hyper-parameter combination +# Using `n_iter=10` is good enough to quickly find a hyperparameter combination # that yields a model that works well enough without wasting too much # computational resources. # From 8ed1d1ea90b7367536a4241858effc3c845716f1 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 12 Oct 2022 17:21:05 +0200 Subject: [PATCH 14/15] Add description of min_samples_leaf hyperparameter --- python_scripts/ensemble_hyperparameters.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 37c13b41c..af00a461d 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -71,6 +71,13 @@ # Indeed, `max_depth` will enforce to have a more symmetric tree, while # `max_leaf_nodes` does not impose such constraint. # +# The hyperparameter `min_samples_leaf` controls the minimum number of samples +# required to be at a leaf node. This means that a split point (at any depth) is +# only done if it leaves at least `min_samples_leaf` training samples in each of +# the left and right branches. A small value for `min_samples_leaf` means that +# some samples can become isolated when a tree is deep, promoting overfitting. A +# large value would prevent deep trees, which can lead to underfitting. +# # Be aware that with random forest, trees are expected to be deep since we are # seeking to overfit each tree on each bootstrap sample. 
Overfitting is # mitigated when combining the trees altogether, whereas assembling underfitted From 43509861636d80597fa8bb52d88b69a164f4966d Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 12 Oct 2022 17:34:39 +0200 Subject: [PATCH 15/15] Improve wording --- python_scripts/ensemble_hyperparameters.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index af00a461d..6c90bc940 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -69,7 +69,8 @@ # in the forest. Two parameters are important for this: `max_depth` and # `max_leaf_nodes`. They differ in the way they control the tree structure. # Indeed, `max_depth` will enforce to have a more symmetric tree, while -# `max_leaf_nodes` does not impose such constraint. +# `max_leaf_nodes` does not impose such constraint. If `max_leaf_nodes=None` +# then the number of leaf nodes is unlimited. # # The hyperparameter `min_samples_leaf` controls the minimum number of samples # required to be at a leaf node. This means that a split point (at any depth) is @@ -108,13 +109,15 @@ # %% [markdown] # We can observe in our search that we are required to have a large number of -# leaves and thus deep trees. This parameter seems particularly impactful in -# the other tuning parameters but more iterations of random search would be -# necessary to precisely assert the role of each parameters. -# -# Using `n_iter=10` is good enough to quickly find a hyperparameter combination -# that yields a model that works well enough without wasting too much -# computational resources. +# `max_leaf_nodes` and thus deep trees. This parameter seems particularly +# impactful with respect to the other tuning parameters, but large values of +# `min_samples_leaf` seem to reduce the performance of the model. +# +# In practice, more iterations of random search would be necessary to precisely +# assert the role of each parameters. Using `n_iter=10` is good enough to +# quickly inspect the hyperparameter combinations that yield models that work +# well enough without spending too much computational resources. Feel free to +# try more interations on your own. # # Once the `RandomizedSearchCV` has found the best set of hyperparameters, it # uses them to refit the model using the full training set. To estimate the
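
The caution messages kept throughout this series recommend early stopping (`early_stopping`, as in Exercise M6.04) over tuning `n_estimators` for gradient boosting, but none of the hunks show what that looks like. Below is a minimal sketch, assuming scikit-learn >= 1.0 and the same California housing split used in the patched script; it is illustrative, not the Exercise M6.04 solution the text refers to.

```python
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split

data, target = fetch_california_housing(return_X_y=True, as_frame=True)
target *= 100  # rescale the target in k$, as in the patched script
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=0)

# Instead of searching over the number of trees, set a generous upper bound
# (`max_iter`) and let the model stop once the held-out validation loss stops
# improving for `n_iter_no_change` consecutive iterations.
hgbr = HistGradientBoostingRegressor(
    max_iter=1_000,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    random_state=0,
)
hgbr.fit(data_train, target_train)

print(f"Iterations actually used: {hgbr.n_iter_}")
print(f"R2 on the test set: {hgbr.score(data_test, target_test):.3f}")
```

With this setup the ensemble size is chosen from the data rather than by the search, which is why the patches argue that putting `n_estimators` in `param_distributions` mostly wastes compute.
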