diff --git a/python_scripts/trees.py b/python_scripts/trees.py
index 75d295a2e..984aa28dc 100644
--- a/python_scripts/trees.py
+++ b/python_scripts/trees.py
@@ -1,28 +1,28 @@
 # %% [markdown]
 # # Decision tree in depth
 #
-# In this notebook, we will go into details on the internal algorithm used to
-# build the decision tree. First, we will focus on the decision tree used for
-# classification. Then, we will highlight the fundamental difference between
-# decision tree used in classification and in regression. Finally, we will
-# quickly discuss the importance of the hyperparameters to be aware of when
+# In this notebook, we will discuss in detail the internal algorithm used to
+# build the decision tree. First, we will focus on the classification decision
+# tree. Then, we will highlight the fundamental difference between the
+# decision tree used for classification and regression. Finally, we will
+# quickly discuss the importance of the hyper-parameters to be aware of when
 # using decision trees.
 #
 # ## Presentation of the dataset
 #
-# We use the
+# We will use the
 # [Palmer penguins dataset](https://allisonhorst.github.io/palmerpenguins/).
-# This dataset is composed of penguins records and ultimately, we want to
-# identify from which specie a penguin belongs to.
+# This dataset is composed of penguin records and ultimately, we want to
+# predict the species each penguin belongs to.
 #
-# A penguin is from one of the three following species: Adelie, Gentoo, and
-# Chinstrap. See the illustration below depicting of the three different bird
+# Each penguin is from one of the three following species: Adelie, Gentoo, and
+# Chinstrap. See the illustration below depicting the three different penguin
 # species:
 #
 # ![Image of penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png)
 #
-# This problem is a classification problem since the target is made of
-# categories. We will limit our input data to a subset of the original features
+# This problem is a classification problem since the target is categorical.
+# We will limit our input data to a subset of the original features
 # to simplify our explanations when presenting the decision tree algorithm.
 # Indeed, we will use features based on the penguins' culmen measurements. You
 # can learn more about the penguins' culmen with the illustration below:
@@ -48,7 +48,7 @@ data.info()

 # %% [markdown]
-# We can observe that they are 2 missing records in this dataset and for the
+# We can observe that there are 2 missing records in this dataset and for the
 # sake of simplicity, we will drop the records corresponding to these 2
 # samples.
@@ -57,7 +57,7 @@ data.info()

 # %% [markdown]
-# We will separate the target from the data and we will create a training and a
+# We will separate the target from the data and create a training and a
 # testing set.

 # %%
@@ -69,7 +69,7 @@ )

 # %% [markdown]
-# Before going into details in the decision tree algorithm, we will quickly
+# Before going into detail about the decision tree algorithm, we will quickly
 # inspect our dataset.

 # %%
@@ -81,19 +81,19 @@
 # We can first check the feature distributions by looking at the diagonal plots
 # of the pairplot. We can build the following intuitions:
 #
-# * The Adelie specie is separable from the Gentoo and Chinstrap species using
+# * The Adelie species is separable from the Gentoo and Chinstrap species using
 #   the culmen length;
-# * The Gentoo specie is separable from the Adelie and Chinstrap species using
+# * The Gentoo species is separable from the Adelie and Chinstrap species using
 #   the culmen depth.
 #
-# ## How decision tree are built?
+# ## How are decision trees built?
 #
 # In a previous notebook, we learnt that a linear classifier will define a
 # linear separation to split classes using a linear combination of the input
 # features. In our 2-dimensional space, it means that a linear classifier will
-# defined some oblique lines that best separate our classes. We define a
-# function below that given a set of data point and a classifier will plot the
-# decision boundaries learnt by the classifier.
+# define some oblique lines that best separate our classes. We define a
+# function below that, given a set of data points and a classifier, will plot
+# the decision boundaries learnt by the classifier.

 # %%
 import numpy as np
@@ -135,7 +135,8 @@ def plot_decision_function(X, y, clf, ax=None):

 # %% [markdown]
 # Thus, for a linear classifier, we will obtain the following decision
-# boundaries.
+# boundaries. These boundary lines indicate where the model changes its
+# prediction from one class to another.

 # %%
 from sklearn.linear_model import LogisticRegression
@@ -146,7 +147,7 @@ def plot_decision_function(X, y, clf, ax=None):

 # %% [markdown]
 # We see that the lines are a combination of the input features since they are
 # not perpendicular to a specific axis. In addition, it seems that the linear
-# model would be a good candidate model for such problem, giving a good
+# model would be a good candidate model for such a problem as it gives good
 # accuracy.

 # %%
@@ -156,10 +157,10 @@ def plot_decision_function(X, y, clf, ax=None):
 )

 # %% [markdown]
-# Unlike linear model, decision tree will partition the space considering a
-# single feature at a time. Let's illustrate this behaviour by having
-# a decision tree which makes a single split to partition the feature space.
-# the decision tree to make a single split to partition our feature space.
+# Unlike linear models, decision trees will partition the space by considering
+# a single feature at a time. Let's illustrate this behaviour by having
+# a decision tree that only makes a single split to partition the feature
+# space.

 # %%
 from sklearn.tree import DecisionTreeClassifier
@@ -168,12 +169,13 @@ def plot_decision_function(X, y, clf, ax=None):
 plot_decision_function(X_train, y_train, tree)

 # %% [markdown]
-# The partition found separate the data along the axis "Culmen Length",
+# The partition found by the algorithm separates the data along the axis
+# "Culmen Length",
 # discarding the feature "Culmen Depth". Thus, it highlights that a decision
 # tree does not use a combination of features when making a split.
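+#
+# We can confirm this by inspecting the fitted tree. As a quick sketch
+# (relying on the `tree_` attribute that scikit-learn trees expose once
+# fitted), we can print which feature the root node tests:
+
+# %%
+# index of the single feature tested at the root node; no combination of
+# features is involved
+root_feature_index = tree.tree_.feature[0]
+print(X_train.columns[root_feature_index])
+
+# %% [markdown]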
 #
-# However, such a split is not powerful enough to isolate the three species and
-# the model accuracy is low compared to the linear model.
+# However, such a split is not powerful enough to separate the three species
+# and the model accuracy is low when compared to the linear model.

 # %%
 print(
@@ -182,13 +184,13 @@ def plot_decision_function(X, y, clf, ax=None):
 )

 # %% [markdown]
-# Indeed, it is not a surprise. We earlier saw that a single feature will not
-# help separating the three species. However, from the previous analysis we
-# saw that using both features should be useful to get fairly good results.
-# Considering the mechanism of the decision tree illustrated above, we should
-# repeat the partitioning on each rectangle that was previously created. In
-# this regard, we expect that the partition will be using the feature "Culmen
-# Depth" this time.
+# Indeed, it is not a surprise. We saw earlier that a single feature will not
+# be able to separate all three species. However, from the previous analysis
+# we saw that by using both features we should be able to get fairly good
+# results. Considering the splitting mechanism of the decision tree
+# illustrated above, we should repeat the partitioning on the resulting
+# rectangles created by the first split. In this regard, we expect that the
+# two partitions at the second level of the tree will use the feature
+# "Culmen Depth".

 # %%
 tree.set_params(max_depth=2)
@@ -209,13 +211,13 @@ def plot_decision_function(X, y, clf, ax=None):

 # %% [markdown]
 # At this stage, we have the intuition that a decision tree is built by
 # successively partitioning the feature space, considering one feature at a
 # time.
-# Subsequently, we will present the details regarding the partitioning
+# Subsequently, we will present the details of the partitioning
 # mechanism.
 #
 # ## Partitioning mechanism
 #
-# Let's isolate a single feature. We will present the mechanism allowing to
-# find the optimal partition for these one-dimensional data.
+# Let's isolate a single feature. We will present the mechanism allowing us to
+# find the optimal partitions for this one-dimensional data.

 # %%
 single_feature = X_train["Culmen Length (mm)"]
@@ -235,8 +237,8 @@ def plot_decision_function(X, y, clf, ax=None):
 _ = plt.ylabel('Class probability')

 # %% [markdown]
-# Seeing this graph, we can easily separate the Adelie specie from
-# the other species. Alternatively, we can have a scatter plot of all
+# Seeing this graph, we can easily separate the Adelie species from
+# the other species. This can also be seen on a scatter plot of all the
 # samples.

 # %%
@@ -248,9 +250,9 @@ def plot_decision_function(X, y, clf, ax=None):
 _ = sns.swarmplot(x=single_feature.name, y="", hue=y_train.name, data=df)

 # %% [markdown]
-# Finding a split comes to define a threshold value which will be used to
+# Finding a split requires us to define a threshold value which will be used to
 # separate the different classes. To give an example, we will pick a random
-# threshold value and we will qualify the quality of the split.
+# threshold value and we will quantify the quality of the split.

 # %%
 rng = np.random.RandomState(0)
@@ -265,14 +267,14 @@ def plot_decision_function(X, y, clf, ax=None):
 _ = ax.set_title(f"Random threshold value: {threshold_value} mm")

 # %% [markdown]
-# A random split does not ensure that we pick up a threshold value which
-# best separate the species. Thus, an intuition will be to find a
-# threshold value that best divide the Adelie class from other classes. A
+# A random split does not ensure that we pick a threshold value that
+# best separates the species. Thus, an intuition would be to find a
+# threshold value that best divides the Adelie class from other classes. A
 # threshold around 42 mm would be ideal. Once this split is defined, we could
 # specify that the sample < 42 mm would belong to the class Adelie and the
-# samples > 42 mm would belong to the class the most probable (the most
-# represented in the partition) between the Gentoo and the Chinstrap. In this
-# case, it seems to be the Gentoo specie, which is in-line with what we
+# samples > 42 mm would belong to the most probable class (the one most
+# represented in the partition). In this case, it seems to be the Gentoo
+# species, which is in line with what we
 # observed earlier when fitting a `DecisionTreeClassifier` with a
 # `max_depth=1`.
@@ -295,11 +297,11 @@ def plot_decision_function(X, y, clf, ax=None):
 # ### The split purity criterion
 #
 # To evaluate the effectiveness of a split, we will use a criterion to qualify
-# the class purity on the different partitions.
+# the class purity on the resulting partitions.
 #
 # First, let's define a threshold at 42 mm. Then, we will divide the data into
 # 2 sub-groups: a group for samples < 42 mm and a group for samples >= 42 mm.
-# Then, we will store the class label for these samples.
+# Finally, we will store the class label for these samples.

 # %%
 threshold_value = 42
@@ -309,8 +311,8 @@ def plot_decision_function(X, y, clf, ax=None):

 # %% [markdown]
 # We can check the proportion of samples of each class in both partitions. This
-# proportion is the probability of each class when considering
-# the partition.
+# proportion is the probability of each class when considering the samples
+# in the partition.

 # %%
 labels_below_threshold.value_counts(normalize=True).sort_index()
@@ -320,28 +322,39 @@ def plot_decision_function(X, y, clf, ax=None):

 # %% [markdown]
-# As we visually assess, the partition defined by < 42 mm has mainly Adelie
-# penguin and only 2 samples which we could considered misclassified. However,
-# on the partition >= 42 mm, we cannot differentiate Gentoo and Chinstrap
-# (while they are almost twice more Gentoo).
+# As we visually assessed, the partition (i.e. the part of the data) defined
+# by < 42 mm has mainly Adelie penguins and only 2 samples that are
+# misclassified. However, in the partition >= 42 mm, we cannot differentiate
+# well between Gentoo and Chinstrap (although there are almost twice as many
+# Gentoo).
 #
-# We should come with a statistical measure which combine the class
-# probabilities together that can be used as a criterion to qualify the purity
-# of a partition. We will choose as an example the entropy criterion (also used
+# We should use a statistical measure that uses all the class probabilities
+# as the criterion to qualify the purity of a partition.
+# We will choose as an example the entropy criterion (also used
 # in scikit-learn) which is one of the possible classification criteria.
 #
-# The entropy is defined as: $H(X) = - \sum_{k=1}^{K} p(X_k) \log p(X_k)$
+# The entropy $H$ of the data remaining in one partition is defined as:
 #
-# For a binary problem, the entropy function for one of the class can be
-# depicted as follows:
+# $H = - \sum_{k=1}^{K} p_k \log p_k$
+#
+# where $p_k$ stands for the probability (here the proportion)
+# of finding the class $k$ in this partition.
+#
+# For a binary problem (e.g., only 2 classes of penguins), the entropy
+# function for one of the classes can be depicted as follows:
 #
 # ![title](https://upload.wikimedia.org/wikipedia/commons/2/22/Binary_entropy_plot.svg)
 #
-# Therefore, the entropy will be maximum when the proportion of sample from
-# each class will be equal and minimum when only samples for a single class
-# is present.
+# Therefore, the entropy will be maximum when the proportion of samples from
+# each class is equal (i.e. $p_k$ is 50%) and minimum when only samples from
+# a single class are present (i.e., $p_k$ is 100%, only that class is
+# present, or 0%, only the other class is present). This idea can be extended
+# to >2 classes. For example, for 3 classes, entropy would be highest when
+# the proportion of samples is 33% for all 3 classes and lowest when the
+# proportion of only one of the classes is 100%.
 #
-# Therefore, one searches to minimize the entropy in each partition.
+# Therefore, a good partition *minimizes* the entropy in each part.
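+#
+# We can verify these claims numerically. The following quick check is only a
+# sketch (it assumes `scipy` is available, which is already a dependency of
+# scikit-learn):
+
+# %%
+from scipy.stats import entropy
+
+# a balanced 50%/50% binary mix maximizes the entropy
+print(entropy([0.5, 0.5]))
+# a pure partition containing a single class has zero entropy
+print(entropy([1.0, 0.0]))
+# with 3 classes, the uniform 33%/33%/33% mix is the highest-entropy case
+print(entropy([1 / 3, 1 / 3, 1 / 3]))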

 # %%
 def classification_criterion(labels):
@@ -362,24 +375,30 @@ def classification_criterion(labels):

 # %% [markdown]
 # In our case, we can see that the entropy in the partition < 42 mm is close to
-# 0 meaning that this partition is "pure" and contain a single class while
-# the partition >= 42 mm is much higher due to the fact that 2 of the classes
-# are still mixed.
+# 0, meaning that this partition is "pure" and nearly entirely consists of a
+# single class (Adelie). Conversely, the entropy of the partition >= 42 mm is
+# much higher because the species are still mixed, with large numbers of both
+# Chinstrap and Gentoo penguins.
 #
-# Now, we are able to assess the quality of each partition. However, the
-# ultimate goal is to evaluate the quality of the split and thus combine both
-# measures of entropy to obtain a single statistic.
+# With entropy, we are able to assess the quality of each partition. However,
+# the ultimate goal is to evaluate the quality of the overall split and thus
+# combine the measures of entropy in each partition (leaf) into a single
+# statistic.
 #
 # ### Information gain
 #
-# This statistic is known as the information gain. It combines the entropy of
-# the different partitions to give us a single statistic qualifying the quality
-# of a split. The information gain is defined as the difference of the entropy
-# before making a split and the sum of the entropies of each partition,
-# normalized by the frequencies of class samples on each partition. The goal is
-# to maximize the information gain.
-#
-# We will define a function to compute the information gain given the different
+# Information gain uses the entropy of the two partitions to give us a single
+# statistic quantifying the quality of a split. The information gain is
+# defined as the difference between the entropy before a split and the sum of
+# the entropies of each partition, weighted by the fraction of the samples
+# belonging to each partition:
+#
+# $IG = H_{parent} - \left( \frac{N_1}{N} H_1 + \frac{N_2}{N} H_2 \right)$
+#
+# where $H_1$ and $H_2$ are the entropies of the two partitions, $N_1$ and
+# $N_2$ their numbers of samples, and $N = N_1 + N_2$.
+#
+# The goal is to maximize the information gain (i.e. maximize the decrease in
+# entropy after the split).
+#
+# We will define a function to compute the information gain given the
 # partitions.

 # %%
@@ -418,8 +437,8 @@ def information_gain(labels_below_threshold, labels_above_threshold):

 # %% [markdown]
-# Now, we are able to quantify any split. Thus, we can evaluate every possible
-# split and compute the information gain for each split.
+# Now that we are able to quantify any split, we can evaluate all possible
+# splits and compute the information gain for each split.

 # %%
 splits_information_gain = []
@@ -439,7 +458,8 @@ def information_gain(labels_below_threshold, labels_above_threshold):

 # %% [markdown]
 # As previously mentioned, we would like to find the threshold value maximizing
-# the information gain.
+# the information gain. Below, we draw a line in the plot at the threshold
+# value where the information gain is maximal.

 # %%
 best_threshold_indice = np.argmax(splits_information_gain)
@@ -453,10 +473,10 @@ def information_gain(labels_below_threshold, labels_above_threshold):
 )
 ax.set_title(f"Best threshold: {best_threshold_value} mm")

 # %% [markdown]
-# By making this brute-force search, we find that the threshold maximizing the
+# Using this brute-force search, we find that the threshold maximizing the
 # information gain is 43.3 mm.
 #
-# Let's check if this results is similar than the one found with the
+# Let's check if this result is similar to the one found with the
 # `DecisionTreeClassifier` from scikit-learn.

 # %%
@@ -469,16 +489,19 @@ def information_gain(labels_below_threshold, labels_above_threshold):

 # %% [markdown]
 # The implementation in scikit-learn gives similar results: 43.25 mm. The
-# slight difference are only due to some low-level implementation details.
+# slight difference is just due to some low-level implementation details.
 #
 # As we previously explained, the split mechanism will be repeated several
-# times (until we don't have any classification error on the training set). In
-# the above example, it corresponds to increasing the `max_depth` parameter.
+# times (until there is no classification error on the training set,
+# i.e., all final partitions consist of only one class). In
+# the above example, it corresponds to setting the `max_depth` parameter to
+# `None`. This allows the algorithm to keep making splits until the final
+# partitions are pure.
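+#
+# As a quick check, we can grow such a fully developed tree and look at its
+# training accuracy. This is only a sketch reusing the `X_train` and `y_train`
+# variables of the classification task above; the training score should be
+# perfect, or very close to it, since the leaves end up (almost) pure:
+
+# %%
+fully_grown_tree = DecisionTreeClassifier(max_depth=None)
+fully_grown_tree.fit(X_train, y_train)
+print(fully_grown_tree.score(X_train, y_train))
+
+# %% [markdown]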
 #
-# ## How prediction works?
+# ## How does prediction work?
 #
-# We showed the way a decision tree is constructed. However, we did not explain
-# how and what will be predicted from the decision tree.
+# We showed how a decision tree is constructed. However, we did not explain
+# how predictions are made from the decision tree.
 #
 # First, let's recall the tree structure that we fitted earlier.

 # %%
@@ -488,8 +511,7 @@ def information_gain(labels_below_threshold, labels_above_threshold):

 # %% [markdown]
 # We recall that the threshold found is 43.25 mm. Thus, let's see the class
 # prediction for a sample with a feature value below the threshold and another
-# above the
-# threshold.
+# above the threshold.

 # %%
 print(f"The class predicted for a value below the threshold is: "
@@ -498,27 +520,27 @@ def information_gain(labels_below_threshold, labels_above_threshold):
       f"{tree.predict([[45]])}")

 # %% [markdown]
-# We predict an Adelie penguin for a value below the threshold which is not
-# surprising since this partition was almost pure. In the other case we
-# predicted the Gentoo penguin. Indeed, we predict the class the
-# most probable.
+# We predict an Adelie penguin if the feature value is below the threshold,
+# which is not surprising since this partition was almost pure. If the
+# feature value is above the threshold, we predict the Gentoo penguin, the
+# class that is most probable.
 #
 # ## What about decision tree for regression?
 #
-# We explained the construction of the decision tree in a classification
-# problem. The entropy criterion to split the nodes used the class
-# probabilities. Thus, this criterion is not adapted when the target `y` is
-# continuous. In this case, we will need specific criterion adapted to
+# We explained the construction of the decision tree for a classification
+# problem. The entropy criterion to determine how we split the nodes used the
+# class probabilities. We cannot use this criterion when the target `y` is
+# continuous. In this case, we will need a specific criterion adapted for
 # regression problems.
 #
-# Before going into details with regression criterion, let's observe and
-# build some intuitions on the characteristics of decision tree used
-# in regression.
+# Before going into detail about the regression criterion, let's observe and
+# build some intuitions about the characteristics of decision trees used
+# for regression.
 #
 # ### Decision tree: a non-parametric model
 #
-# We use the same penguins dataset. However, this time we will formulate a
-# regression problem instead of a classification problem. Thus, we will try to
+# We will use the same penguins dataset. However, this time we will formulate
+# a regression problem instead of a classification problem. We will try to
 # infer the body mass of a penguin given its flipper length.

 # %%
@@ -542,10 +564,10 @@ def information_gain(labels_below_threshold, labels_above_threshold):

 # %% [markdown]
 # Here, we deal with a regression problem because our target is a continuous
 # variable ranging from 2.7 kg to 6.3 kg. From the scatter plot above, we can
 # observe that we have a linear relationship between the flipper length
-# and the body mass. Longer is the flipper of a penguin, heavier will be the
+# and the body mass. The longer the flipper of a penguin, the heavier the
 # penguin.
 #
-# For this problem, we would expect the simpler linear model to be able to
+# For this problem, we would expect a simple linear model to be able to
 # model this relationship.

 # %%
@@ -556,7 +578,7 @@ def information_gain(labels_below_threshold, labels_above_threshold):

 # %% [markdown]
 # We will first create a function in charge of plotting the dataset and
 # all possible predictions. This function is equivalent to the earlier
-# function used for classification.
+# function used to plot the decision boundaries for classification.

 # %%
 def plot_regression_model(X, y, model, extrapolate=False, ax=None):
@@ -598,8 +620,8 @@ def plot_regression_model(X, y, model, extrapolate=False, ax=None):

 # %% [markdown]
 # On the plot above, we see that a non-regularized `LinearRegression` is able
-# to fit the data. The specificity of the model is that any new predictions
-# will occur on the line.
+# to fit the data. A feature of this model is that all new predictions
+# will be on the line.

 # %%
 X_test_subset = X_test[:10]
@@ -612,10 +634,10 @@ def plot_regression_model(X, y, model, extrapolate=False, ax=None):
 plt.legend()

 # %% [markdown]
-# On the contrary of linear model, decision trees are non-parametric
-# models, so they do not rely on the way data should be distributed. In this
-# regard, it will affect the prediction scheme. Repeating the
-# above experiment will highlights the differences.
+# Contrary to linear models, decision trees are non-parametric
+# models, so they do not make assumptions about the way data are distributed.
+# This will affect the prediction scheme. Repeating the
+# above experiment will highlight the differences.

 # %%
 from sklearn.tree import DecisionTreeRegressor
@@ -626,16 +648,16 @@ def plot_regression_model(X, y, model, extrapolate=False, ax=None):
 _ = plot_regression_model(X_train, y_train, tree)

 # %% [markdown]
-# We see that the decision tree model does not have a priori and do not end-up
-# with a straight line to regress flipper length and body mass. The prediction
-# of a new sample, which was already present in the training set, will give the
-# same target than this training sample. However, having different body masses
+# We see that the decision tree model does not assume an a priori
+# distribution for the data and we do not end up with a straight line to
+# regress flipper length and body mass.
+# Having different body masses
 # for the same flipper length, the tree will predict the mean of the
 # targets.
 #
 # So in the classification setting, we saw that the predicted value was the
 # most probable value in the node of the tree. In the case of regression, the
-# predicted value corresponds to the mean of the target in the node.
+# predicted value corresponds to the mean of the target in the leaf.
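+#
+# We can verify this with a quick sketch, reusing the fitted regression
+# `tree` and the training data defined above: all the training samples
+# falling into a given leaf share a single prediction, the mean of their
+# targets.
+
+# %%
+# group the training samples by the leaf they end up in
+leaf_ids = tree.apply(X_train)
+# select the samples sharing a leaf with the first training sample
+mask = leaf_ids == leaf_ids[0]
+# the prediction for these samples equals the mean of their targets
+print(y_train[mask].mean())
+print(tree.predict(X_train[mask])[0])
+
+# %% [markdown]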
 #
 # This leads us to question whether or not our decision trees are able to
 # extrapolate to unseen data. We can highlight that this is possible with the
@@ -645,38 +667,41 @@ def plot_regression_model(X, y, model, extrapolate=False, ax=None):
 plot_regression_model(X_train, y_train, linear_model, extrapolate=True)

 # %% [markdown]
-# The linear model will extrapolate using the fitted model for flipper length
-# < 175 mm and > 235 mm. Let's see the difference with the trees.
+# The linear model will extrapolate using the fitted model for flipper lengths
+# < 175 mm and > 235 mm. Let's see the difference with the regression tree.

 # %%
 ax = plot_regression_model(X_train, y_train, linear_model, extrapolate=True)
 _ = plot_regression_model(X_train, y_train, tree, extrapolate=True, ax=ax)

 # %% [markdown]
-# For the tree, we see that it cannot extrapolate below and above the minimum
-# and maximum, respectively, of the flipper length encountered during the
-# training. Indeed, we are predicting the minimum and maximum values of the
-# training set.
+# For the regression tree, we see that it cannot extrapolate outside of the
+# flipper length range present in the training data.
+# For flipper lengths below the minimum, the mass of the penguin in the
+# training data with the shortest flipper length will always be predicted.
+# Similarly, for flipper lengths above the maximum, the mass of the penguin
+# in the training data with the longest flipper will always be predicted.
 #
 # ### The regression criterion
 #
 # In the previous section, we explained the differences between using decision
-# tree in classification or in regression: the predicted value will be the
+# trees for classification and for regression: the predicted value will be the
 # most probable class for the classification case while it will be the mean
 # in the case of the regression. The second difference that we already
 # mentioned is the criterion. The classification criterion cannot be applied
 # in a regression setting and we need to use a specific set of criteria.
 #
 # One of the criteria that can be used in regression is the mean squared
-# error. In this case, we will compute this criterion in each partition
-# as in the case of the entropy and select the split leading to the best
+# error. In this case, we will compute this criterion for each partition,
+# as in the case of the entropy, and select the split leading to the best
 # improvement (i.e. information gain).
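+#
+# As an illustration, here is a minimal sketch of such a criterion (not
+# scikit-learn's actual implementation): the mean squared error of a
+# partition, computed around the mean target value of that partition.
+
+# %%
+def regression_criterion(targets):
+    """Mean squared error of a partition around its mean target."""
+    targets = np.asarray(targets, dtype=float)
+    return np.mean((targets - targets.mean()) ** 2)
+
+
+# a partition with identical targets is "pure": the error is zero
+print(regression_criterion([3000, 3000, 3000]))
+# a partition mixing light and heavy penguins has a large error
+print(regression_criterion([3000, 4000, 5000]))
+
+# %% [markdown]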
 #
 # ## Importance of decision tree hyper-parameters on generalization
 #
 # This last section will illustrate the importance of some key hyper-parameters
-# of the decision tree. We will both illustrate it on classification and
-# regression datasets that we previously used.
+# of the decision tree. We will illustrate it on both the classification and
+# regression problems that we previously used.
 #
 # ### Creation of the classification and regression dataset
 #
@@ -730,9 +755,9 @@ def plot_regression_model(X, y, model, extrapolate=False, ax=None):

 # %% [markdown]
 # ### Effect of the `max_depth` parameter
 #
-# In decision tree, the most important parameter to get a trade-off between
+# In decision trees, the most important parameter to get a trade-off between
 # under-fitting and over-fitting is the `max_depth` parameter. Let's build
-# a shallow tree (for both classification and regression) and a deeper tree.
+# a shallow tree and then a deeper tree (for both classification and
+# regression).

 # %%
@@ -757,11 +782,12 @@ def plot_regression_model(X, y, model, extrapolate=False, ax=None):
 _ = fig.suptitle(f"Deep tree with a max-depth of {max_depth}")

 # %% [markdown]
-# In both classification and regression setting, we can observe that increasing
-# the depth will make the tree model more expressive. However, a tree which is
-# too deep will overfit the training data, creating partitions which will only
-# be correct for "outliers". The `max_depth` is one of the parameter that one
-# would like to optimize via cross-validation and a grid-search.
+# For both classification and regression settings, we can observe that
+# increasing the depth will make the tree model more expressive. However, a
+# tree that is too deep will overfit the training data, creating partitions
+# which are only correct for "outliers". The `max_depth` is one of the
+# hyper-parameters that one should optimize via cross-validation and
+# grid-search.

 # %%
 from sklearn.model_selection import GridSearchCV