# Decision Tree Classifier

Gini is used in decision tree classification as a measure of how well a node or a split can separate the data into different classes. The lower the Gini value, the better the separation. The Gini value is calculated as:

$$
Gini = 1 - \sum_{i=1}^k p_i^2
$$

where $k$ is the number of classes and $p_i$ is the probability of choosing an element of class $i$. In general the Gini value ranges from 0 to $1 - 1/k$; for the two-class problems used below this is 0 to 0.5, where 0 means the node is pure (all elements belong to the same class) and 0.5 means the node is maximally impure (equal probability of choosing either class).

To use Gini in decision tree classification, the algorithm evaluates each candidate split by the weighted average of the Gini values of the two resulting child nodes (weighted by the fraction of samples that falls into each child) and chooses the split that minimizes this value. This means that the algorithm tries to find the best feature and the best threshold to divide the data into two subsets, such that the subsets are purer than the original node. The algorithm repeats this process recursively until all the nodes are pure or some stopping criteria are met.

## Decision Tree Example 1

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Apply a custom Matplotlib style sheet loaded from a URL
# (replace the URL with the path to your own style file if needed).
plt.style.use('https://raw.githubusercontent.com/HatefDastour/ENSF444/main/Files/mystyle.mplstyle')

def print_bold(txt, c=31):
    """
    Print ``txt`` in bold, wrapped in ANSI escape sequences.

    Parameters:
    - txt (str): Text to be printed.
    - c (int): Color code placed in the ANSI escape sequence (default 31).

    Returns:
    None
    """
    style_on = f"\033[1;{c}m"   # bold + requested color
    style_off = "\033[0m"       # reset all attributes
    print(style_on + txt + style_off)

def format_confusion_matrix(cm, title):
    """
    Build a plain-language summary of a binary (2x2) confusion matrix.

    Parameters:
    - cm (numpy.ndarray): 2x2 confusion matrix whose flattened (row-major)
      order is true negatives, false positives, false negatives,
      true positives.
    - title (str): Dataset name used in the header line (e.g. 'Train').

    Returns:
    str: A header line followed by one bullet line per matrix cell.

    Raises:
    ValueError: If ``cm`` does not contain exactly four entries.
    """
    cells = cm.ravel()
    if len(cells) != 4:
        raise ValueError(
            f"Expected a 2x2 confusion matrix, got {len(cells)} entries."
        )
    true_neg, false_pos, false_neg, true_pos = cells

    def _subject(count):
        # Grammatical agreement: only a count of exactly one is singular.
        return "instance was" if count == 1 else "instances were"

    def _pronoun(count):
        return "it was" if count == 1 else "they were"

    lines = [
        f"\033[1m{title} Set Confusion Matrix\033[0m:",
        f"- {true_neg} {_subject(true_neg)} correctly predicted as class 0.",
        f"- {true_pos} {_subject(true_pos)} correctly predicted as class 1.",
        f"- {false_pos} {_subject(false_pos)} incorrectly predicted as class 1 "
        f"when {_pronoun(false_pos)} actually class 0.",
        f"- {false_neg} {_subject(false_neg)} incorrectly predicted as class 0 "
        f"when {_pronoun(false_neg)} actually class 1.",
    ]
    # Every line, including the last, ends with a newline (as before).
    return "\n".join(lines) + "\n"

def plot_cm(model, X_train, X_test, y_train, y_test, class_names, figsize=(7, 4)):
    """
    Print and plot the train and test confusion matrices of ``model``.

    For each of the two datasets the confusion matrix of ``model``'s
    predictions is computed, its text summary is printed, and the matrix is
    drawn on its own axis of a 1x2 figure (train on the left, test on the
    right).

    Parameters:
    - model: Fitted estimator exposing ``predict``.
    - X_train, X_test: Feature matrices of the train / test sets.
    - y_train, y_test: True labels of the train / test sets.
    - class_names: Labels shown on the confusion-matrix axes.
    - figsize (tuple): Size of the figure.

    Returns:
    None
    """
    fig, axes = plt.subplots(1, 2, figsize=figsize)

    # One entry per panel: features, true labels, dataset name, heat-map colormap
    panels = (
        (X_train, y_train, 'Train', 'Greens'),
        (X_test, y_test, 'Test', 'Blues'),
    )

    for axis, (features, labels, dataset_name, cmap_name) in zip(axes, panels):
        # Confusion matrix of the model's predictions on this dataset
        cm = confusion_matrix(labels, model.predict(features))

        print(format_confusion_matrix(cm, dataset_name))

        # Draw the matrix on this panel's axis
        matrix_display = ConfusionMatrixDisplay(cm, display_labels=class_names)
        matrix_display.plot(ax=axis,
                            im_kw=dict(cmap=cmap_name),
                            text_kw={"size": 16}, colorbar=False)
        axis.set_title(f'{dataset_name} Data')
        axis.grid(False)

    # Title for the whole figure, then tidy up the spacing
    fig.suptitle('Confusion Matrices', fontsize=16, weight='bold')
    plt.tight_layout()
    
# Generate synthetic data using make_blobs
n_samples = 200
n_features = 2
centers = 2
cluster_std = 1.0

# Column names follow the number of generated features
feature_names = [f'Feature_{i + 1}' for i in range(n_features)]

X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                  centers=centers, random_state=0, cluster_std=cluster_std)
df = pd.DataFrame(data=X, columns=feature_names)
df['y'] = y
display(df)

# Split the data into training and testing sets (stratified on the labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0, stratify=y)

# Fit a depth-1 tree (a single split) and plot its decision boundaries
fig, axes = plt.subplots(1, 2, figsize=(9.5, 4.5))
dtc = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2, random_state=0)
dtc.fit(X_train, y_train)

# One color per class, also used as the colormap of the decision regions
colors = ["#f44336", "#2986cc"]
_cmap = ListedColormap(colors)

for ax, X_set, y_set in zip(axes, [X_train, X_test], [y_train, y_test]):
    DecisionBoundaryDisplay.from_estimator(dtc, X_set, cmap=_cmap,
                                           ax=ax, response_method="predict",
                                           plot_method="pcolormesh",
                                           xlabel=f'${feature_names[0]}$',
                                           ylabel=f'${feature_names[1]}$',
                                           shading="auto",
                                           grid_resolution=300,
                                           alpha=0.3)
    # Scatter plot of the data points, one call per class so each gets a legend entry
    for num in np.unique(y_set):
        in_class = y_set == num
        ax.scatter(X_set[in_class, 0], X_set[in_class, 1], c=colors[num],
                   s=40, edgecolors='k', marker='o', label=str(num), zorder=2)
    ax.legend(fontsize=12)
    ax.grid(False)

axes[0].set_title('Train Set', fontsize=14, weight='bold')
axes[1].set_title('Test Set', fontsize=14, weight='bold')

# Compute the accuracy on both sets (arguments in y_true, y_pred order)
accuracy_train = metrics.accuracy_score(y_train, dtc.predict(X_train))
accuracy_test = metrics.accuracy_score(y_test, dtc.predict(X_test))

# Print the accuracy scores
txt = f'Accuracy Score (Train) = {accuracy_train:.3f}, Accuracy Score (Test) = {accuracy_test:.3f}'
print_bold('Accuracy')
print(f'\t{txt}')

# Adjust layout and display the plots
plt.tight_layout()

In [None]:
# Create a subplot for the tree plot
fig, ax = plt.subplots(1, 1, figsize=(9.5, 3))

# Plot the decision tree
_ = tree.plot_tree(dtc, ax=ax,
                   filled=True,
                   node_ids=True,
                   feature_names=feature_names,
                   proportion=True,
                   fontsize=12)

# Ensure tight layout
plt.tight_layout()

print_bold('Tree Plot')
print(tree.export_text(dtc, feature_names=feature_names, decimals=3))

# Impurity-based feature importances of the fitted tree: the normalized total
# reduction of the split criterion (Gini here) attributed to each feature.
gini_importances = dtc.feature_importances_

# Recompute the Gini impurity of every node from its per-class values.
# Dividing by the node total turns the stored values into class probabilities.
for node, node_value in enumerate(dtc.tree_.value):
    class_probs = node_value / np.sum(node_value)
    node_gini = 1 - np.sum(np.square(class_probs))
    print(f'Gini for Node #{node} = {node_gini:.3f}')

print('\n')
plot_cm(dtc, X_train, X_test, y_train, y_test, class_names=['0', '1'])
# dtc is deliberately not deleted here: it is queried again further below.

To calculate these numbers by hand, you need to understand how the Gini impurity is computed for each node in the decision tree. The Gini impurity is a measure of how likely a randomly chosen element from a set would be incorrectly labeled if it was randomly labeled according to the distribution of labels in the set. It is calculated as:

$$
Gini = 1 - \sum_{i=1}^k p_i^2
$$

where $k$ is the number of classes and $p_i$ is the probability of choosing an element of class $i$. In general the Gini impurity ranges from 0 to $1 - 1/k$; for the two classes in this example this is 0 to 0.5, where 0 means the set is perfectly pure (all elements belong to the same class) and 0.5 means the set is completely impure (equal probability of choosing either class).

The numbers in the brackets represent the number of samples of each class at each node. For example, [[80, 80]] means that there are 80 samples of class 0 and 80 samples of class 1 at node #0. To calculate the Gini impurity for node #0, you need to find the probabilities of choosing each class and plug them into the formula. The probabilities are:

$$
p_0 = \frac{80}{80 + 80} = 0.5
$$

$$
p_1 = \frac{80}{80 + 80} = 0.5
$$

Then, the Gini impurity is:

$$
Gini_0 = 1 - (0.5^2 + 0.5^2) = 0.5
$$

Similarly, for node #1, the probabilities are:

$$
p_0 = \frac{2}{2 + 78} = 0.025
$$

$$
p_1 = \frac{78}{2 + 78} = 0.975
$$

And the Gini impurity is:

$$
Gini_1 = 1 - (0.025^2 + 0.975^2) = 0.04875
$$

For node #2, the probabilities are:

$$
p_0 = \frac{78}{78 + 2} = 0.975
$$

$$
p_1 = \frac{2}{78 + 2} = 0.025
$$

And the Gini impurity is:

$$
Gini_2 = 1 - (0.975^2 + 0.025^2) = 0.04875
$$

What would be the output of the following (using the tree only)?

```dtc.predict(np.array([[0, 1]]))```

In [None]:
# Predict the class of the single point (Feature_1 = 0, Feature_2 = 1) with the fitted tree
dtc.predict(np.array([[0, 1]]))

## Decision Tree Example 2


- **DecisionTreeClassifier** is a machine learning model that can be used for classification tasks, such as predicting the class label of a data point based on its features.

- **max_depth** is a parameter that controls the maximum depth of the tree, which is the longest path from the root node to a leaf node. A deeper tree can capture more complex patterns in the data, but it may also overfit and perform poorly on new data. Setting max_depth to None means that the tree will grow until all the nodes are pure or some other stopping criteria are met.

- **max_leaf_nodes** is a parameter that controls the maximum number of leaf nodes in the tree, which are the nodes that do not have any children. A smaller number of leaf nodes can reduce the complexity of the model and prevent overfitting, but it may also underfit and miss some important patterns in the data. Setting max_leaf_nodes to 4 means that the tree will have at most 4 leaf nodes, regardless of the depth.

- **random_state** is a parameter that controls the randomness of the model, such as the choice of the feature and the threshold to split each node. Setting random_state to 0 means that the model will use a fixed seed for the random number generator, which ensures that the results are reproducible.

In [None]:
# Fit a tree limited to 4 leaves (no depth limit) and plot its decision boundaries
fig, axes = plt.subplots(1, 2, figsize=(9.5, 4.5))
dtc = DecisionTreeClassifier(max_depth=None, max_leaf_nodes=4, random_state=0)
dtc.fit(X_train, y_train)

# One color per class, also used as the colormap of the decision regions
colors = ["#f44336", "#2986cc"]
_cmap = ListedColormap(colors)

for ax, X_set, y_set in zip(axes, [X_train, X_test], [y_train, y_test]):
    DecisionBoundaryDisplay.from_estimator(dtc, X_set, cmap=_cmap,
                                           ax=ax, response_method="predict",
                                           plot_method="pcolormesh",
                                           xlabel=f'${feature_names[0]}$',
                                           ylabel=f'${feature_names[1]}$',
                                           shading="auto",
                                           grid_resolution=300,
                                           alpha=0.3)
    # Scatter plot of the data points, one call per class so each gets a legend entry
    for num in np.unique(y_set):
        in_class = y_set == num
        ax.scatter(X_set[in_class, 0], X_set[in_class, 1], c=colors[num],
                   s=40, edgecolors='k', marker='o', label=str(num), zorder=2)
    ax.legend(fontsize=12)
    ax.grid(False)

axes[0].set_title('Train Set', fontsize=14, weight='bold')
axes[1].set_title('Test Set', fontsize=14, weight='bold')

# Compute the accuracy on both sets (arguments in y_true, y_pred order)
accuracy_train = metrics.accuracy_score(y_train, dtc.predict(X_train))
accuracy_test = metrics.accuracy_score(y_test, dtc.predict(X_test))

# Print the accuracy scores
txt = f'Accuracy Score (Train) = {accuracy_train:.3f}, Accuracy Score (Test) = {accuracy_test:.3f}'
print_bold('Accuracy')
print(f'\t{txt}')

# Adjust layout and display the plots
plt.tight_layout()

In [None]:
# Create a subplot for the tree plot
fig, ax = plt.subplots(1, 1, figsize=(9.5, 5))

# Plot the decision tree
_ = tree.plot_tree(dtc, ax=ax,
                   filled=True,
                   node_ids=True,
                   feature_names=feature_names,
                   proportion=True,
                   fontsize=12)

# Ensure tight layout
plt.tight_layout()

print_bold('Tree Plot')
print(tree.export_text(dtc, feature_names=feature_names, decimals=3))

# Impurity-based feature importances of the fitted tree: the normalized total
# reduction of the split criterion (Gini here) attributed to each feature.
gini_importances = dtc.feature_importances_

# Recompute the Gini impurity of every node from its per-class values.
# Dividing by the node total turns the stored values into class probabilities.
for node, node_value in enumerate(dtc.tree_.value):
    class_probs = node_value / np.sum(node_value)
    node_gini = 1 - np.sum(np.square(class_probs))
    print(f'Gini for Node #{node} = {node_gini:.3f}')

print('\n')
plot_cm(dtc, X_train, X_test, y_train, y_test, class_names=['0', '1'])
# This classifier is no longer needed; the next example fits a new one.
del dtc

## Decision Tree Example 3

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def print_bold(txt, c=31):
    """
    Print ``txt`` in bold, wrapped in ANSI escape sequences.

    Parameters:
    - txt (str): Text to be printed.
    - c (int): Color code placed in the ANSI escape sequence (default 31).

    Returns:
    None
    """
    style_on = f"\033[1;{c}m"   # bold + requested color
    style_off = "\033[0m"       # reset all attributes
    print(style_on + txt + style_off)

def format_confusion_matrix(cm, title):
    """
    Build a plain-language summary of a binary (2x2) confusion matrix.

    Parameters:
    - cm (numpy.ndarray): 2x2 confusion matrix whose flattened (row-major)
      order is true negatives, false positives, false negatives,
      true positives.
    - title (str): Dataset name used in the header line (e.g. 'Train').

    Returns:
    str: A header line followed by one bullet line per matrix cell.

    Raises:
    ValueError: If ``cm`` does not contain exactly four entries.
    """
    cells = cm.ravel()
    if len(cells) != 4:
        raise ValueError(
            f"Expected a 2x2 confusion matrix, got {len(cells)} entries."
        )
    true_neg, false_pos, false_neg, true_pos = cells

    def _subject(count):
        # Grammatical agreement: only a count of exactly one is singular.
        return "instance was" if count == 1 else "instances were"

    def _pronoun(count):
        return "it was" if count == 1 else "they were"

    lines = [
        f"\033[1m{title} Set Confusion Matrix\033[0m:",
        f"- {true_neg} {_subject(true_neg)} correctly predicted as class 0.",
        f"- {true_pos} {_subject(true_pos)} correctly predicted as class 1.",
        f"- {false_pos} {_subject(false_pos)} incorrectly predicted as class 1 "
        f"when {_pronoun(false_pos)} actually class 0.",
        f"- {false_neg} {_subject(false_neg)} incorrectly predicted as class 0 "
        f"when {_pronoun(false_neg)} actually class 1.",
    ]
    # Every line, including the last, ends with a newline (as before).
    return "\n".join(lines) + "\n"

def plot_cm(model, X_train, X_test, y_train, y_test, class_names, figsize=(7, 4)):
    """
    Print and plot the train and test confusion matrices of ``model``.

    For each of the two datasets the confusion matrix of ``model``'s
    predictions is computed, its text summary is printed, and the matrix is
    drawn on its own axis of a 1x2 figure (train on the left, test on the
    right).

    Parameters:
    - model: Fitted estimator exposing ``predict``.
    - X_train, X_test: Feature matrices of the train / test sets.
    - y_train, y_test: True labels of the train / test sets.
    - class_names: Labels shown on the confusion-matrix axes.
    - figsize (tuple): Size of the figure.

    Returns:
    None
    """
    fig, axes = plt.subplots(1, 2, figsize=figsize)

    # One entry per panel: features, true labels, dataset name, heat-map colormap
    panels = (
        (X_train, y_train, 'Train', 'Greens'),
        (X_test, y_test, 'Test', 'Blues'),
    )

    for axis, (features, labels, dataset_name, cmap_name) in zip(axes, panels):
        # Confusion matrix of the model's predictions on this dataset
        cm = confusion_matrix(labels, model.predict(features))

        print(format_confusion_matrix(cm, dataset_name))

        # Draw the matrix on this panel's axis
        matrix_display = ConfusionMatrixDisplay(cm, display_labels=class_names)
        matrix_display.plot(ax=axis,
                            im_kw=dict(cmap=cmap_name),
                            text_kw={"size": 16}, colorbar=False)
        axis.set_title(f'{dataset_name} Data')
        axis.grid(False)

    # Title for the whole figure, then tidy up the spacing
    fig.suptitle('Confusion Matrices', fontsize=16, weight='bold')
    plt.tight_layout()
    
# Generate synthetic data using make_blobs (more samples and wider clusters,
# so the two classes overlap)
n_samples = 500
n_features = 2
centers = 2
cluster_std = 3.0

# Column names follow the number of generated features
feature_names = [f'Feature_{i + 1}' for i in range(n_features)]

X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                  centers=centers, random_state=0, cluster_std=cluster_std)
df = pd.DataFrame(data=X, columns=feature_names)
df['y'] = y
display(df)

# Split the data into training and testing sets (stratified on the labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0, stratify=y)

# Fit an unconstrained tree (no depth or leaf limit) and plot its decision boundaries
fig, axes = plt.subplots(1, 2, figsize=(9.5, 4.5))
dtc = DecisionTreeClassifier(max_depth=None, max_leaf_nodes=None, random_state=0)
dtc.fit(X_train, y_train)

# One color per class, also used as the colormap of the decision regions
colors = ["#f44336", "#2986cc"]
_cmap = ListedColormap(colors)

for ax, X_set, y_set in zip(axes, [X_train, X_test], [y_train, y_test]):
    DecisionBoundaryDisplay.from_estimator(dtc, X_set, cmap=_cmap,
                                           ax=ax, response_method="predict",
                                           plot_method="pcolormesh",
                                           xlabel=f'${feature_names[0]}$',
                                           ylabel=f'${feature_names[1]}$',
                                           shading="auto",
                                           grid_resolution=300,
                                           alpha=0.3)
    # Scatter plot of the data points, one call per class so each gets a legend entry
    for num in np.unique(y_set):
        in_class = y_set == num
        ax.scatter(X_set[in_class, 0], X_set[in_class, 1], c=colors[num],
                   s=40, edgecolors='k', marker='o', label=str(num), zorder=2)
    ax.legend(fontsize=12)
    ax.grid(False)

axes[0].set_title('Train Set', fontsize=14, weight='bold')
axes[1].set_title('Test Set', fontsize=14, weight='bold')

# Compute the accuracy on both sets (arguments in y_true, y_pred order)
accuracy_train = metrics.accuracy_score(y_train, dtc.predict(X_train))
accuracy_test = metrics.accuracy_score(y_test, dtc.predict(X_test))

# Print the accuracy scores
txt = f'Accuracy Score (Train) = {accuracy_train:.3f}, Accuracy Score (Test) = {accuracy_test:.3f}'
print_bold('Accuracy')
print(f'\t{txt}')

# Adjust layout and display the plots
plt.tight_layout()

# Regression Trees

## Regression Tree Example 1

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.model_selection import train_test_split

# Generate synthetic data (the seed is fixed so the random draws are reproducible)
np.random.seed(42)
n = 200
X = 2 * np.random.rand(n, 2)  # n samples with 2 input features
y = 3 * X[:, 0] + 5 * X[:, 1] + np.random.randn(n)  # Linear combination of both features plus noise

feature_names = [f'Feature_{i + 1}' for i in range(2)]
df = pd.DataFrame(data=X, columns=feature_names)
df['y'] = y
display(df)

# Split the data into training and testing sets
# NOTE(review): the regressor below is fit on the full X, y, so this split is
# not used by the fit in this cell — confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Decision Tree Regressor limited to two leaf nodes (a single split)
dtr = DecisionTreeRegressor(criterion='squared_error',
                                 splitter='best', max_leaf_nodes= 2)

# Fit the Decision Tree Regressor to the full dataset
_ = dtr.fit(X, y)

# Create a figure and axis for plotting the decision tree
fig, ax = plt.subplots(1, 1, figsize=(7, 2.75))

# Visualize the decision tree
_ = tree.plot_tree(dtr, ax=ax,
                   impurity=True,         # Show impurity in nodes
                   node_ids=True,         # Show node IDs
                   filled=True,           # Fill nodes with colors
                   feature_names=feature_names,  # Names of the features
                   proportion=True)       # Show samples as proportions rather than counts

# Adjust the layout for better visualization
plt.tight_layout()

# Print a text rendering of the fitted tree
print(tree.export_text(dtr, feature_names = feature_names))

In [None]:
# Get the low-level tree structure of the fitted regressor
tree_structure = dtr.tree_

# Per-node arrays (indexed by node id): the feature each node splits on
# and the threshold used for that split
feature_indices = tree_structure.feature
thresholds = tree_structure.threshold

To get squared_error and value for node#1, we have,

In [None]:
# =============================================================================
# Node 1
# =============================================================================

# Threshold of the root split (node #0)
Feat2_threshold = thresholds[0]

# Node #1: Predict and Evaluate Subset
print('Node #1: Predict and Evaluate Subset')
# Import metrics from scikit-learn
from sklearn import metrics

# Select the samples with Feature_2 <= threshold.
# NOTE(review): this assumes the root split is on Feature_2 (column 1);
# verify with feature_indices[0].
node1_mask = X[:, 1] <= Feat2_threshold

# Predict the target variable for the samples in this node
y_hat = dtr.predict(X[node1_mask])

# Mean squared error of the node's prediction (arguments in y_true, y_pred order)
mse = metrics.mean_squared_error(y[node1_mask], y_hat)

# Print the squared error (MSE) with three decimal places
print(f'squared_error = {mse:.3f}')

# Calculate the percentage of samples in this node compared to the entire dataset
sample_percentage = (len(y_hat) / len(y) * 100)

# Print the percentage of samples in this node
print(f'samples = {sample_percentage:.1f}%')

# Calculate and print the mean value of the predicted target variable in this node
mean_value_node1 = y_hat.mean()
print(f'value = {mean_value_node1:.3f}')

In [None]:
# =============================================================================
# Node 2
# =============================================================================

# Threshold of the root split (node #0)
Feat2_threshold = thresholds[0]

# Node #2: Predict and Evaluate Subset
print('Node #2: Predict and Evaluate Subset')
# Import metrics from scikit-learn
from sklearn import metrics

# Select the samples with Feature_2 > threshold.
# NOTE(review): this assumes the root split is on Feature_2 (column 1);
# verify with feature_indices[0].
node2_mask = X[:, 1] > Feat2_threshold

# Predict the target variable for the samples in this node
y_hat = dtr.predict(X[node2_mask])

# Mean squared error of the node's prediction (arguments in y_true, y_pred order)
mse = metrics.mean_squared_error(y[node2_mask], y_hat)

# Print the squared error (MSE) with three decimal places
print(f'squared_error = {mse:.3f}')

# Calculate the percentage of samples in this node compared to the entire dataset
sample_percentage = (len(y_hat) / len(y) * 100)

# Print the percentage of samples in this node
print(f'samples = {sample_percentage:.1f}%')

# Calculate and print the mean value of the predicted target variable in this node
mean_value_node2 = y_hat.mean()
print(f'value = {mean_value_node2:.3f}')

In [None]:
# Create a figure and axis for the plot
fig, ax = plt.subplots(1, 1, figsize=(8, 5))

# Axis limits for the plot (also used as the extent of the lines and shaded regions)
xlim = [-1, 3]
ylim = [0, 20]

# Create a scatter plot of the target against the second feature
_ = ax.scatter(X[:, 1], y, marker='o',
               facecolor='DodgerBlue', edgecolor='Navy', alpha=0.3)

# Set labels and limits for the axes
_ = ax.set(xlabel='Feature 2', ylabel='y', xlim=xlim, ylim=ylim)

# Add a vertical dashed line at the split threshold
_ = ax.vlines(Feat2_threshold, ylim[0], ylim[1],
              linestyles='dashed', linewidth=2, colors='Black')

# Add horizontal dashed lines at the mean predicted value of each node,
# each drawn only over its own side of the threshold
_ = ax.hlines(mean_value_node1, xlim[0], Feat2_threshold,
              linestyles='dashed', linewidth=4, colors='Red',
              label = f'y = {mean_value_node1:.3f}')
_ = ax.hlines(mean_value_node2, Feat2_threshold, xlim[-1],
              linestyles='dashed', linewidth=4, colors='Green',
              label = f'y = {mean_value_node2:.3f}')

# Annotations for the regions (label positions are hard-coded)
_ = ax.annotate(r'$\mathbf{R_1}$', xy=(0, 8), fontsize=30)
_ = ax.annotate(r'$\mathbf{R_2}$', xy=(2.2, 12), fontsize=30)

# Shade the two regions on either side of the threshold with different colors
_ = ax.fill_between([xlim[0], Feat2_threshold], ylim[0], ylim[1],
                    color='LimeGreen', alpha=0.1)
_ = ax.fill_between([Feat2_threshold, xlim[-1]], ylim[0], ylim[1],
                    color='purple', alpha=0.1)

# Add a grid and a legend to the plot
_ = ax.grid(True)
_ = ax.legend(fontsize = 12)
# Ensure a tight layout for the plot
plt.tight_layout()

## Regression Tree Example 2

These results show the performance of a Decision Tree Regressor (DTR) model with different values of the max_depth parameter on a training and a test dataset. The performance metric used is the Mean Squared Error (MSE), which is the average of the squared differences between the predicted and the actual values. A lower MSE indicates a better fit of the model to the data.

In [None]:
# This is a modified version of an example by scikit-learn:
# https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html

# Import the necessary modules and libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Create a random dataset: a cosine curve with noise added to every 5th sample
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(160, 1), axis=0)
y = np.cos(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(32))

df = pd.DataFrame(data=X, columns=['x'])
df['y'] = y
display(df)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Dense grid of x values on which each fitted tree is evaluated for plotting
X_gen = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]

# Fit one decision tree regressor per max depth, each drawn in its own color
max_depths = [1, 2, 3, 4]
colors = ["DodgerBlue", "Violet", "OrangeRed", "Green"]

# Plot the results, one panel per max depth
fig, axes = plt.subplots(2, 2, figsize=(9.5, 8))
axes = axes.ravel()

for ax, depth, color in zip(axes, max_depths, colors):
    ax.scatter(X_train, y_train, s=30, edgecolor="black", c="darkorange", label="data")

    dtrr = DecisionTreeRegressor(max_depth=depth).fit(X_train, y_train)
    y_pred = dtrr.predict(X_gen)
    ax.plot(X_gen, y_pred, color=color, linewidth=2)

    _ = ax.set_title(f"max_depth = {depth}", weight='bold')
    _ = ax.set(xlabel="x", ylabel="y")

    # Mean squared error on both sets (arguments in y_true, y_pred order)
    mse_train = metrics.mean_squared_error(y_train, dtrr.predict(X_train))
    mse_test = metrics.mean_squared_error(y_test, dtrr.predict(X_test))

    # Print the MSE values
    txt = f'MSE (Training) = {mse_train:.3f}, MSE (Test) = {mse_test:.3f}'
    print_bold(f'DTR (max_depth = {depth})')
    print(f'\t{txt}')
fig.suptitle('Training Data', weight='bold')
plt.tight_layout()