This example demonstrates how to use the scikit-learn library to:
1. Train a decision tree using the greedy algorithm
2. Prune the tree by specifying alpha for the cost-complexity regularization
3. Find the best alpha with cross-validation and visualization

Part I: Load the Iris data and train a single tree with a specified alpha for tree pruning

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Part I: fit one cost-complexity-pruned decision tree on Iris and report
# its accuracy on a held-out test set.

# Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target  # X: (150, 4), y: (150,)

# Split data into training and testing sets.
# NOTE(review): the 80/20 split is a choice made here; adjust test_size as needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a decision tree classifier with cost-complexity pruning (ccp_alpha).
# You can change the value of ccp_alpha to observe different levels of pruning:
# a larger alpha penalizes tree size more strongly and yields a smaller tree.
tree_classifier = DecisionTreeClassifier(ccp_alpha=0.04, random_state=42)

# Train the classifier on the training data
tree_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = tree_classifier.predict(X_test)

# Calculate the accuracy of the predictions (fraction of correct labels)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Part II: Use the cross-validation error as the criterion to find the best alpha by grid search

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
# Part II: grid-search ccp_alpha (and min_samples_split) by 5-fold
# cross-validated accuracy, then report the selected values, the test
# accuracy, and the size of the resulting tree.

# Define the parameter grid for ccp_alpha and min_samples_split.
# NOTE(review): the bounds below are a choice; 100 alphas x 24 split sizes
# gives the 2400 candidate combinations referred to further down.
param_grid = {
    'ccp_alpha': np.linspace(0.0, 0.2, 100),
    'min_samples_split': range(2, 26),
}

# Create a GridSearchCV object
# remember tree_classifier = DecisionTreeClassifier(ccp_alpha=0.04, random_state=42) above;
# the grid search overrides ccp_alpha on clones of that estimator.
grid_search = GridSearchCV(tree_classifier, param_grid, cv=5, scoring='accuracy')

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get all the results, which is a dictionary in Python
# results is a dictionary: include key 'param_ccp_alpha' and 'mean_test_score', each having 2400 values
results = grid_search.cv_results_

# Find the best score which is the (mean cross-validated) accuracy
best_score = grid_search.best_score_

# Find all ccp_alphas with the best score
# First find the entries whose mean_test_score matches the best score found
# Then find the corresponding alpha
# (best_score_ is taken from mean_test_score itself, so exact equality is safe here)
best_ccp_alphas = results['param_ccp_alpha'][results['mean_test_score'] == best_score]

# Select the largest ccp_alpha with the best score:
# among equally accurate candidates, the largest alpha gives the smallest tree.
best_ccp_alpha = max(best_ccp_alphas)
print(f"Best ccp_alpha: {best_ccp_alpha}")

best_min_samples_split = grid_search.best_params_['min_samples_split']
print(f"Best min_samples_split: {best_min_samples_split}")

# Evaluate the model on the test data.
# score() uses the refitted best estimator (grid_search.best_params_), which may
# carry a smaller ccp_alpha than best_ccp_alpha when several candidates tie.
test_accuracy = grid_search.score(X_test, y_test)
print(f"Test accuracy with best ccp_alpha: {test_accuracy * 100:.2f}%")

# Fit a new tree with the best ccp_alpha to find the tree size
best_tree = DecisionTreeClassifier(random_state=0, ccp_alpha=best_ccp_alpha)
best_tree.fit(X_train, y_train)

# Print the size of the best tree
print(f"Size of the best tree (number of leaves): {best_tree.get_n_leaves()}")
print(f"Size of the best tree (number of nodes): {best_tree.tree_.node_count}")




Then, we visualize how the tree size is related to the cross-validation (CV) error and the test error

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score

# Part III: walk the cost-complexity pruning path and, for every effective
# alpha, record the tree size, the test error and the 5-fold CV error, then
# plot both errors against tree size.

# Get the cost complexity pruning path
# Since we do not know the ccp_alpha now, we start with a simple (unpruned) tree model
tree_classifier = DecisionTreeClassifier(random_state=0)
# computes the "effective alphas" and the corresponding impurities for each step of the pruning process.
# The number of alphas will be equal to the number of nodes that can be pruned, plus one for the unpruned tree.
# The exact number depends on the structure and depth of the tree determined by the data characteristics
path = tree_classifier.cost_complexity_pruning_path(X_train, y_train)
# "Impurity" is a measure used to decide how to split the data at each node. Common measures of
# impurity include the Gini impurity for classification trees and mean squared error for regression trees
ccp_alphas, impurities = path.ccp_alphas, path.impurities  # each has shape (K,)

# Perform cross-validation for each alpha and get the number of leaves
cv_errors = []
test_errors = []
tree_sizes = []
# Let's loop for alpha based on the ccp_alphas found from cost complexity pruning above
for ccp_alpha in ccp_alphas:
    # ccp_alpha must be non-negative; clamp in case round-off produced a tiny negative value
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=max(ccp_alpha, 0.0))
    clf.fit(X_train, y_train)  # Fit the model to get the tree size
    tree_sizes.append(clf.get_n_leaves())  # Get the number of leaves J
    # the classifier's performance is evaluated on the test set (X_test, y_test) using the score method
    test_errors.append(1 - clf.score(X_test, y_test))  # Test error
    # Below is for tree size selection based on CV
    # the error is 1 - accuracy. This gives us the proportion of incorrect predictions over the cross-validation process
    # cross_val_score trains the model on 4 folds, and evaluates it on the 1 remaining fold.
    # This process is repeated 5 times, each time with a different fold held out for validation.
    # The function returns the accuracy scores for each fold
    scores = cross_val_score(clf, X_train, y_train, cv=5)  # 5-fold cross-validation
    # Record error (1 - mean accuracy over the folds) for each alpha
    cv_errors.append(1 - scores.mean())  # CV error to be used to determine alpha or tree size


# Plotting
plt.figure(figsize=(10, 6))
plt.plot(tree_sizes, cv_errors, marker='o', label='CV Error', color='blue')
plt.plot(tree_sizes, test_errors, marker='o', label='Test Error', color='orange')
plt.xlabel("Tree Size (Number of Leaves)")
plt.ylabel("Misclassification Error")
plt.title("Tree Size vs. Misclassification Error")
plt.legend()
plt.show()


In [None]:
# Question: Based on the curve above, which tree size should be used?

