In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter candidates to search exhaustively:
# 5 depths x 4 split sizes x 4 leaf sizes x 2 criteria = 160 combinations.
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    criterion=['gini', 'entropy'],
)

# Baseline estimator; the fixed seed keeps tie-breaking between splits reproducible.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive search scored by accuracy under 5-fold cross-validation,
# using every available core (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Keep the refitted winner for later cells and display it as the cell output.
best_dt = grid_search.best_estimator_
best_dt

## Cost-Complexity Pruning Approach

In [None]:
import matplotlib.pyplot as plt

# Generate pruning path: the effective alphas at which the tree grown on the
# training data would lose another subtree.
path = dt.cost_complexity_pruning_path(X_train, y_train)
# Round-off can leave an alpha marginally below zero, and DecisionTreeClassifier
# rejects negative ccp_alpha values, so clamp at 0 before sweeping.
alphas = path['ccp_alphas'].clip(min=0.0)

# Test different alpha values: refit one pruned tree per alpha and score it on
# the held-out validation split.
dt_scores = []
for alpha in alphas:
    dt_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    dt_pruned.fit(X_train, y_train)
    dt_scores.append(dt_pruned.score(X_val, y_val))

# Select the optimum. Among alphas tied for the best validation accuracy, take
# the largest one, since stronger pruning gives the simpler tree.
best_score = max(dt_scores)
best_alpha = max(a for a, s in zip(alphas, dt_scores) if s == best_score)
print(f"Best ccp_alpha: {best_alpha:.6f} (validation accuracy: {best_score:.4f})")

# Plot alpha vs accuracy; a step plot is used because the pruned tree only
# changes at the listed alpha values.
plt.plot(alphas, dt_scores, marker='o', drawstyle='steps-post')
plt.axvline(best_alpha, linestyle='--', color='gray', label=f'best alpha = {best_alpha:.6f}')
plt.xlabel('Alpha')
plt.ylabel('Accuracy')
plt.title('Validation accuracy vs. ccp_alpha')
plt.legend()
plt.show()

## Visualization for Tuning
Visualizing tree structure and performance metrics helps with parameter selection:

In [None]:
from sklearn import tree
from sklearn.base import clone
import matplotlib.pyplot as plt

# Plot tree
# `dt` is not fitted by any of the cells shown here (grid search and the
# pruning-path helper fit internal copies), and plot_tree requires a fitted
# estimator. Fit a clone on the training data so `dt` itself stays untouched.
dt_fitted = clone(dt).fit(X_train, y_train)

plt.figure(figsize=(20,10))
tree.plot_tree(dt_fitted, filled=True, feature_names=feature_names, class_names=class_names)

# Plot learning curves to detect overfitting
# numpy is imported here so the cell does not depend on an import made elsewhere.
import numpy as np
from sklearn.model_selection import learning_curve

# Cross-validated scores at 10 training-set sizes, from 10% to 100% of the data.
train_sizes, train_scores, test_scores = learning_curve(
    dt, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))

# Average over the CV folds (axis=1) to get one point per training-set size.
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='Training score')
plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', label='Cross-validation score')
plt.xlabel('Training set size')
plt.ylabel('Score')
plt.title('Learning curve')
# Without an explicit legend call the label= values above are never drawn.
plt.legend(loc='best')