# Gradient Boosted Decision Trees

## Gradient Boosting Regressor

<font color='Blue'><b>Example</b></font>. This example uses the Auto MPG dataset, retrieved from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/dataset/9/auto+mpg).

In [None]:
try:
    from ucimlrepo import fetch_ucirepo
except ImportError:
    !pip3 install -U ucimlrepo
    from ucimlrepo import fetch_ucirepo
import numpy as np

# fetch dataset
auto_mpg = fetch_ucirepo(name = 'Auto MPG')

# data (as pandas dataframes)
X = auto_mpg.data.features
y = auto_mpg.data.targets

# drop rows with missing values from X
X = X.dropna(axis=0, how='any')

# align X and y by index
X, y = X.align(y, join='inner', axis=0)

# ln(mpg)
y = np.log(y['mpg'])
y.name = 'ln(mpg)'
print('X:')
display(X)
print('\ny:')
print(y)
print('\nInfo:')
X.info()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
)

# Summarise how many samples ended up in each subset and show it as a single row
set_size_df = pd.Series({'Train': len(X_train), 'Test': len(X_test)}, name='Size').to_frame()
display(set_size_df.T)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn import metrics
try:
  import sklearnex
except ImportError:
  !pip install pip install scikit-learn-intelex
  import sklearnex
from IPython.display import clear_output
clear_output()
sklearnex.patch_sklearn()

random_state = 0

plt.style.use('https://raw.githubusercontent.com/HatefDastour/ENSF444/main/Files/mystyle.mplstyle')

# Create a figure and subplots
fig, ax = plt.subplots(2, 2, figsize=(9.5, 9.5))
feature_set_labels = ["""Using sklearn's\nGradientBoostingRegressor""",
                      """Using xgboost's\nXGBRegressor"""]

reg_gb = GradientBoostingRegressor(random_state = random_state, n_estimators = 100, learning_rate = 0.1)
reg_gb.fit(X_train, y_train)

reg_xgb = xgb.XGBRegressor(random_state = random_state, n_estimators = 100, learning_rate = 0.1)
reg_xgb.fit(X_train, y_train)

# Loop through different feature sets
for i, reg in enumerate([reg_gb, reg_xgb]):
    # Train set
    y_pred_train = reg.predict(X_train)
    ax[i, 0].scatter(y_train, y_pred_train,
                     facecolors='#a7e0f7', edgecolors='#191970', alpha=0.8)
    ax[i, 0].plot([0, 1], [0, 1], '--k', lw=2, transform=ax[i, 0].transAxes)
    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    txt_train = f'MSE (Train) = {mse_train:.2e}'
    text_train = ax[i, 0].text(0.45, 0.05, txt_train,
                               transform=ax[i, 0].transAxes, fontsize=11, weight='bold',
                               bbox=dict(facecolor='Whitesmoke', alpha=0.7))
    ax[i, 0].set(ylabel='Predicted Values', xlabel='Actual Values')
    ax[i, 0].set_title(f'{feature_set_labels[i]} (Train)', fontsize=14, weight='bold')
    ax[i, 0].axis('equal')

    # Test set
    y_pred_test = reg.predict(X_test)
    ax[i, 1].scatter(y_test, y_pred_test,
                     facecolors='#9ac989', edgecolors='#217304', alpha=0.8)
    ax[i, 1].plot([0, 1], [0, 1], '--k', lw=2, transform=ax[i, 1].transAxes)
    mse_test = metrics.mean_squared_error(y_test, y_pred_test)
    txt_test = f'MSE (Test) = {mse_test:.2e}'
    text_test = ax[i, 1].text(0.45, 0.05, txt_test,
                              transform=ax[i, 1].transAxes, fontsize=11, weight='bold',
                              bbox=dict(facecolor='Whitesmoke', alpha=0.7))
    ax[i, 1].set(ylabel='Predicted Values', xlabel='Actual Values')
    ax[i, 1].set_title(f'{feature_set_labels[i]} (Test)', fontsize=14, weight='bold')
    ax[i, 1].axis('equal')

    # Print MSE values
    txt = f'MSE (Train) = {mse_train:.3e}, MSE (Test) = {mse_test:.3e}'
    print(feature_set_labels[i].replace('\n',' '))
    print(f'\t{txt}')

# Adjust layout and display the plots
plt.tight_layout()
sklearnex.unpatch_sklearn()

In [None]:
import matplotlib.pyplot as plt


# Feature importances of both fitted regressors, scaled to percentages and
# labelled with the feature names
importance_gb = pd.DataFrame(reg_gb.feature_importances_ * 100,
                             index=X.columns, columns=['Importance'])
importance_xgb = pd.DataFrame(reg_xgb.feature_importances_ * 100,
                              index=X.columns, columns=['Importance'])

# Two side-by-side panels that share one y-axis
fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
axes = axes.ravel()

# Per panel: data, title, title size, fill color, outline color, hatch, y-limits
bar_specs = [
    (importance_gb, 'Feature Importance:\nGradient Boosting Regressor', 13,
     '#99f599', '#006400', "///", [0, 100]),
    (importance_xgb, 'Feature Importance:\nXGBoost Regressor', 12,
     '#e9aaaa', '#8B0000', "\\\\", None),
]
for panel, (frame, heading, heading_size, fill, outline, hatch, y_limits) in zip(axes, bar_specs):
    panel.bar(frame.index, frame['Importance'], color=fill, edgecolor=outline, hatch=hatch)
    panel.set_title(heading, fontsize=heading_size, weight='bold', color='DarkSlateGray')
    if y_limits is not None:
        panel.set_ylim(y_limits)

# Styling shared by both panels
for panel in axes:
    panel.set_xlabel('Variable', weight='bold', color='MidnightBlue')
    panel.set_ylabel('Importance (%)', weight='bold', color='MidnightBlue')
    panel.tick_params(axis='x', rotation=90, color='DimGray')
    panel.tick_params(axis='y', color='DimGray')
    panel.spines[['top', 'right']].set_visible(False)
    panel.spines[['bottom', 'left']].set_color('DimGray')
    panel.grid(axis = 'x')

# The y-axis is shared, so only the left panel keeps its label
axes[1].set_ylabel('')

# Tidy up the spacing between the panels
plt.tight_layout()

In [None]:
# Put the two importance tables side by side, matching rows on the feature name
merged_importance = pd.merge(
    importance_gb, importance_xgb,
    left_index=True, right_index=True,
    suffixes=(' (GB)', ' (XGB)'),
)
# Make the column headers say what the numbers are
merged_importance = merged_importance.rename(
    columns=lambda label: label.replace('Importance', 'Feature Importance Percentage')
)
# Show two decimals and shade each row's cells on a fixed 0-50 color scale
importance_styler = merged_importance.style.format(precision=2)
importance_styler = importance_styler.background_gradient(cmap='YlGn', axis=1, vmin = 0, vmax =50)
display(importance_styler)

## Gradient Boosting Classifier

In [None]:
try:
    from ucimlrepo import fetch_ucirepo
except ImportError:
    !pip3 install -U ucimlrepo
    from ucimlrepo import fetch_ucirepo

from ucimlrepo import fetch_ucirepo

# fetch dataset
rice_cammeo_and_osmancik = fetch_ucirepo(id=545)

# data (as pandas dataframes)
X = rice_cammeo_and_osmancik.data.features
y = rice_cammeo_and_osmancik.data.targets

# metadata
print(rice_cammeo_and_osmancik.metadata)

# variable information
print(rice_cammeo_and_osmancik.variables)

df = pd.concat([X, y], axis = 1)
display(df)

In [None]:
# Encode the class labels as integer codes. The labels are read from the
# fetched dataset instead of from `y`, because this cell overwrites `y` with
# the codes; reading `y.Class` would fail when the cell is run a second time.
codes, uniques = pd.factorize(rice_cammeo_and_osmancik.data.targets['Class'])
y = codes
# integer code -> original class label (used to label the plots)
label_mapping = dict(enumerate(uniques))

In [None]:
# Hold out 25% of the samples for testing, stratifying on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)

import pandas as pd
import matplotlib.pyplot as plt

def _dist_plot(ax, y, label_mapping=label_mapping, CM=plt.cm.tab20c.colors, title=False):
    """
    Draw a pie chart showing how often each category occurs in ``y``.

    Parameters:
    - ax: Axes object to plot on.
    - y: Category codes to count; every distinct value must be a key of
      ``label_mapping``.
    - label_mapping: Dictionary mapping category codes to display labels.
      The default is bound to the module-level ``label_mapping`` at the time
      this function is defined.
    - CM: Sequence of colors used for the wedges.
    - title: Title for the plot. Set to False to omit.

    Returns:
    None
    """
    # Number of samples per category, most frequent first
    counts = pd.Series(y).value_counts()

    # Leave the first wedge in place and pull every other wedge out slightly.
    # The list is sized from the data so the chart works for any number of
    # categories (for two categories this is [0, 0.1]).
    explode = [0 if position == 0 else 0.1 for position in range(len(counts))]

    # Create the pie chart
    _, texts, autotexts = ax.pie(counts,
                                 labels=[label_mapping[i] for i in counts.index],
                                 autopct='%1.1f%%', startangle=140,
                                 colors=CM,
                                 explode=explode,
                                 shadow=True, wedgeprops={'edgecolor': 'whitesmoke'})

    # Optional title; equal aspect ratio keeps the pie circular
    if title:
        ax.set_title(title, fontsize=16, weight='bold')
    ax.axis('equal')

    # Emphasise both the category labels and the percentage annotations
    for annotation in [*texts, *autotexts]:
        annotation.set_fontsize(12)
        annotation.set_fontweight('bold')


# One pie per subset, side by side
fig, ax = plt.subplots(1, 2, figsize=(9, 4.5))

# Per panel: axes, class codes, wedge colors, title text, title color
pie_specs = [
    (ax[0], y_train, plt.cm.Pastel1.colors, f'Train Set (Size = {len(X_train)})', 'Green'),
    (ax[1], y_test, plt.cm.Pastel2.colors, f'Test Set (Size = {len(X_test)})', 'Blue'),
]
for panel, subset, palette, heading, heading_color in pie_specs:
    _dist_plot(panel, subset, CM = palette)
    _ = panel.set_title(heading, fontsize=12, weight='bold', color=heading_color)

_ = fig.suptitle('Distribution of Categories', fontsize=16, weight='bold')

# Tidy up the spacing between the panels
plt.tight_layout()

In [None]:
import xgboost as xgb
import sklearnex
# Patch scikit-learn before importing anything from it in this cell, so the
# imports below see the patched package.
sklearnex.patch_sklearn()
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
# ------------------------------
# Gradient Boosting Classifier
# ------------------------------

# Create a GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0)

# Fit the classifier to the training data
gb_classifier.fit(X_train, y_train)

# Score the classifier on the training data, then on the test data.
# `gb_accuracy` ends up holding the test-set score.
gb_accuracy_train = gb_classifier.score(X_train, y_train)
print(f"Gradient Boosting Classifier Accuracy (Train): {gb_accuracy_train:.4f}")
gb_accuracy = gb_classifier.score(X_test, y_test)
print(f"Gradient Boosting Classifier Accuracy (Test): {gb_accuracy:.4f}")

# ------------------------------
# XGBoost Classifier
# ------------------------------

# Create an XGBClassifier
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=0)

# Fit the classifier to the training data
xgb_classifier.fit(X_train, y_train)

# Score the classifier (via its own .score method) on the training data,
# then on the test data. `xgb_accuracy` ends up holding the test-set score.
xgb_accuracy_train = xgb_classifier.score(X_train, y_train)
print(f"XGB Classifier Accuracy (Train): {xgb_accuracy_train:.4f}")
xgb_accuracy = xgb_classifier.score(X_test, y_test)
print(f"XGB Classifier Accuracy (Test): {xgb_accuracy:.4f}")
sklearnex.unpatch_sklearn()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# 2x2 grid flattened to a 1-D array: one row per classifier, train then test
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 6))
axes = axes.ravel()

# Per panel: fitted model, features, true classes, color map, title
matrix_specs = [
    (gb_classifier, X_train, y_train, 'Greens', "GBC Train Confusion Matrix"),
    (gb_classifier, X_test, y_test, 'Blues', "GBC Test Confusion Matrix"),
    (xgb_classifier, X_train, y_train, 'Greens', "XGBC Train Confusion Matrix"),
    (xgb_classifier, X_test, y_test, 'Blues', "XGBC Test Confusion Matrix"),
]
for panel, (model, features, target, color_map, heading) in zip(axes, matrix_specs):
    ConfusionMatrixDisplay.from_estimator(model, features, target, ax=panel,
                                          colorbar = False, cmap = color_map)
    panel.set_title(heading)

# Swap the integer class codes on both axes for the original class names
for panel in axes:
    panel.grid(False)
    x_names = [label_mapping[int(tick.get_text())] for tick in panel.get_xticklabels()]
    panel.set_xticklabels(x_names)
    y_names = [label_mapping[int(tick.get_text())] for tick in panel.get_yticklabels()]
    panel.set_yticklabels(y_names)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt


# Feature importances of both fitted classifiers, scaled to percentages and
# labelled with the feature names
importance_gb = pd.DataFrame(gb_classifier.feature_importances_ * 100,
                             index=X.columns, columns=['Importance'])
importance_xgb = pd.DataFrame(xgb_classifier.feature_importances_ * 100,
                              index=X.columns, columns=['Importance'])

# Two side-by-side panels that share one y-axis
fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
axes = axes.ravel()

# Per panel: data, title, title size, fill color, outline color, hatch, y-limits
bar_specs = [
    (importance_gb, 'Feature Importance:\nGradient Boosting Classifier', 13,
     '#99f599', '#006400', "///", [0, 100]),
    (importance_xgb, 'Feature Importance:\nXGBoost Classifier', 12,
     '#e9aaaa', '#8B0000', "\\\\", None),
]
for panel, (frame, heading, heading_size, fill, outline, hatch, y_limits) in zip(axes, bar_specs):
    panel.bar(frame.index, frame['Importance'], color=fill, edgecolor=outline, hatch=hatch)
    panel.set_title(heading, fontsize=heading_size, weight='bold', color='DarkSlateGray')
    if y_limits is not None:
        panel.set_ylim(y_limits)

# Styling shared by both panels
for panel in axes:
    panel.set_xlabel('Variable', weight='bold', color='MidnightBlue')
    panel.set_ylabel('Importance (%)', weight='bold', color='MidnightBlue')
    panel.tick_params(axis='x', rotation=90, color='DimGray')
    panel.tick_params(axis='y', color='DimGray')
    panel.spines[['top', 'right']].set_visible(False)
    panel.spines[['bottom', 'left']].set_color('DimGray')
    panel.grid(axis = 'x')

# The y-axis is shared, so only the left panel keeps its label
axes[1].set_ylabel('')

# Tidy up the spacing between the panels
plt.tight_layout()