# Random Forests

##  Random Forest Regressor

<font color='Blue'><b>Example</b></font>. This example uses the Auto MPG dataset, retrieved from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/dataset/9/auto+mpg).

In [None]:
try:
    from ucimlrepo import fetch_ucirepo
except ImportError:
    !pip3 install -U ucimlrepo
    from ucimlrepo import fetch_ucirepo
import numpy as np

# Download the Auto MPG dataset from the UCI repository
auto_mpg = fetch_ucirepo(name='Auto MPG')

# Features with incomplete rows removed, and the (still unfiltered) targets
X = auto_mpg.data.features.dropna(axis=0, how='any')
y = auto_mpg.data.targets

# Keep only the rows whose index labels are present in both X and y
X, y = X.align(y, join='inner', axis=0)

# The regression target is the natural logarithm of mpg
y = np.log(y['mpg'])
y.name = 'ln(mpg)'

# Show the prepared features, the target and a per-column summary
print('X:')
display(X)
print('\ny:', y, '\nInfo:', sep='\n')
X.info()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25)

# One-column table with the number of rows in each subset, shown transposed
set_size_df = pd.DataFrame([len(X_train), len(X_test)],
                           index=['Train', 'Test'], columns=['Size'])
display(set_size_df.T)

Our goal is to create a Random Forest Regression model with a fixed specification: the forest consists of four decision trees, each constrained to a maximum of three leaf nodes. The intention behind these parameter choices is to build a small ensemble of decision trees that work together to make accurate regression predictions. The restriction on the number of leaf nodes in each tree serves to manage the overall complexity of the model. The next step is to train the model on the training data, where `X_train` holds the features and `y_train` holds the corresponding target values. Once training is complete, the fitted model also reports the importance of each feature, which contributes to a more complete understanding of its predictive behaviour.

In [None]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score

# Set the custom style for plotting
plt.style.use('https://raw.githubusercontent.com/HatefDastour/ENSF444/main/Files/mystyle.mplstyle')

# Load the diabetes dataset
# NOTE(review): nothing in this cell uses the diabetes objects (the forest below is
# fitted on X_train / y_train); they are kept only so these names stay defined.
diabetes_data = load_diabetes()
X_diabetes = pd.DataFrame(diabetes_data.data, columns=diabetes_data.feature_names)
y_diabetes = pd.Series(diabetes_data.target, name='target')

# Forest of four trees, each limited to three leaf nodes
rfr = RandomForestRegressor(n_estimators=4, random_state=0, max_leaf_nodes=3)
rfr.fit(X_train, y_train)

# One subplot per tree, flattened so the axes can be zipped with the estimators
fig, ax = plt.subplots(2, 2, figsize=(11, 8))
ax = ax.ravel()

# Per-tree feature importances in percent, keyed by the column label of the final table
importance_by_estimator = {}

# Plot every tree, report its MSE and collect its feature importances.
# The loop variable is named tree_ax so that it does not rebind the `ax` array.
for i, (estimator, tree_ax) in enumerate(zip(rfr.estimators_, ax), start=1):
    tree.plot_tree(estimator, ax=tree_ax, feature_names=X.columns.tolist(), filled=True,
                   fontsize=11, rounded=True)

    tree_ax.set_title(f'Estimator {i}', fontsize=14, weight='bold')

    # MSE of this single tree on both subsets; plain arrays are passed to the tree
    # (presumably to avoid feature-name warnings from the individual estimators)
    mse_train = metrics.mean_squared_error(y_train, estimator.predict(X_train.values))
    mse_test = metrics.mean_squared_error(y_test, estimator.predict(X_test.values))
    txt = f'MSE (Train) = {mse_train:.5f}\nMSE (Test) = {mse_test:.5f}'
    print(f'\nEstimator {i}:\n'+ txt)

    # Show the same MSE values in a box just below the tree
    tree_ax.text(0.4, -0.05, txt,
                 transform=tree_ax.transAxes, fontsize=11, weight='bold',
                 bbox=dict(facecolor='#dfc8f0', alpha=0.7))

    importance_by_estimator[f'Estimator {i}'] = 100*estimator.feature_importances_

# Build the table once: rows are features, columns are estimators
feat_importance_df = pd.DataFrame(importance_by_estimator, index=X.columns)

# Ensure tight layout for subplots
plt.tight_layout()

# Shade each row on a fixed 0-100 scale and show two decimal places
styled_importance = feat_importance_df.style.\
                    background_gradient(cmap='Reds', axis=1, vmin=0, vmax=100).format(precision=2)

# Display the styled DataFrame
print('\nFeature Importance:')
display(styled_importance)

In [None]:
# Forest-level MSE on the training and test sets
mse_train = metrics.mean_squared_error(y_train, rfr.predict(X_train))
mse_test = metrics.mean_squared_error(y_test, rfr.predict(X_test))
txt = f'MSE (Train) = {mse_train:.5f}\nMSE (Test) = {mse_test:.5f}'
print(txt)

# Feature importances of the whole forest, expressed in percent
Importance = pd.DataFrame({'Importance': 100*rfr.feature_importances_}, index=X.columns)

# Shaded table with two decimal places
styled_importance = (
    Importance.style
    .background_gradient(cmap='Oranges', subset=['Importance'])
    .format({'Importance': '{:.2f}'})
)
display(styled_importance)

# Bar chart of the same importances
fig, ax = plt.subplots(figsize=(6, 5))
bars = ax.bar(Importance.index, Importance['Importance'],
              color='#f9cb9c', edgecolor='#cc0000', hatch="\\\\", lw=2, zorder=2)

# Axis labels share one text style; the title is lifted slightly above the axes
label_style = dict(fontsize=12, weight='bold', color='#191970')
ax.set_xlabel('Features', **label_style)
ax.set_ylabel('Importance', **label_style)
ax.set_title('Feature Importance', y=1.05,
             fontsize=16, weight='bold', color='#2F4F4F')

# Fixed y-range of 0-80 (percent) and tilted feature names on the x-axis
ax.set_ylim(0, 80)
tick_style = dict(labelsize=12, color='#696969')
ax.tick_params(axis='x', rotation=45, **tick_style)
ax.tick_params(axis='y', **tick_style)

# Hide the top/right frame lines and grey out the remaining ones
ax.spines[['top', 'right']].set_visible(False)
ax.spines[['bottom', 'left']].set_color('#696969')
ax.grid(axis='x')

plt.tight_layout()

### Predictions

In [None]:
# Take the second-to-last test row, kept as a one-row DataFrame
sample_x_test = X_test.iloc[-2:-1]
display(sample_x_test)

# Ask every tree in the forest for its own prediction of that row
pred_list = [estimator.predict(sample_x_test.values)[0] for estimator in rfr.estimators_]

for i, pred_ in enumerate(pred_list, start=1):
    print(f'Prediction from Estimator {i} = {pred_:.6f}')

# Compare the average of the individual trees with the forest's own prediction
print(f'\nMean Prediction from All Estimators = {np.mean(pred_list):.6f}')
print(f'\nPrediction from RFR = {rfr.predict(sample_x_test)[0]:.6f}')

### Number of Estimators

In the following code, we assess the performance of Random Forest Regressors for several values of the maximum number of leaf nodes (`max_leaf_nodes`) and of the number of estimators (`n_estimators`). For each combination, a model is trained on the training set and its mean squared error on both the training and test sets is recorded.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from ucimlrepo import fetch_ucirepo
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Three forests that differ only in the cap on the number of leaf nodes per tree
ensemble_regrs = [
    (f"RandomForestRegressor, max_leaf_nodes={n_leaves}",
     RandomForestRegressor(random_state=0, max_leaf_nodes=n_leaves))
    for n_leaves in (5, 7, 11)
]

# For every forest label, a list of (n_estimators, MSE) pairs:
# one dictionary for the training set and one for the test set
train_error = OrderedDict((label, []) for label, _ in ensemble_regrs)
test_error = OrderedDict((label, []) for label, _ in ensemble_regrs)

# Range of `n_estimators` values to explore (both ends included, in steps of 5)
min_estimators = 15
max_estimators = 120

for label, regr in ensemble_regrs:
    for i in range(min_estimators, max_estimators + 1, 5):
        # Fit this forest with i trees on the training data
        regr.set_params(n_estimators=i)
        regr.fit(X_train, y_train)

        # Training-set MSE for this setting
        train_pred = regr.predict(X_train)
        train_mse = mean_squared_error(y_train, train_pred)
        train_error[label].append((i, train_mse))

        # Test-set MSE for this setting
        test_pred = regr.predict(X_test)
        test_mse = mean_squared_error(y_test, test_pred)
        test_error[label].append((i, test_mse))

In [None]:
# Two stacked panels sharing the x-axis: training error on top, test error below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 6), sharex=True)

panels = (
    (ax1, train_error, "Mean Squared Error\n(Train Set)"),
    (ax2, test_error, "Mean Squared Error\n(Test Set)"),
)
for panel, errors, y_label in panels:
    # One curve per forest configuration
    for label, regr_err in errors.items():
        xs, ys = zip(*regr_err)
        panel.plot(xs, ys, lw=2, label=label)

    # Logarithmic error axis, x-range limited to the explored n_estimators values
    panel.set_yscale('log')
    panel.set_xlim(min_estimators, max_estimators)
    panel.set_ylabel(y_label)

# Only the bottom panel carries the x-axis label
ax2.set_xlabel("n_estimators")

# A single legend, attached to the top panel and placed above it
ax1.legend(loc="upper left", bbox_to_anchor=(0.56, 1.35))

plt.tight_layout()

## Random Forest Classifier

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import load_wine
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Apply the matplotlib style sheet referenced by URL
plt.style.use('https://raw.githubusercontent.com/HatefDastour/ENSF444/main/Files/mystyle.mplstyle')

# Wine dataset: feature matrix, class labels and the feature names
wine_data = load_wine()
feature_names = wine_data.feature_names
X, y = wine_data.data, wine_data.target

# Features and class label combined in one table, for inspection only
df = pd.DataFrame(X, columns=feature_names)
df['class'] = y
display(df)

# Textual description shipped with the dataset
print(wine_data.DESCR)

In [None]:
# Stratified 80/20 split: `stratify=y` keeps the class proportions in both subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Forest of four trees, each limited to three leaf nodes
rfc = RandomForestClassifier(n_estimators=4, random_state=0, max_leaf_nodes=3)
rfc.fit(X_train, y_train)

# Accuracy of each individual tree on both subsets.
# accuracy_score takes (y_true, y_pred); the true labels go first, as in the
# forest-level evaluation elsewhere in this notebook.
for i, estimator in enumerate(rfc.estimators_, start=1):
    accuracy_train = metrics.accuracy_score(y_train, estimator.predict(X_train))
    accuracy_test = metrics.accuracy_score(y_test, estimator.predict(X_test))
    txt = f'Estimator {i}: Accuracy (Train) = {accuracy_train:.4f}, Accuracy (Test) = {accuracy_test:.4f}'
    print(txt)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import plot_tree

# Plot decision trees in a 2 by 2 layout
fig, axes = plt.subplots(2, 2, figsize=(13, 9))
axes = axes.ravel()

for i, (estimator, ax) in enumerate(zip(rfc.estimators_, axes), start=1):
    # Draw one tree per subplot. `class_names` is converted to a plain list because
    # wine_data.target_names is a NumPy array and some scikit-learn releases accept
    # only a list here (the regressor cell already passes feature names as a list).
    plot_tree(estimator, filled=False, feature_names=feature_names,
              class_names=list(wine_data.target_names),
              ax=ax, fontsize=11, impurity=True, rounded=True, proportion=True)
    ax.set_title(f'Estimator {i}', fontsize=14, weight='bold')

# Add a super title for the entire figure
fig.suptitle('Decision Trees', fontsize=16, weight='bold')

# Adjust layout and display the plots
plt.tight_layout()

In [None]:
# Forest-level accuracy on the training and test sets
accuracy_train = metrics.accuracy_score(y_train, rfc.predict(X_train))
accuracy_test = metrics.accuracy_score(y_test, rfc.predict(X_test))
txt = f'Accuracy Score(Train) = {accuracy_train:.5f}\nAccuracy Score(Test) = {accuracy_test:.5f}'
print(txt)

# Feature importances of the whole forest, expressed in percent
Importance = pd.DataFrame({'Importance': 100*rfc.feature_importances_}, index=feature_names)

# Shaded table with two decimal places
styled_importance = (
    Importance.style
    .background_gradient(cmap='Oranges', subset=['Importance'])
    .format({'Importance': '{:.2f}'})
)
display(styled_importance)

# Bar chart of the same importances
fig, ax = plt.subplots(figsize=(6, 5))
bars = ax.bar(Importance.index, Importance['Importance'],
              color='#f9cb9c', edgecolor='#cc0000', hatch="\\\\", lw=2, zorder=2)

# Axis labels share one text style; the title is lifted slightly above the axes
label_style = dict(fontsize=12, weight='bold', color='#191970')
ax.set_xlabel('Features', **label_style)
ax.set_ylabel('Importance', **label_style)
ax.set_title('Feature Importance', y=1.05,
             fontsize=16, weight='bold', color='#2F4F4F')

# Fixed y-range of 0-40 (percent) and vertical feature names on the x-axis
ax.set_ylim(0, 40)
tick_style = dict(labelsize=12, color='#696969')
ax.tick_params(axis='x', rotation=90, **tick_style)
ax.tick_params(axis='y', **tick_style)

# Hide the top/right frame lines and grey out the remaining ones
ax.spines[['top', 'right']].set_visible(False)
ax.spines[['bottom', 'left']].set_color('#696969')
ax.grid(axis='x')

plt.tight_layout()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Apply the matplotlib style sheet referenced by URL (fetching it needs network access);
# replace the URL with a local path to use your own style file
plt.style.use('https://raw.githubusercontent.com/HatefDastour/ENSF444/main/Files/mystyle.mplstyle')

def plot_cm(model, X_train, X_test, y_train, y_test, class_names, figsize=(12, 8), title='Confusion Matrices'):
    """Plot the train and test confusion matrices of a fitted classifier side by side.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted classifier used to predict both datasets.
    X_train, X_test : array-like
        Feature data of the training and test sets, passed to ``model.predict``.
    y_train, y_test : array-like
        True labels of the training and test sets.
    class_names : sequence
        Tick labels shown on both matrices (forwarded as ``display_labels``).
    figsize : tuple, default (12, 8)
        Size of the figure holding the two matrices.
    title : str, default 'Confusion Matrices'
        Super title of the figure.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    fig, ax = plt.subplots(1, 2, figsize=figsize)

    # (features, true labels, panel name, colour map) for the left and right panels
    panels = (
        (X_train, y_train, 'Train', 'Greens'),
        (X_test, y_test, 'Test', 'Blues'),
    )

    for panel_ax, (features, targets, dataset_name, cmap) in zip(ax, panels):
        # Confusion matrix of the model's predictions on this dataset
        cm = confusion_matrix(targets, model.predict(features))

        # Draw it on the panel with large cell text and no colour bar
        ConfusionMatrixDisplay(cm, display_labels=class_names).plot(
            ax=panel_ax, im_kw=dict(cmap=cmap), text_kw={"size": 16}, colorbar=False)
        panel_ax.set_title(f'{dataset_name} Data')
        panel_ax.grid(False)

        # Rotate x-axis labels
        panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation=45, ha='right')

    # Add a super title for the entire figure
    fig.suptitle(title, fontsize=16, weight='bold')

    # Adjust the layout for better spacing
    plt.tight_layout()

# Confusion matrices (train and test) for every individual tree of the forest.
# Only rfc has to be defined before this point: plot_cm creates its own figure on
# each call, so no axes from an earlier cell are needed.
for i, estimator in enumerate(rfc.estimators_, start=1):
    plot_cm(estimator, X_train, X_test, y_train, y_test, class_names=wine_data.target_names,
            title=f'Confusion Matrices for Estimator {i}', figsize=(6, 3))

In [None]:
# Confusion matrices (train and test) for the forest rfc as a whole, as opposed to
# the per-tree matrices drawn from rfc.estimators_
plot_cm(rfc, X_train, X_test, y_train, y_test,
        class_names = wine_data.target_names, figsize=(7, 4), title='Confusion Matrices for RFC')

### Predictions

In [None]:
# Take the second-to-last test row; slicing keeps it two-dimensional (one row)
sample_x_test = X_test[-2:-1]
display(pd.DataFrame(sample_x_test, columns=feature_names))

# Will hold the class name predicted by each individual tree
pred_list = list()

# Function to find the most frequent item in a list
def most_frequent_item(lst):
    """Return the value that occurs most often in ``lst``.

    ``np.unique`` returns the distinct values in sorted order together with their
    counts, and ``argmax`` picks the first maximum, so a tie is resolved in favour
    of the smallest value in sort order. The result is a NumPy scalar.
    """
    values, frequencies = np.unique(lst, return_counts=True)
    return values[np.argmax(frequencies)]

# Ask every tree for its prediction and translate the predicted index into a class name
for i, estimator in enumerate(rfc.estimators_, start=1):
    pred_ = wine_data.target_names[int(estimator.predict(sample_x_test)[0])]
    pred_list.append(pred_)
    print(f'Prediction from Estimator {i} = {pred_}')

# Most frequent class name among the individual trees
print(f'\nMode Prediction from All Estimators = {most_frequent_item(pred_list)}')

# Prediction of the forest itself, translated to a class name in the same way.
# NOTE(review): per the scikit-learn documentation the forest averages the trees'
# class probabilities instead of taking a hard majority vote, so this result is
# not guaranteed to equal the mode printed above - confirm if that matters here.
pred_ = wine_data.target_names[int(rfc.predict(sample_x_test)[0])]
print(f'\nPrediction from RFC = {pred_}')