In [None]:
# !pip install scikit-learn --upgrade --force-reinstall

# Hyperparameter Tuning


## Splitting a Dataset into Train and Test sets

In the context of Machine Learning, there are two main approaches for splitting a dataset into train and test sets.

1. The first one is to use a simple train-test split, where you randomly divide the data into two subsets: one for training the model and one for evaluating its performance.

<center>
<img src="https://raw.githubusercontent.com/HatefDastour/ENSF444/12326debd66856cf557346ba47e221f0a43aa55a/Images/TrainTest_Split.png" alt="picture" width="900">
<br>
<b>Figure</b>: Train and Test splitting of a dataset.
</center>

2. The second one is to use a train-validation-test split, where you further split the training set into two subsets: one for training the model and one for tuning its hyperparameters. The test set is only used for the final evaluation of the model.

<center>
<img src="https://raw.githubusercontent.com/HatefDastour/ENSF444/12326debd66856cf557346ba47e221f0a43aa55a/Images/TrainValTest_Split.png" alt="picture" width="900">
<br>
<b>Figure</b>: Train, Validation and Test splitting of a dataset.
</center>

The advantage of the second approach is that it allows you to tune the model's hyperparameters without using the test set, which reduces the risk of overfitting to the test set and keeps the final evaluation an honest estimate of the model's generalization ability. The disadvantage is that it reduces the amount of data available for training the model, which may affect its accuracy.

## Searching for Optimal Hyperparameters

<font color='Blue'><b>Example:</b></font>

In [None]:
import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

colors = ["#f5645a", "#0096ff", '#B2FF66']
edge_colors = ['#8A0002', '#2e658c', '#6A993D']
markers = ['o', '*', 's']

import numpy as np
import matplotlib.pyplot as plt

def plot_class_distribution(ax, y,
                            colors = colors,
                            edge_colors = edge_colors,
                            title = 'Class Distribution'):
    """Draw a bar chart of the class frequencies of ``y`` on ``ax``.

    Each bar is annotated with the share of samples (in percent) that
    belongs to the corresponding class.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes on which the bars are drawn.
    y : array-like
        Class labels; one bar is drawn per unique value. The label value
        itself is used as the bar's x-position.
    colors : list
        Fill colours passed to ``ax.bar``.
    edge_colors : list
        Edge colours passed to ``ax.bar``.
    title : str
        Title of the subplot.
    """
    # np.unique returns the sorted unique labels and how often each one occurs
    class_labels, class_counts = np.unique(y, return_counts=True)

    # One bar per class: x-position is the label, height is the sample count
    bars = ax.bar(class_labels, class_counts, color=colors, edgecolor=edge_colors)

    # Put a tick under every bar and label it with the class value
    # (a real list is passed, because set_xticklabels expects a sequence)
    ax.set_xticks(class_labels)
    ax.set_xticklabels([str(label) for label in class_labels])

    # Add grid and title
    ax.grid(which='major', axis='y')
    ax.set_title(title, weight='bold', fontsize=16, y=1.02)

    # Annotate each bar with its share of all samples, slightly above the bar top
    n_samples = len(y)
    for bar in bars:
        height = bar.get_height()
        percentage = 100 * height / n_samples
        ax.annotate(f'{percentage:.1f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords='offset points', ha='center', fontsize=12)

# Build a synthetic, imbalanced three-class dataset (500 / 300 / 200 samples)
X, y = make_blobs(
    n_samples=[500, 300, 200],
    centers=[[0, 0], [2, 2], [4, 4]],
    n_features=2,
    cluster_std=[1.0, 1, .6],
    random_state=0,
)

# Matplotlib figure: scatter plot in the wide left panel,
# class distribution in the narrow right panel
fig, ax = plt.subplots(1, 2, figsize=(10, 7), gridspec_kw={'width_ratios': [8, 2]})

# Draw each class with its own fill colour, edge colour and marker
for num in np.unique(y):
    ax[0].scatter(X[y == num, 0], X[y == num, 1],
                  c=colors[num], ec=edge_colors[num], marker=markers[num],
                  s=40, label=str(num))

# Cosmetics of the scatter panel
ax[0].set_title('Synthetic Dataset', weight = 'bold', fontsize = 16, y = 1.02)
ax[0].set(xlim = [-4, 6], ylim = [-4, 6], xlabel = 'Feature 1', ylabel = 'Feature 2')
ax[0].legend(title='Class', fontsize=14)
ax[0].grid(True)

# Bar chart of the class frequencies in the right panel
plot_class_distribution(ax[1], y, colors = colors, edge_colors = edge_colors)

plt.tight_layout()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set first; the remainder (train+validation) is kept for model building.
# Stratifying on y keeps the class proportions the same in both parts.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Split the train+validation part again into the actual training set and a validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, stratify=y_trainval, random_state=0)

# Print the sizes of the training, validation, and test sets
print("Size of training set: {}   size of validation set: {}   size of test set: {}"
      "\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

# One subplot per subset, sharing the y-axis so the bar heights are comparable
fig, ax = plt.subplots(1, 3, figsize=(10, 5), sharey=True)

# Plot the class distribution of each subset on its own panel.
# The loop variable is named `panel` so that it does not overwrite the `ax` array above.
for panel, y_set, title, textcolor in zip(ax, [y_train, y_valid, y_test], ['Train', 'Validation', 'Test'],
                                          ['#0f4539', '#ff9317', '#2986cc']):
    # Plot the class distribution on the current subplot
    plot_class_distribution(ax=panel, y=y_set, title=title)

    # Same y-range on every panel for easier comparison
    panel.set_ylim([0, 300])

    # Restyle the subplot title with a subset-specific colour
    panel.set_title(title, fontdict={'size': 12, 'weight': 'bold', 'color': textcolor})

# Set the overall title for the entire figure
fig.suptitle('Class Distribution', weight='bold', fontsize=12)

# Adjust layout for better visualization
plt.tight_layout()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

# Fit a Random Forest with default hyperparameters (fixed seed) on the
# combined Train and Validation Sets
rfc = RandomForestClassifier(random_state=0).fit(X_trainval, y_trainval)

# Accuracy on the data the model was fitted on, and on the held-out Test Set
train_score, test_score = (
    rfc.score(features, labels)
    for features, labels in ((X_trainval, y_trainval), (X_test, y_test))
)

# Report both accuracy scores
print(f"Accuracy Score (Train and Validation Sets combined): {train_score:.4f}")
print(f"Accuracy Score (Test Set): {test_score:.4f}")

# Show the hyperparameters the classifier was built with
print('\nDefault Parameters:')
pprint(rfc.get_params(deep=True))

In [None]:
import pandas as pd

# Best validation accuracy seen so far and the hyperparameters that produced it
best_score, best_parameters = 0, {}

# Candidate values for each Random Forest hyperparameter
param_grid = {'n_estimators': [100, 150, 200],
              'max_depth': [3, 5, 7]}

# Exhaustively try every (n_estimators, max_depth) combination
for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        candidate = {'n_estimators': n_estimators, 'max_depth': max_depth}

        # Fit on the training set only ...
        rfc = RandomForestClassifier(random_state=0, **candidate).fit(X_train, y_train)

        # ... and judge the candidate on the validation set
        score = rfc.score(X_valid, y_valid)

        # Keep the candidate only if it strictly improves on the best so far
        if score > best_score:
            best_score, best_parameters = score, candidate

# Report the winning candidate and its validation accuracy
print("Best score on validation set: {:.4f}".format(best_score))
print("Best parameters: ", best_parameters)

In [None]:
# Refit a Random Forest with the best hyperparameters from the manual search,
# this time on the combined Train and Validation Sets
rfc = RandomForestClassifier(random_state=0, **best_parameters).fit(X_trainval, y_trainval)

# Accuracy on the held-out Test Set and on the data used for fitting
test_score = rfc.score(X_test, y_test)
train_score = rfc.score(X_trainval, y_trainval)

# Report both accuracy scores
print(f"Accuracy Score (Train and Validation Sets combined): {train_score:.4f}")
print(f"Accuracy Score (Test Set): {test_score:.4f}")

# Show the hyperparameters of the refitted model
print('\nCurrent Parameters:')
pprint(rfc.get_params(deep=True))

The “best” parameters may not actually be the best parameters. They are only the best among the combinations we tried. If we refine the search (for example, by trying more values per hyperparameter or by tuning additional hyperparameters), we may find better hyperparameters (better in terms of overall accuracy).

## Cross-validated Grid-search

GridSearchCV is a scikit-learn class that performs **hyperparameter optimization** by training and evaluating a machine learning model using different combinations of hyperparameters. The best set of hyperparameters is then selected based on a specified performance metric.

GridSearchCV has two main advantages:

- It applies a **grid search** to an array of hyperparameters, which means it tries every possible combination of the values you specify for each hyperparameter.
- It **cross-validates** your model using **k-fold cross-validation**, which means it splits your data into k subsets and uses one subset as the test set and the rest as the training set. It repeats this process k times, each time using a different subset as the test set. This helps to avoid overfitting the hyperparameters to a single validation split and gives a more reliable estimate of the model's performance.

To use GridSearchCV, you typically set the following parameters (only **estimator** and **param_grid** are required; the others have default values):

- **estimator**: the machine learning model you want to tune, such as a classifier or a regressor.
- **param_grid**: a dictionary or a list of dictionaries with the hyperparameter's names as keys and the values to try as values.
- **scoring**: a string, a callable, a list, or a dictionary that defines the metric or metrics to use to evaluate the model's performance.
- **n_jobs**: an integer that specifies the number of parallel jobs to run. -1 means using all processors.
- **refit**: a boolean, a string, or a callable that determines whether to refit the model with the best-found parameters on the whole dataset and make it available as the **best_estimator_** attribute.

<font color='Blue'><b>Example:</b></font>

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split the data into training and test sets (stratified, 25% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

# Create Random Forest Classifier instance
rfc = RandomForestClassifier(random_state=0)

# Define the parameter grid for Random Forest
param_grid = dict(n_estimators=[100, 150, 200],
                  max_depth=[3, 5, 7])

# Initialize GridSearchCV: every grid combination is scored with 5-fold cross-validation
grid_search = GridSearchCV(estimator = rfc,
                           param_grid = param_grid,
                           cv=5,
                           scoring='accuracy')

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Display the best parameters and the corresponding score.
# best_score_ is the mean cross-validated score of the best candidate,
# not the accuracy measured on the full training data.
print("Best Parameters:", grid_search.best_params_)
print(f"Best Mean Cross-Validated Accuracy: {grid_search.best_score_:.4f}")

# Evaluate the refitted best model on the held-out test set
test_accuracy = grid_search.best_estimator_.score(X_test, y_test)
print(f"Accuracy Score on Test Data: {test_accuracy:.4f}")

In [None]:
import pandas as pd
# Turn the cross-validation results into a DataFrame, best-ranked candidates first
results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by = 'rank_test_score')
    .reset_index(drop = True)
)
display(results)

In [None]:
# Build a fresh Random Forest from the best hyperparameters found by the grid search
# and fit it on the training set
best_params = grid_search.best_params_
rfc = RandomForestClassifier(random_state=0, **best_params).fit(X_train, y_train)

# Accuracy on the training set and on the held-out test set
train_score = rfc.score(X_train, y_train)
test_score = rfc.score(X_test, y_test)

# Report the accuracy on the Train Set and on the Test Set
print(f"Accuracy Score (Train Set): {train_score:.4f}")
print(f"Accuracy Score (Test Set): {test_score:.4f}")

# Show the hyperparameters of the refitted model
print('\nCurrent Parameters:')
pprint(rfc.get_params(deep=True))

## Halving Grid Search CV

HalvingGridSearchCV is a scikit-learn estimator that performs a grid search over specified parameter values with successive halving. Successive halving is a search strategy that starts evaluating all the candidates with a small amount of resources and iteratively selects the best candidates, using more and more resources. This can be much faster than a regular grid search, especially when the number of candidates is large.

The hyperparameters of HalvingGridSearchCV are:

- `estimator`: the estimator object that implements the scikit-learn estimator interface. It must have a score function or a scoring parameter must be provided.
- `param_grid`: a dictionary or a list of dictionaries that define the parameter names and values to try. Each dictionary corresponds to a grid of parameter combinations to explore.
- `factor`: the halving parameter that determines the proportion of candidates that are selected for each subsequent iteration. For example, factor=3 means that only one third of the candidates are selected.
- `resource`: the resource that increases with each iteration. By default, it is the number of samples, but it can also be any parameter of the base estimator that accepts positive integer values, such as `n_iterations` or `n_estimators`.
- `max_resources`: the maximum amount of resource that any candidate is allowed to use for a given iteration. By default, it is set to the number of samples when resource='n_samples', otherwise it must be specified explicitly.
- `min_resources`: the minimum amount of resource that any candidate is allowed to use for a given iteration. It can be either 'exhaust', which sets it such that the last iteration uses as much resources as possible, or 'smallest', which sets it to a small heuristic value based on the problem type and the number of splits, or an integer value.
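- `aggressive_elimination`: a boolean flag that is only relevant when there are not enough resources to reduce the remaining candidates to at most `factor` after the last iteration. If True, the search "replays" the first iteration (with the minimum amount of resources) as many times as needed until the number of candidates is small enough. If False (the default), the last iteration may evaluate more than `factor` candidates.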
- `cv`: the cross-validation scheme to use for splitting the data. It can be either an integer, a cross-validation generator, or an iterable of train/test splits.
- `scoring`: the single scoring function used to evaluate the candidates (multi-metric scoring is not supported by the halving searches). It can be either a string, a callable, or None, in which case the estimator's score function is used.
- `refit`: a boolean flag or a string that determines whether to refit the estimator using the best found parameters. If True, the whole dataset is used for refitting. If a string, it must be the name of a scoring function, and the best parameters are chosen based on that score. If False, no refitting is done and the estimator is not available at the end.
- `error_score`: the value to assign to the score if an error occurs during fitting. It can be either 'raise', which will raise the error, or a numeric value, which will be used as the score.
- `return_train_score`: a boolean flag that determines whether to return the training scores along with the test scores.
- `random_state`: the random state or seed to use for reproducibility. It can be either an integer, a RandomState instance, or None, in which case the global random state is used.
- `n_jobs`: the number of jobs to run in parallel. It can be either an integer, or None, in which case one job is used.
- `verbose`: the verbosity level. It can be either an integer, or None, in which case the default verbosity is used.

In [None]:
# On Google Colab, it is recommended to upgrade scikit-learn to version 1.4.1
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split the data into training and test sets (stratified, 25% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

# Create Random Forest Classifier instance
rfc = RandomForestClassifier(random_state=0)

# Define the parameter grid for Random Forest
param_grid = dict(n_estimators=[100, 150, 200],
                  max_depth=[3, 5, 7])

# Initialize HalvingGridSearchCV (5-fold cross-validation, training scores recorded as well)
halving_grid_search = HalvingGridSearchCV(estimator = rfc,
                                          param_grid = param_grid,
                                          cv=5,
                                          return_train_score = True,
                                          scoring='accuracy',
                                          random_state = 1)

# Perform halving grid search on the training data
halving_grid_search.fit(X_train, y_train)

# Display the best parameters and the corresponding score.
# best_score_ is the mean cross-validated score of the best candidate,
# not the accuracy measured on the full training data.
print("Best Parameters:", halving_grid_search.best_params_)
print(f"Best Mean Cross-Validated Accuracy: {halving_grid_search.best_score_:.4f}")

# Evaluate the refitted best model on the held-out test set
test_accuracy = halving_grid_search.best_estimator_.score(X_test, y_test)
print(f"Accuracy Score on Test Data: {test_accuracy:.4f}")

In [None]:
# Convert the halving search's cv_results_ into a DataFrame and display it
results = pd.DataFrame(halving_grid_search.cv_results_)
display(results)

In [None]:
# Build a fresh Random Forest from the best hyperparameters found by the
# halving grid search and fit it on the training set
rfc = RandomForestClassifier(random_state=0, **halving_grid_search.best_params_)
rfc = rfc.fit(X_train, y_train)

# Accuracy on the held-out test set and on the training set
test_score = rfc.score(X_test, y_test)
train_score = rfc.score(X_train, y_train)

# Report the accuracy on the Train Set and on the Test Set
print(f"Accuracy Score (Train Set): {train_score:.4f}")
print(f"Accuracy Score (Test Set): {test_score:.4f}")

# Show the hyperparameters of the refitted model
print('\nCurrent Parameters:')
pprint(rfc.get_params(deep=True))