In [3]:
from sklearn import datasets # Import the datasets module from scikit-learn to load sample datasets
import pandas as pd # Import pandas for data manipulation and analysis (e.g., creating DataFrames)
from scipy.stats import randint as sp_rand # Import randint from scipy.stats for generating random integers (used in hyperparameter tuning)
from sklearn.model_selection import train_test_split # Import train_test_split to split the dataset into training and testing sets
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier for building a random forest classification model
from sklearn.metrics import accuracy_score # Import accuracy_score to evaluate the accuracy of the model
from sklearn.experimental import enable_halving_search_cv # Enable experimental halving search CV (cross-validation) methods for hyperparameter tuning
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV # Import GridSearchCV and HalvingGridSearchCV for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, HalvingRandomSearchCV # Import RandomizedSearchCV and HalvingRandomSearchCV for hyperparameter tuning
from datetime import datetime # Import datetime to record the current date and time


# Load the Breast cancer dataset from scikit-learn library and convert it to a pandas DataFrame
dataset = datasets.load_breast_cancer() # Load the Breast cancer dataset
data = pd.DataFrame(dataset.data, columns=dataset.feature_names)    # Convert the dataset to a pandas DataFrame
data['target'] = dataset.target # Add the target column to the DataFrame
data.head() # Display the first few rows of the DataFrame

# Creating dataset variables
X = dataset.data # Features
Y = dataset.target  # Target

# Splitting the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42) # Split the dataset into training and testing sets


# Manual Search
# ----------------------------Hyperparameter Space--------------------------------
params_1 = {'n_estimators': 10, # Number of trees in the forest 
            'criterion': 'entropy', # The function to measure the quality of a split
            'max_features': 15, # The number of features to consider when looking for the best split
            'min_samples_split': 6, # The minimum number of samples required to split an internal node
            'min_samples_leaf': 8, # The minimum number of samples required to be at a leaf node
            'bootstrap': True} # Whether bootstrap samples are used when building trees

params_2 = {'n_estimators': 50, # Number of trees in the forest 
            'criterion': 'entropy', # The function to measure the quality of a split
            'max_features': 30, # The number of features to consider when looking for the best split
            'min_samples_split': 10, # The minimum number of samples required to split an internal node
            'min_samples_leaf': 11, # The minimum number of samples required to be at a leaf node
            'bootstrap': True} # Whether bootstrap samples are used when building trees

params_3 = {'n_estimators': 80, # Number of trees in the forest 
            'criterion': 'gini', # The function to measure the quality of a split
            'max_features': 30, # The number of features to consider when looking for the best split
            'min_samples_split': 10, # The minimum number of samples required to split an internal node
            'min_samples_leaf': 6, # The minimum number of samples required to be at a leaf node
            'bootstrap': False} # Whether bootstrap samples are used when building trees

# ----------------------------Create and fit the model--------------------------------
model_1 = RandomForestClassifier(**params_1)
model_2 = RandomForestClassifier(**params_2)
model_3 = RandomForestClassifier(**params_3)

#Training Models
model_1.fit(X_train, Y_train)
model_2.fit(X_train, Y_train)
model_3.fit(X_train, Y_train)

# Results 
print(f'Model 1 accuracy: {model_1.score(X_test, Y_test)}')
print(f'Model 2 accuracy: {model_2.score(X_test, Y_test)}')
print(f'Model 3 accuracy: {model_3.score(X_test, Y_test)}')



# Grid Search Example
#-----------------------------Hyperparameter Space--------------------------------
h_space = {'n_estimators': [30, 60, 80, 100],
              'criterion': ['gini', 'entropy'],
              'max_features': [10, 20, 25, 30],
              'min_samples_split': [5, 11],
              'min_samples_leaf': [5, 11],
              'bootstrap': [True, False]}
# Creating and fitting the model
Random_forest_clf = RandomForestClassifier()

#Creating and training the models
#The datetime function has been used to calculate operation time
start = datetime.now()
models = GridSearchCV(Random_forest_clf, param_grid=h_space, cv=5)
models.fit(X_train, Y_train)
end = datetime.now()

# Getting 5-fold cross-validation results
scores = models.cv_results_['mean_test_score']
# Getting best hyperparameters
best_hparams = models.best_params_

print(f'Duration for Grid Search Example: {end-start}')
print(f'Best model training score for Grid Search Example: {max(scores)}')
print(f'Best hyperparameters for Grid Search Example: {best_hparams}')


#-----------------------------Training the model--------------------------------
# Training the model with the best hyperparameters from the Grid Search
best_model = RandomForestClassifier(bootstrap= True,
                                    criterion = 'entropy',
                                    max_features = 10,
                                    min_samples_leaf = 5,
                                    min_samples_split= 5,
                                    n_estimators= 80)
best_model.fit(X_train, Y_train)
print(f'Best model accuracy for Grid Search Example: {best_model.score(X_test, Y_test)}')



#Halving Grid Search Example
#-----------------------------Hyperparameter Space--------------------------------
h_space = {'n_estimators': [30, 60, 80, 100],
              'criterion': ['gini', 'entropy'],
              'max_features': [10, 20, 25, 30],
              'min_samples_split': [5, 11],
              'min_samples_leaf': [5, 11],
              'bootstrap': [True, False]}
# Creating and fitting the model
Random_forest_clf = RandomForestClassifier()
# -----------------------------Create and fit the model-------------------------------
# The hyperparameter space is the same as the grid search
start = datetime.now()
# The datetime function has been used to calculate operation time 
models = HalvingGridSearchCV(Random_forest_clf,
                             param_grid = h_space, cv = 5)
models.fit(X_train, Y_train)
end = datetime.now()

#Getting 5-fold cross validated score
scores = models.cv_results_['mean_test_score']
# Getting best hyperparameters
best_hparams = models.best_params_

print(f'Duration for Halving Grid Search: {end-start}')
print(f' Best model training score for Halving Grid Search: {max(scores)}')
print(f'Best hyperparameters for Halving Grid Search: {best_hparams}') 

#-----------------------------Training the model--------------------------------
# Training the model with the best hyperparameters from the Halving Grid Search
best_model = RandomForestClassifier(bootstrap= True,
                                    criterion = 'gini',
                                    max_features = 10,
                                    min_samples_leaf = 5,
                                    min_samples_split= 5,
                                    n_estimators= 100)
best_model.fit(X_train, Y_train)    
print(f'Best model accuracy for Halving Grid Search: {best_model.score(X_test, Y_test)}')

# --------------------------------Random Search Example--------------------------------
# ----------------------------Hyperparameter Space--------------------------------
h_space = {'bootstrap': [True, False],
           'criterion': ['gini', 'entropy'],
           'max_features': sp_rand(20,30),
           'min_samples_split': sp_rand(2,11),
           'min_samples_leaf': sp_rand(2,11),
           'n_estimators': sp_rand(30, 100)}

# ----------------------------Create and fit the model--------------------------------
start = datetime.now()
models = RandomizedSearchCV(Random_forest_clf, param_distributions=h_space, cv = 5, random_state=42)
models.fit(X_train, Y_train)
end = datetime.now()

# Getting 5-fold cross-validation results
scores = models.cv_results_['mean_test_score']
# Getting best hyperparameters
best_hparams = models.best_params_

print(f'Duration for Random Search Example: {end-start}')
print(f'Best model training score for Random Search Example: {max(scores)}')
print(f'Best hyperparameters for Random Search Example: {best_hparams}')

# ----------------------------Training the model--------------------------------
# Training the model with the best hyperparameters from the Random Search
best_model = RandomForestClassifier(bootstrap= True,
                                    criterion = 'gini',
                                    max_features = 24,
                                    min_samples_leaf = 4,
                                    min_samples_split= 8,
                                    n_estimators= 50)

best_model.fit(X_train, Y_train)
print(f'Best model accuracy for Random Search Example: {best_model.score(X_test, Y_test)}')


# ----------------------------Halving Random Search Example--------------------------------
# ----------------------------Hyperparameter Space--------------------------------
h_space = {'bootstrap': [True, False],
           'criterion': ['gini', 'entropy'],
           'max_features': sp_rand(20,30),
           'min_samples_split': sp_rand(2,11),
           'min_samples_leaf': sp_rand(2,11),
           'n_estimators': sp_rand(30, 100)}
start = datetime.now()
models = HalvingRandomSearchCV(Random_forest_clf, param_distributions=h_space, cv = 5, random_state=42)
models.fit(X_train, Y_train)
end = datetime.now()
#Getting 5-fold cross validated score
scores = models.cv_results_['mean_test_score'] 
# Getting best hyperparameters
best_hparams = models.best_params_

print(f'Duration for Halving Random Search: {end-start}')
print(f'Best model training score for Halving Random Search: {max(scores)}')
print(f'Best hyperparameters for Halving Random Search: {best_hparams}')

# ----------------------------Training the model--------------------------------
# Training the model with the best hyperparameters from the Halving Random Search
best_model = RandomForestClassifier(bootstrap= True,
                                    criterion = 'entropy',
                                    max_features = 24,
                                    min_samples_leaf = 3,
                                    min_samples_split= 9,
                                    n_estimators= 59)
best_model.fit(X_train, Y_train)
print(f'Best model accuracy for Halving Random Search: {best_model.score(X_test, Y_test)}')

Model 1 accuracy: 0.9590643274853801
Model 2 accuracy: 0.9590643274853801
Model 3 accuracy: 0.9707602339181286
Duration for Grid Search Example: 0:02:37.439434
Best model training score for Grid Search Example: 0.954778481012658
Best hyperparameters for Grid Search Example: {'bootstrap': True, 'criterion': 'gini', 'max_features': 10, 'min_samples_leaf': 5, 'min_samples_split': 11, 'n_estimators': 30}
Best model accuracy for Grid Search Example: 0.9707602339181286
Duration for Halving Grid Search: 0:01:08.292767
 Best model training score for Halving Grid Search: 1.0
Best hyperparameters for Halving Grid Search: {'bootstrap': False, 'criterion': 'entropy', 'max_features': 10, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}
Best model accuracy for Halving Grid Search: 0.9707602339181286
Duration for Random Search Example: 0:00:07.923801
Best model training score for Random Search Example: 0.9497784810126582
Best hyperparameters for Random Search Example: {'bootstrap':