# Hyperparameter Optimization Example



SPORF example using car dataset

In [1]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import openml
from rerf.rerfClassifier import rerfClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
def hyperparameter_optimization_rand(X,y,*argv):
    
    clf_best_params = {}
    clf_best_scores = {}
    for clf, params in argv:

        # run randomized search     
        n_iter_search = 10
        random_search = RandomizedSearchCV(clf, param_distributions=params,
                                           n_iter=n_iter_search, cv=10, iid=False)
        random_search.fit(X, y)       
        clf_best_params[clf] = random_search.best_params_  
        clf_best_scores[clf] = random_search.best_score_
    return clf_best_params, clf_best_scores

def hyperparameter_optimization_grid(X, y, *argv):    
    clf_best_params = {}
    clf_best_scores = {}

    # Iterate over all (classifier, hyperparameters) pairs
    for clf, params in argv:

        # Run grid search
        grid_search = GridSearchCV(
            clf, param_grid=params, cv=10, iid=False
        )
        grid_search.fit(X, y)

        # Save results
        clf_best_params[clf] = grid_search.best_params_
        clf_best_scores[clf] = grid_search.best_score_

    return clf_best_params, clf_best_score        
        

In [3]:
from scipy.stats import randint as sp_randint
import math
from math import log

# get some data
task_id = 146821 #car
openml.config.apikey = 'c9ea8896542dd998ea42685f14e2bc14'
benchmark_suite = openml.study.get_suite('OpenML-CC18')
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()
n_features = np.shape(X)[1]
n_samples = np.shape(X)[0]


# build a classifier
rerf = rerfClassifier(n_estimators=100)

#specify max_depth and min_sample_splits ranges
max_depth_array = (np.unique(np.round((np.linspace(2,n_samples,
                    10))))).astype(int)
max_depth_range = np.append(max_depth_array, None)

min_sample_splits_range = (np.unique(np.round((np.arange(1,math.log(n_samples),
                            (math.log(n_samples)-2)/10))))).astype(int)

# specify parameters and distributions to sample from
rerf_param_dict = {"n_estimators": np.arange(50,550,50),
              "max_depth": max_depth_range,
              "min_samples_split": min_sample_splits_range,
              "feature_combinations": [1,2,3,4,5], 
              "max_features": ["auto", "sqrt","log2", None, n_features**2]}


In [None]:
rerf_best_params1, rerf_best_score1 = hyperparameter_optimization_rand(X, y, (rerf, rerf_param_dict))
print(rerf_best_params1)
print(rerf_best_score1)

rerf_best_params2, rerf_best_score2 = hyperparameter_optimization_rand(X, y, (rerf, rerf_param_dict))
print(rerf_best_params2)
print(rerf_best_score2)

rerf_best_params3, rerf_best_score3 = hyperparameter_optimization_rand(X, y, (rerf, rerf_param_dict))
print(rerf_best_params3)
print(rerf_best_score3)

plt.plot([1,2,3], [rerf_best_score1,rerf_best_score2, rerf_best_score3])
plt.ylabel('Score (%)')
plt.xlabel('Tuning iteration')
plt.title('Random Search model scores on letters Dataset')
plt.show()

{rerfClassifier(feature_combinations=1.5, image_height=None, image_width=None,
               max_depth=None, max_features='auto', min_samples_split=1,
               n_estimators=100, n_jobs=None, oob_score=False,
               patch_height_max=None, patch_height_min=1, patch_width_max=None,
               patch_width_min=1, projection_matrix='RerF', random_state=None): {'n_estimators': 450, 'min_samples_split': 4, 'max_features': None, 'max_depth': None, 'feature_combinations': 5}}
{rerfClassifier(feature_combinations=1.5, image_height=None, image_width=None,
               max_depth=None, max_features='auto', min_samples_split=1,
               n_estimators=100, n_jobs=None, oob_score=False,
               patch_height_max=None, patch_height_min=1, patch_width_max=None,
               patch_width_min=1, projection_matrix='RerF', random_state=None): 0.9075738470432679}


In [None]:
import matplotlib.pyplot as plt
print(rerf_best_score1.values())
plt.plot([1,2,3], [rerf_best_score1,rerf_best_score2, rerf_best_score3])
plt.ylabel('Score (%)')
plt.xlabel('Tuning iteration')
plt.title('Random Search model scores on letters Dataset')
plt.show()

In [None]:
rerf_scores = []
rerf_models = []
for i in range(0,3):
    rerf_best_params, rerf_best_score = hyperparameter_optimization_rand(X, y, (rerf, rerf_param_dict))
    print(rerf_best_params)
    rerf_models.append(rerf_best_params)
    rerf_scores.append(rerf_best_score)

plt.plot([1,2,3], rerf_scores)
plt.ylabel('Score (%)')
plt.xlabel('Tuning iteration')
plt.title('Random Search model scores on letters Dataset')
plt.show()
    

In [None]:
rerf_best_params_grid, rerf_best_score_grid = 
                        hyperparameter_optimization_grid(X, y, (rerf, rerf_param_dict))
print(rerf_best_params_grid)

In [4]:
# build a classifier
rf = RandomForestClassifier(n_estimators=100)

#specify max_depth and min_sample_splits ranges
max_depth_array = (np.unique(np.round((np.linspace(2,n_samples,
                    10))))).astype(int)
max_depth_range = np.append(max_depth_array, None)

min_sample_splits_range = (np.unique(np.round((np.arange(2,math.log(n_samples),
                            (math.log(n_samples)-2)/10))))).astype(int)

# specify parameters and distributions to sample from
rf_param_dict = {"n_estimators": np.arange(50,550,50),
              "max_depth": max_depth_range,
              "min_samples_split": min_sample_splits_range,
              "feature_combinations": [1,2,3,4,5], 
              "max_features": ["sqrt","log2", None, n_features**2]}

In [10]:
rf_best_params1, rf_best_score1 = hyperparameter_optimization_rand(X, y, (rf, rf_param_dict))
print(rerf_best_params1)

rf_best_params2, rf_best_score2 = hyperparameter_optimization_rand(X, y, (rf, rf_param_dict))
print(rerf_best_params2)

rf_best_params3, rf_best_score3 = hyperparameter_optimization_rand(X, y, (rf, rf_param_dict))
print(rerf_best_params3)

plt.plot([1,2,3], [rf_best_score1,rf_best_score2, rf_best_score3])
plt.ylabel('Score (%)')
plt.xlabel('Tuning iteration')
plt.title('Random Search RF model scores on letters Dataset')
plt.show()



ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1