<a href="https://colab.research.google.com/github/Namesakenberg/machine_learning/blob/main/hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GridSearchCV , RandomizedSearchCV and Bayesian Optimization

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the Boston Housing CSV directly from its public GitHub location.
BOSTON_HOUSING_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
df = pd.read_csv(BOSTON_HOUSING_URL)


In [None]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
# Dataset dimensions as (rows, columns).
df.shape

(506, 14)

In [None]:
# Split the frame into the 'medv' target series and the remaining feature columns.
y = df['medv']
X = df.drop('medv', axis=1)

In [None]:
# Baseline: a KNeighborsRegressor with default hyperparameters, scored with 5-fold CV.
# NOTE(review): X is passed to a distance-based model without any feature scaling;
# consider whether a scaler should precede the regressor - TODO confirm intent.
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor

knr = KNeighborsRegressor()  # estimator

# Shuffled, seeded splitter so every run scores on the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(estimator=knr, X=X, y=y, scoring='r2', cv=kfold)
print("Scores : ", scores)
print("mean score : ", scores.mean())

Scores :  [0.64736409 0.61867406 0.37360847 0.5948161  0.45747325]
mean score :  0.5383871944766712


In [None]:
# Exhaustive search with GridSearchCV: every combination in the grid is cross-validated.

from sklearn.metrics import r2_score

knr = KNeighborsRegressor()  # estimator

# Parameter grid: 10 * 2 * 3 * 2 = 120 candidate combinations.
param_grid = dict(
    n_neighbors=[1, 3, 5, 7, 9, 10, 12, 15, 17, 20],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],  # Minkowski distance power: 1 -> Manhattan, 2 -> Euclidean
)

gcv = GridSearchCV(estimator=knr, param_grid=param_grid, scoring='r2', cv=kfold)
gcv.fit(X, y)


In [None]:
gcv.best_params_  # hyperparameter combination with the best mean cross-validated R2

{'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [None]:
gcv.best_score_  # mean cross-validated R2 achieved by the best combination

np.float64(0.6434974189445056)

In [None]:
# Every grid-search candidate with its mean test R2, sorted best-first.
(
    pd.DataFrame(gcv.cv_results_)
    .loc[:, ['param_algorithm', 'param_n_neighbors', 'param_p', 'param_weights', 'mean_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,param_algorithm,param_n_neighbors,param_p,param_weights,mean_test_score
9,ball_tree,5,1,distance,0.643497
49,kd_tree,5,1,distance,0.643497
89,brute,5,1,distance,0.643497
53,kd_tree,7,1,distance,0.637631
93,brute,7,1,distance,0.637631
...,...,...,...,...,...
34,ball_tree,17,2,uniform,0.395704
74,kd_tree,17,2,uniform,0.395704
38,ball_tree,20,2,uniform,0.374657
78,kd_tree,20,2,uniform,0.374657


After getting the best parameters, fit the model again using those parameters.

In [None]:
# Re-evaluate the winning configuration on the same folds.
# The hyperparameters are read from the finished grid search instead of being
# retyped by hand, so this cell stays in sync if the grid or the data changes.
knr = KNeighborsRegressor(**gcv.best_params_)  # estimator with the best parameters
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # same seeded folds as the grid search
scores = cross_val_score(knr, X, y, scoring='r2', cv=kfold)
print("Scores : ", scores)
print("mean score : ", scores.mean())  # expected to equal gcv.best_score_

Scores :  [0.71365278 0.70799762 0.47356882 0.74247025 0.57979761]
mean score :  0.6434974189445056


## Problem with GridSearchCV:
Training multiple models is computationally expensive. If the dataset is really big, e.g. 1 lakh (100,000) rows, grid search is often not even considered, because there is no time to train thousands of models just to find the best parameters.



## RandomizedSearchCV:
Given a number x (`n_iter`), the search tries out x random combinations of parameters on the data. Hence, instead of training all the possible combinations, only x combinations are tried.


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Sample n_iter random combinations from param_grid instead of trying all of them.
# random_state pins which combinations are drawn, so a Restart & Run All
# reproduces the same candidates, best_params_ and best_score_.
rcv = RandomizedSearchCV(
    knr,
    param_grid,
    n_iter=30,          # number of combinations to be tried
    scoring='r2',
    refit=True,         # refit the best combination on the full data afterwards
    cv=kfold,
    verbose=2,
    random_state=42,
)
rcv.fit(X, y)

# cross-validation fits = n_iter * 5 (plus one final refit because refit=True)
# 5 = number of splits in cross-validation


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END algorithm=kd_tree, n_neighbors=1, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=1, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=1, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=1, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=1, p=2, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=15, p=2, weights=uniform; total time=   0

In [None]:
rcv.best_score_     # in this recorded run, 30 sampled candidates matched the best R2 of the full 120-candidate grid search

np.float64(0.6434974189445056)

In [None]:
# Best hyperparameter combination among the randomly sampled candidates.
rcv.best_params_

{'weights': 'distance', 'p': 1, 'n_neighbors': 5, 'algorithm': 'kd_tree'}

## Best way to find them is Bayesian optimization

Use scikit-optimize, Optuna, or Hyperopt to do Bayesian optimization.

In [None]:
pip install scikit-optimize


Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [None]:

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer

# Estimator to tune; the search supplies the hyperparameter values.
knr = KNeighborsRegressor()

# Shuffled, seeded 5-fold splitter.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Search space (plays the role of param_grid): a range or a set of choices
# per hyperparameter instead of an explicit list of values.
search_space = dict(
    n_neighbors=Integer(3, 20),
    weights=Categorical(['uniform', 'distance']),
    algorithm=Categorical(['ball_tree', 'kd_tree', 'brute']),
    p=Integer(1, 2),  # 1 = Manhattan, 2 = Euclidean
)

# Configure the Bayesian search.
opt = BayesSearchCV(
    knr,
    search_space,
    n_iter=30,        # number of parameter settings to evaluate
    scoring='r2',
    cv=kfold,
    random_state=42,  # seeded so the search is repeatable
    verbose=2,
    n_jobs=-1,
)

# Run the search.
opt.fit(X, y)



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Report the winning configuration and its mean cross-validated R2.
for label, value in (
    ("Best Parameters:", opt.best_params_),
    ("Best R2 Score:", opt.best_score_),
):
    print(label, value)


Best Parameters: OrderedDict([('algorithm', 'brute'), ('n_neighbors', 5), ('p', 1), ('weights', 'distance')])
Best R2 Score: 0.6434974189445056
