### Training models on its default hyperparameters values using Cross-Validation

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv')

In [3]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [6]:
from sklearn.model_selection import cross_val_score,KFold
from sklearn.neighbors import KNeighborsRegressor

In [7]:
# Using KNN on our data with default parameter

knn = KNeighborsRegressor()

In [9]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(knn, X, y, cv=kfold, scoring='r2')

In [12]:
# Their are 5 Model as we have splitted data into 5 parts

scores

array([0.54016122, 0.32531094, 0.54378871, 0.39244609, 0.57928121])

In [11]:
scores.mean()

0.4761976351913221

##### The above mean value we get is on the default hyperparameter of KNN. But as a Data Scientist, i will have to find those hypermeters values on which model will give best accuracy. So we do HYPERPARAMETER TUNING.

#### GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
knn = KNeighborsRegressor()

In [16]:
# Parameter Space - 4D Grid Formation - Their is 9*2*3*2=108 - 108*5(CV)=540 - Unique Combination.

param_grid = {
    'n_neighbors':[1,3,5,7,10,12,15,17,20],
    'weights':['uniform','distance'],
    'algorithm':['ball_tree', 'kd_tree', 'brute'],
    'p':[1,2]
}

In [22]:
# Using GridSearchCV
# Refit = True : Refit an estimator(ML Model) using the best found parameters on the whole dataset.
# Verbose - Is is used for displaying message.

gcv = GridSearchCV(knn, param_grid, scoring='r2', refit=True, cv=kfold, verbose=2)

In [23]:
gcv.fit(X,y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=2, weights=uniform; total

In [24]:
gcv.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [25]:
gcv.best_score_

0.6117139367845081

In [27]:
gcv.cv_results_

{'mean_fit_time': array([0.00472636, 0.00344663, 0.0041707 , 0.00380864, 0.00404119,
        0.00341983, 0.00370827, 0.00368848, 0.0034646 , 0.00341506,
        0.00365949, 0.0032516 , 0.00336332, 0.00373406, 0.00354815,
        0.00363951, 0.00279436, 0.00330415, 0.0031127 , 0.00389156,
        0.00360947, 0.00300817, 0.00274749, 0.00266981, 0.00344238,
        0.0033989 , 0.0053689 , 0.00234342, 0.00332141, 0.00315905,
        0.0034256 , 0.00365944, 0.00358706, 0.00322695, 0.00319743,
        0.00340295, 0.00359559, 0.0036375 , 0.00368986, 0.00283737,
        0.00349894, 0.00337944, 0.0033195 , 0.0034781 , 0.00338869,
        0.00325637, 0.00339146, 0.00383453, 0.00328407, 0.00448895,
        0.00257988, 0.00307903, 0.00321279, 0.00332718, 0.00336699,
        0.00465336, 0.003302  , 0.00350847, 0.0032887 , 0.00402956,
        0.00333347, 0.00397015, 0.00337248, 0.00357084, 0.00327516,
        0.00328474, 0.00350399, 0.00365562, 0.00336084, 0.00453324,
        0.00363169, 0.00331955,

In [26]:
# To view result in a table

pd.DataFrame(gcv.cv_results_)[['param_algorithm',	'param_n_neighbors',	'param_p', 'param_weights', 'mean_test_score']].sort_values('mean_test_score',ascending=False)

Unnamed: 0,param_algorithm,param_n_neighbors,param_p,param_weights,mean_test_score
81,brute,5,1,distance,0.611714
45,kd_tree,5,1,distance,0.611714
9,ball_tree,5,1,distance,0.611714
49,kd_tree,7,1,distance,0.605716
85,brute,7,1,distance,0.605716
...,...,...,...,...,...
38,kd_tree,1,2,uniform,0.331522
2,ball_tree,1,2,uniform,0.331522
75,brute,1,2,distance,0.331522
39,kd_tree,1,2,distance,0.331522


#### RandomizedSearchCV

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [29]:
rcv = RandomizedSearchCV(knn, param_grid, scoring='r2', refit=True, cv=kfold, verbose=2)

In [30]:
rcv.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END algorithm=brute, n_neighbors=17, p=2, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=17, p=2, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=17, p=2, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=17, p=2, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=17, p=2, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=17, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=17, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=17, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=17, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=17, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=15, p=1, weights=distance; total time=   0.0s
[CV] END algo

In [31]:
rcv.best_score_

0.5637598768507459

In [32]:
rcv.best_params_

{'weights': 'distance', 'p': 1, 'n_neighbors': 15, 'algorithm': 'kd_tree'}