# Hyperparameter Tuning via Grid Search

### Importing Dataset

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Fetch the Breast Cancer Wisconsin data from the UCI repository.
# The file is read with header=None, so columns carry integer labels.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

# Column 1 is used as the class label and is encoded to integers;
# every column from label 2 onward is used as a feature.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].to_numpy())
X = df.loc[:, 2:].to_numpy()

# Hold out 20% of the samples for testing. The split is stratified on y and
# seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
    test_size=0.2,
)


### Tuning hyperparameters via grid search
The grid search approach is quite simple: it’s a brute-force exhaustive search paradigm where we specify a list of values for different hyperparameters, and the computer evaluates the model performance for each combination to obtain the optimal combination of values from this set:

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Pipeline under tuning: standardize the features, then fit an SVM classifier.
pipeline_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate values (powers of ten) shared by the C and gamma searches.
param_range = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# The parameter grid is a list of dictionaries, one per sub-grid to explore.
# - Linear SVM: only the inverse regularization parameter C is tuned.
# - RBF-kernel SVM: both C and gamma are tuned (gamma is specific to kernel SVMs).
# The 'svc__' prefix addresses the SVC step inside the pipeline.
param_grid = [
    dict(svc__C=param_range, svc__kernel=['linear']),
    dict(svc__C=param_range, svc__gamma=param_range, svc__kernel=['rbf']),
]

# Grid search uses 10-fold cross-validation to compare every hyperparameter
# combination by accuracy, running on all available cores. With refit=True the
# best combination is refit on the whole training set once the search is done.
gs = GridSearchCV(
    pipeline_svc,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    refit=True,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

print(f'Best Score: {gs.best_score_}')
print(f'Best Params: {gs.best_params_}')


Best Score: 0.9846859903381642
Best Params: {'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


### Estimate performance of best selected model
To estimate the performance of the best selected model, we use the independent test dataset.

In [8]:
# The grid search was created with refit=True, so best_estimator_ has already
# been refit on the full training set using the best parameters found.
# Fitting it again here would only repeat that work, so we score it directly
# on the held-out test set.
classifier = gs.best_estimator_
print(f'Test accuracy: {classifier.score(X_test, y_test)}')

Test accuracy: 0.9736842105263158
