# Języki Programowania Python i R


## dr inż. Patryk Jasik
### Division of Theoretical Physics and Quantum Information
### Institute of Physics and Computer Science
### Faculty of Applied Physics and Mathematics
### Gdansk University of Technology

# scikit-learn docs
## https://scikit-learn.org/stable/

# hyperparameters optimization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.model_selection

#%config Completer.use_jedi = False

In [2]:
wine = pd.read_csv("data/winequality-all.csv", comment="#")
wine.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,3,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,3,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,3,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,4,red
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,3,red


In [3]:
wine["quality"] = pd.cut(wine["response"],
                         [0, 5, 10],
                         right=False,
                         labels=["bad", "good"])

In [4]:
X = wine.iloc[:, 0:11]
X.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4


In [5]:
y = wine["quality"]

In [6]:
yk = y.cat.codes.values

In [7]:
m = X.mean()
s = X.std()
X_std = (X - m)/s

In [8]:
idx_train, idx_test =\
sklearn.model_selection.train_test_split(np.arange(X_std.shape[0]), 
                                        test_size=0.2,
                                        random_state=12345)

In [9]:
X_train, X_test = X_std.iloc[idx_train, :], X_std.iloc[idx_test, :]
yk_train, yk_test = yk[idx_train], yk[idx_test]

X_train.shape, X_test.shape, yk_train.shape, yk_test.shape

((4256, 11), (1064, 11), (4256,), (1064,))

In [10]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [11]:
# we will create classification model based on Support Vector Machine method
# https://en.wikipedia.org/wiki/Support-vector_machine
class_model = svm.SVC(max_iter=10000)

In [12]:
help(svm.SVC)

Help on class SVC in module sklearn.svm._classes:

class SVC(sklearn.svm._base.BaseSVC)
 |  SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
 |  
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time scales at least
 |  quadratically with the number of samples and may be impractical
 |  beyond tens of thousands of samples. For large datasets
 |  consider using :class:`~sklearn.svm.LinearSVC` or
 |  :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a
 |  :class:`~sklearn.kernel_approximation.Nystroem` transformer or
 |  other :ref:`kernel_approximation`.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `g

In [13]:
#hyperparameters of the model
class_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': 10000,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [15]:
# let's create the grid with the hyperparameters
search_grid = [
   {'C': [0.1, 1, 2, 5, 10, 20], 'kernel': ['linear', 'rbf', 'poly']}
   #,{'C': [10, 100], 'kernel': ['linear', 'poly']}
 ]

In [16]:
# names of available metrics
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weig

In [18]:
# we will evaluate models on accuracy and F1 metrics
scorer = ['accuracy', 'f1']

In [19]:
# creation of the GridSearch object
search_func = GridSearchCV(estimator=class_model,
                           param_grid=search_grid,
                           scoring=scorer,
                           refit='f1',
                           cv=5)

In [20]:
# training the models on the grid
search_func.fit(X_train, yk_train)



In [21]:
# Results
print(search_func.best_estimator_)
print(search_func.best_params_)
print(search_func.best_score_)

SVC(C=20, max_iter=10000)
{'C': 20, 'kernel': 'rbf'}
0.45916915928940105


In [22]:
# creation the DataFrame with results
results = pd.DataFrame(search_func.cv_results_)

In [23]:
results
#.iloc[:2,:]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,0.182419,0.026415,0.034746,0.001253,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.807512,0.807286,0.807286,...,9e-05,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16
1,0.291908,0.024366,0.077121,0.009009,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.808685,0.80611,0.808461,...,0.001219,13,0.012121,0.011976,0.012121,0.024096,0.0,0.012063,0.00762,15
2,0.282681,0.032815,0.04114,0.004266,0.1,poly,"{'C': 0.1, 'kernel': 'poly'}",0.81338,0.804935,0.819036,...,0.005479,12,0.121547,0.087912,0.144444,0.123596,0.145251,0.12455,0.020864,14
3,0.26024,0.043708,0.041287,0.008295,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.807512,0.807286,0.807286,...,9e-05,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16
4,0.283054,0.01055,0.069214,0.002555,1.0,rbf,"{'C': 1, 'kernel': 'rbf'}",0.827465,0.822562,0.839013,...,0.009638,1,0.363636,0.340611,0.38009,0.428571,0.462185,0.395019,0.044283,8
5,0.337123,0.058482,0.044325,0.006078,1.0,poly,"{'C': 1, 'kernel': 'poly'}",0.823944,0.815511,0.822562,...,0.006745,10,0.311927,0.262911,0.270531,0.319249,0.313725,0.295669,0.02388,12
6,0.311171,0.036729,0.042308,0.006527,2.0,linear,"{'C': 2, 'kernel': 'linear'}",0.807512,0.807286,0.807286,...,9e-05,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16
7,0.297286,0.008372,0.070012,0.002359,2.0,rbf,"{'C': 2, 'kernel': 'rbf'}",0.82277,0.821387,0.836663,...,0.008983,4,0.357447,0.361345,0.403433,0.436214,0.452675,0.402223,0.038414,6
8,0.330212,0.011283,0.040647,0.002875,2.0,poly,"{'C': 2, 'kernel': 'poly'}",0.828638,0.817861,0.823737,...,0.006571,9,0.359649,0.298643,0.305556,0.353982,0.361111,0.335788,0.027696,11
9,0.367854,0.019283,0.040992,0.005863,5.0,linear,"{'C': 5, 'kernel': 'linear'}",0.816901,0.816686,0.828437,...,0.004327,11,0.235294,0.297297,0.41129,0.222222,0.163043,0.265829,0.084318,13
