Домашнее задание:
Используя функцию sklearn.model_selection.GridSearchCV определите наилучшую комбинацию параметров для разных методов. Попробуйте также sklearn.model_selection.RandomizedSearchCV. Сделайте вывод об этих двух функциях, основываясь на полученном опыте (когда каким удобнее пользоваться и почему).

In [60]:
import pandas as pd
import numpy as np

bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [61]:
y = bioresponce.Activity.values

In [62]:
X = bioresponce.iloc[:, 1:]

In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [64]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Рассмотрим KNeighborsClassifier

In [65]:
model = KNeighborsClassifier()

In [66]:
parameters_neighbors = {'n_neighbors': np.arange(1, 5, 2), 
                                              'metric': ['euclidean', 'cityblock'], 
                                              'leaf_size': [10, 30], 
                                              'algorithm': ['auto', 'ball_tree']}

GridSearchCV

In [67]:
grid = GridSearchCV(model,parameters_neighbors)

In [68]:
%%time
grid.fit(X_train, y_train)

Wall time: 16min 4s


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([1, 3]), 'metric': ['euclidean', 'cityblock'], 'leaf_size': [10, 30], 'algorithm': ['auto', 'ball_tree']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Лучшая комбинация параметров:

In [69]:
print(grid.best_estimator_)
print(grid.best_score_)


KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='cityblock',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
0.730202944688


RandomizedSearchCV

In [70]:
random = RandomizedSearchCV(model,parameters_neighbors,n_iter=10)

In [71]:
%%time
random.fit(X, y)

Wall time: 22min 55s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_neighbors': array([1, 3]), 'metric': ['euclidean', 'cityblock'], 'leaf_size': [10, 30], 'algorithm': ['auto', 'ball_tree']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [72]:
print(random.best_estimator_)
print(random.best_score_)

KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
0.744868035191


Рассмотрим LinearSVC

In [50]:
model_linear = LinearSVC()

In [51]:
parameters_linear = {
                                   'C': [0.001, 0.01, 0.1, 1, 10], 
                                   'intercept_scaling': [0.001, 0.01, 0.1, 1], 
                                   'intercept_scaling': [0.1, 1], 
                                   'max_iter': [100, 1000]
}

In [52]:
grid_linear = GridSearchCV(model_linear,parameters_linear)

In [28]:
grid_linear.get_params()

{'cv': None,
 'error_score': 'raise',
 'estimator': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
      verbose=0),
 'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': True,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__loss': 'squared_hinge',
 'estimator__max_iter': 1000,
 'estimator__multi_class': 'ovr',
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'fit_params': {},
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'loss': ['squared_hinge', 'hinge'],
  'max_iter': [800, 1000, 1200],
  'multi_class': ['ovr', 'crammer_singer'],
  'penalty': ['l1', 'l2']},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 0}

In [53]:
%%time
grid_linear.fit(X_train, y_train)

Wall time: 35.3 s


GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10], 'max_iter': [100, 1000], 'intercept_scaling': [0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [54]:
print(grid_linear.best_estimator_)
print(grid_linear.best_score_)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=0.1, loss='squared_hinge', max_iter=100,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.74850775965


In [55]:
random = RandomizedSearchCV(model_linear,parameters_linear,n_iter=10)

In [56]:
%%time
random.fit(X, y)

Wall time: 28.7 s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': [0.001, 0.01, 0.1, 1, 10], 'max_iter': [100, 1000], 'intercept_scaling': [0.1, 1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [57]:
print(random.best_estimator_)
print(random.best_score_)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.763796320981


RandomizedSearchCV работает быстрее, чем GridSearchCV, но GridSearchCV работает лучше, поэтому более подходит для глубокого анализа.