# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [155]:
import pandas as pd

# Load the bioresponse dataset from the bundled CSV; the first row of the file is used as the header.
bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [156]:
# Preview the first five rows of the loaded frame.
bioresponce.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [157]:
# Target vector: the `Activity` column as a plain array.
y = bioresponce['Activity'].values

In [158]:
# Feature matrix: every column except the first one.
# NOTE(review): assumes column 0 of the CSV is the `Activity` target, so it is
# excluded from the features — confirm against the file layout.
X = bioresponce.iloc[:, 1:]

In [159]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Строим модель и оцениваем качество

In [160]:
from sklearn.linear_model import LogisticRegression

In [161]:
# Train a logistic regression with default settings, then label the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
preds = model.predict(X_test)

In [162]:
# Check which container type predict() returned.
type(preds)

numpy.ndarray

In [163]:
# `//` is floor division: the fractional part of the quotient is discarded.
10 // 9

1

In [164]:
# Accuracy by hand: number of test rows where the prediction equals the true label,
# divided by the number of predictions.
print(sum(preds == y_test) / len(preds))

0.75605815832


In [165]:
# Same computation with an explicit float denominator, so the result does not
# depend on the interpreter's integer-division rules.
print(sum(preds == y_test) / float(len(preds)))

0.75605815832


In [166]:
from sklearn.metrics import accuracy_score

# Library equivalent of the manual computation. accuracy_score expects
# (y_true, y_pred); accuracy is symmetric, so the value is unaffected, but the
# documented order keeps this call safe to copy for asymmetric metrics.
print(accuracy_score(y_test, preds))

0.75605815832


### Качество на кросс-валидации

In [167]:
from sklearn.model_selection import cross_val_score

# Score the model on five cross-validation folds of the training set.
fold_scores = cross_val_score(model, X_train, y_train, cv=5)
print(fold_scores)

[ 0.74404762  0.73956262  0.72310757  0.75099602  0.75896414]


In [168]:
# Average of the five per-fold scores.
fold_scores = cross_val_score(model, X_train, y_train, cv=5)
print(fold_scores.mean())

0.743335594477


### Пробуем другие классификаторы

In [169]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [170]:
%%time

# Baseline comparison: fit several classifiers on the training split and report
# each one's accuracy on the held-out split.
models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100),
    GradientBoostingClassifier(n_estimators=100),
]

# NOTE: the loop rebinds the notebook-level names `model` and `preds`; after it
# finishes they refer to the last classifier in the list and its predictions.
for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # accuracy_score expects (y_true, y_pred).
    print(accuracy_score(y_test, preds), model)

0.718901453958 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.698707592892 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
0.739903069467 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.789176090468 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_f

## Опциональное задание:

Попробуйте разные классификаторы с разными параметрами и постарайтесь добиться максимального качества на тестовой выборке.

In [171]:
import numpy as np
from sklearn import cross_validation, grid_search

In [172]:
# Ten random splits with 20% held out each time, seeded for repeatability.
# Built from sklearn.model_selection (already used elsewhere in this notebook)
# rather than the deprecated sklearn.cross_validation module; this splitter is
# given the labels when it is used, not at construction time.
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

In [173]:
def grid_search_CV(classifier, parameters_grid, X_train, y_train):
    cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=10, test_size=0.2, random_state=0)
    grid_cv = grid_search.GridSearchCV(classifier, parameters_grid, scoring='accuracy', cv=cv)
    
    print(str(classifier), '\n')
    
    print("GridSearchCV")
    
    %time grid_cv.fit(X_train, y_train)
    
    print("Best score = {}".format(grid_cv.best_score_))
    print("Best params = {}".format(grid_cv.best_params_))
    
    randomized_grid_cv = grid_search.RandomizedSearchCV(
        classifier,
        parameters_grid,
        scoring='accuracy',
        cv=cv, n_iter = 10,
        random_state = 0
    )
    print("RandomizedSearchCV")
          
    %time randomized_grid_cv.fit(X_train, y_train)
    
    print("Best score = {}".format(randomized_grid_cv.best_score_))
    print("Best params = {}".format(randomized_grid_cv.best_params_))
    

In [174]:
# Search space for KNeighborsClassifier.
# NOTE(review): `algorithm` presumably changes only how neighbours are looked
# up, not which ones are found; if so it triples the search time without
# affecting scores — confirm and consider dropping it from the grid.
parameters_grid_kneigh = dict(
    algorithm=['auto', 'ball_tree', 'kd_tree'],
    p=[1, 2],
    n_neighbors=np.arange(4, 8),
    weights=['uniform', 'distance'],
)

# Search space for DecisionTreeClassifier: shallow trees (depth 1-5) with
# small minimum split/leaf sizes and a restricted number of features per split.
parameters_grid_decision_tree = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 6),
    min_samples_split=np.arange(3, 5),
    min_samples_leaf=np.arange(2, 4),
    max_features=['log2', 'sqrt'],
)

# Search space for LinearSVC.
parameters_grid_linearSVC = {
    'multi_class': ['ovr', 'crammer_singer'],
    # Real booleans: the strings 'True' and 'False' are both truthy, so a
    # string grid never actually tries fitting without an intercept.
    'fit_intercept': [True, False],
    'max_iter': np.arange(750, 1501, 250),
}

# Search space for RandomForestClassifier.
parameters_grid_random_forest = {
    'n_estimators': np.arange(100, 201, 50),
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    # Real booleans: the strings 'True' and 'False' are both truthy, so a
    # string grid never actually turns bootstrapping off.
    'bootstrap': [True, False],
    'min_samples_split': np.arange(3, 5),
    'min_samples_leaf': np.arange(2, 4),
}

# Search space for GradientBoostingClassifier.
# NOTE(review): the 'deviance' and 'mse' spellings appear to be renamed in
# newer scikit-learn releases — verify against the installed version before
# re-running.
parameters_grid_gradientboosting = dict(
    n_estimators=np.arange(100, 201, 50),
    loss=['deviance', 'exponential'],
    max_depth=np.arange(3, 6),
    max_features=['sqrt', 'log2'],
    criterion=['friedman_mse', 'mse'],
)


In [175]:
# The five search spaces collected in one list.
# NOTE(review): no later cell visible in this notebook reads this list — the
# classifier/grid pairs are spelled out separately; confirm whether it is still needed.
parameters_grids = [
    parameters_grid_kneigh,
    parameters_grid_decision_tree,
    parameters_grid_linearSVC,
    parameters_grid_random_forest,
    parameters_grid_gradientboosting
]

In [176]:
# (estimator, search space) pairs: each default-constructed classifier is
# matched with the parameter grid defined for it.
classifiers_with_grids = (
    (KNeighborsClassifier(), parameters_grid_kneigh),
    (DecisionTreeClassifier(),parameters_grid_decision_tree),
    (LinearSVC(), parameters_grid_linearSVC),
    (RandomForestClassifier(), parameters_grid_random_forest),
    (GradientBoostingClassifier(), parameters_grid_gradientboosting)
)

In [177]:
# Run both searches for every (estimator, search space) pair.
for estimator, search_space in classifiers_with_grids:
    grid_search_CV(estimator, search_space, X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

GridSearchCV
CPU times: user 23min 58s, sys: 4.7 s, total: 24min 3s
Wall time: 24min 3s
Best score = 0.757455268389662
Best params = {'algorithm': 'auto', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
RandomizedSearchCV
CPU times: user 5min 2s, sys: 1.21 s, total: 5min 3s
Wall time: 5min 3s
Best score = 0.757455268389662
Best params = {'weights': 'distance', 'p': 1, 'n_neighbors': 7, 'algorithm': 'ball_tree'}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best') 

GridSearchCV
CPU times: user 27.4 s, sys: 0 ns, total: 27.4 s
Wall time: 27.4 s
Best score = 0.685487077

**GridSearch** лучше всего подходит для небольших выборок и небольших сеток параметров, так как перебирает все комбинации из сетки, а **RandomizedSearch** подходит для выборок и сеток любого размера, поскольку случайным образом выбирает n наборов параметров из заданной сетки.

Также **RandomizedSearch** работает в несколько раз быстрее (в примере выше — около 5 минут против 24), поскольку проверяет меньше наборов параметров; в то же время качество отличается всего лишь во 2–3-м знаке после запятой.

Таким образом, при ограниченных ресурсах и нежёстких требованиях к точности целесообразно использовать **RandomizedSearch**.