# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [1]:
import pandas as pd

bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [2]:
bioresponce.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
y = bioresponce.Activity.values

In [4]:
X = bioresponce.iloc[:, 1:]

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Строим модель и оцениваем качество

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [10]:
type(preds)

numpy.ndarray

In [11]:
10 // 9

1

In [12]:
print(sum(preds == y_test) / len(preds))

0


In [13]:
print(sum(preds == y_test) / float(len(preds)))

0.75605815832


In [14]:
from sklearn.metrics import accuracy_score

print(accuracy_score(preds, y_test))

0.75605815832


### Качество на кросс-валидации

In [15]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(model, X_train, y_train, cv=5))

[ 0.74404762  0.73956262  0.72310757  0.75099602  0.75896414]


In [16]:
print(cross_val_score(model, X_train, y_train, cv=5).mean())

0.743335594477


### Пробуем другие классификаторы

In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV


In [18]:
%%time

models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(accuracy_score(preds, y_test), model)

(0.71890145395799676, KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))
(0.71647819063004847, DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))
(0.74313408723747976, LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))
(0.78190630048465271, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=

## Опциональное задание:

Попробуйте разные классификаторы с разными параметрами и постарайтесь добиться максимального качества на тестовой выборке

In [24]:
import numpy as np
clf_knb = KNeighborsClassifier()
clf_knb.get_params().values()

[5, 1, 'auto', 'minkowski', None, 2, 'uniform', 30]

In [25]:
clf_knb.get_params().keys()

['n_neighbors',
 'n_jobs',
 'algorithm',
 'metric',
 'metric_params',
 'p',
 'weights',
 'leaf_size']

In [26]:
parameters_grid_knb = {'n_neighbors':np.arange(3,6), 'algorithm':['auto', 'ball_tree', 'kd_tree'], 'p':np.arange(1,3)}

In [27]:
clf_dst = DecisionTreeClassifier()
clf_dst.get_params().keys()

['presort',
 'splitter',
 'min_impurity_decrease',
 'max_leaf_nodes',
 'min_samples_leaf',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'criterion',
 'random_state',
 'min_impurity_split',
 'max_features',
 'max_depth',
 'class_weight']

In [28]:
param_grid_dst = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              }


In [29]:
clf_svc = LinearSVC()
clf_svc.get_params().keys()

['loss',
 'C',
 'verbose',
 'intercept_scaling',
 'fit_intercept',
 'max_iter',
 'penalty',
 'multi_class',
 'random_state',
 'dual',
 'tol',
 'class_weight']

In [31]:
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid_svc = {'C': Cs, 'gamma' : gammas}

In [35]:
clf_rfc = RandomForestClassifier()
clf_rfc.get_params().keys()

['warm_start',
 'oob_score',
 'n_jobs',
 'min_impurity_decrease',
 'verbose',
 'max_leaf_nodes',
 'bootstrap',
 'min_samples_leaf',
 'n_estimators',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'criterion',
 'random_state',
 'min_impurity_split',
 'max_features',
 'max_depth',
 'class_weight']

In [36]:
param_grid_rfc = {
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [37]:
#grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)

In [38]:
clf_gbc = GradientBoostingClassifier()
clf_gbc.get_params().keys()

['presort',
 'loss',
 'min_impurity_decrease',
 'verbose',
 'subsample',
 'max_leaf_nodes',
 'learning_rate',
 'warm_start',
 'min_samples_leaf',
 'n_estimators',
 'min_samples_split',
 'init',
 'min_weight_fraction_leaf',
 'criterion',
 'random_state',
 'min_impurity_split',
 'max_features',
 'max_depth']

In [43]:
  param_grid_gbc = {'n_estimators': [10000],
                  'min_samples_leaf': [7, 9, 13],
                  'max_depth': [4, 5, 6, 7],
                  'max_features': [100, 150, 250],
                  'learn_rate': [0.05, 0.02, 0.01],
                  }

In [48]:
cv = StratifiedShuffleSplit(y_train,n_iter = 10, test_size=0.2, random_state = 0)

In [51]:
grid_knb = GridSearchCV(clf_knb, parameters_grid_knb, scoring='accuracy', cv=cv)

In [53]:
grid_dst = GridSearchCV(clf_dst, param_grid_dst, scoring='accuracy', cv=cv)

In [54]:
grid_svc = GridSearchCV(clf_svc, param_grid_svc, scoring='accuracy', cv=cv)

In [58]:
grid_rfc = GridSearchCV(clf_rf, param_grid_rfc, scoring='accuracy', cv=cv)

In [59]:
grid_gbc = GridSearchCV(clf_gbc, param_grid_gbc, scoring='accuracy', cv=cv)

In [60]:
%%time 
grid_knb.fit(X_train, y_train)
grid_dst.fit(X_train, y_train)
grid_svc.fit(X_train, y_train)
grid_rvc.fit(X_train, y_train)
grid_gbc.fit(X_train, y_train)

CPU times: user 14min 48s, sys: 3.49 s, total: 14min 52s
Wall time: 14min 52s


GridSearchCV(cv=StratifiedShuffleSplit(labels=[0 1 ..., 0 1], n_iter=10, test_size=0.2, random_state=0),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([3, 4, 5]), 'algorithm': ['auto', 'ball_tree', 'kd_tree'], 'p': array([1, 2])},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)