# Sklearn

## sklearn.cross_validation

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import cross_validation, datasets

import numpy as np



### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [4]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [5]:
print 'Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
print 'Обучающая выборка:\n', train_data[:5]
print '\n'
print 'Тестовая выборка:\n', test_data[:5]

Обучающая выборка:
[[7.1 3.  5.9 2.1]
 [5.1 3.5 1.4 0.2]
 [7.7 2.6 6.9 2.3]
 [4.7 3.2 1.6 0.2]
 [4.7 3.2 1.3 0.2]]


Тестовая выборка:
[[5.8 2.7 5.1 1.9]
 [6.8 3.  5.5 2.1]
 [4.8 3.1 1.6 0.2]
 [5.8 2.7 5.1 1.9]
 [6.  2.2 4.  1. ]]


In [7]:
print 'Метки классов на обучающей выборке:\n', train_labels
print '\n'
print 'Метки классов на тестовой выборке:\n', test_labels

Метки классов на обучающей выборке:
[2 0 2 0 0 0 1 0 1 2 0 0 2 1 2 1 0 0 0 2 2 2 0 2 1 0 1 1 2 1 2 2 0 0 0 0 1
 1 0 0 1 2 2 0 1 1 1 0 1 1 1 1 0 1 2 1 1 2 1 2 2 0 0 1 1 2 1 2 2 0 2 2 0 0
 2 1 2 1 2 2 2 1 1 0 0 0 0 1 0 0 0 2 0 1 0 2 2 0 1 1 0 1 1 1 0]


Метки классов на тестовой выборке:
[2 2 0 2 1 1 1 2 2 1 2 0 2 0 1 1 0 1 2 1 0 2 1 1 2 0 2 2 0 2 2 2 0 2 2 1 0
 0 2 0 1 2 0 1 1]


### Стратегии проведения кросс-валидации

#### KFold

In [8]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 5):
    print train_indices, test_indices

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [9]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True):
    print train_indices, test_indices

[0 1 3 5 6] [2 4 7 8 9]
[2 4 7 8 9] [0 1 3 5 6]


In [10]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True, random_state = 1):
    print train_indices, test_indices

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [11]:
target = np.array([0] * 5 + [1] * 5)
print target
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2, shuffle = True, random_state = 0):
    print train_indices, test_indices

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [12]:
target = np.array([0, 1] * 5)
print target
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2,shuffle = True):
    print train_indices, test_indices

[0 1 0 1 0 1 0 1 0 1]
[0 4 5 9] [1 2 3 6 7 8]
[1 2 3 6 7 8] [0 4 5 9]


#### ShuffleSplit

In [16]:
for train_indices, test_indices in cross_validation.ShuffleSplit(10, n_iter = 10, test_size = 0.2):
    print train_indices, test_indices
?cross_validation.ShuffleSplit

[1 5 3 7 4 6 9 0] [8 2]
[5 1 8 2 4 7 9 0] [3 6]
[6 8 1 2 4 3 9 5] [7 0]
[9 4 5 8 6 1 0 3] [7 2]
[6 3 4 7 2 5 9 1] [0 8]
[6 4 5 0 9 1 7 8] [3 2]
[5 8 3 1 9 7 0 2] [4 6]
[0 6 9 8 5 7 1 2] [4 3]
[0 5 9 3 1 8 4 7] [6 2]
[6 9 4 0 7 3 1 2] [5 8]


[0;31mInit signature:[0m [0mcross_validation[0m[0;34m.[0m[0mShuffleSplit[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mn[0m[0;34m,[0m [0mn_iter[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m [0mtest_size[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m [0mtrain_size[0m[0;34m=[0m[0mNone[0m[0;34m,[0m [0mrandom_state[0m[0;34m=[0m[0mNone[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Random permutation cross-validation iterator.

.. deprecated:: 0.18
    This module will be removed in 0.20.
    Use :class:`sklearn.model_selection.ShuffleSplit` instead.

Yields indices to split data into training and test sets.

Note: contrary to other cross-validation strategies, random splits
do not guarantee that all folds will be different, although this is
still very likely for sizeable datasets.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
n : int
    Total number of elements in the dataset.

n_iter : int (default 10)
    Number of re-shuffling & 

#### StratifiedShuffleSplit

In [14]:
target = np.array([0] * 5 + [1] * 5)
print target
for train_indices, test_indices in cross_validation.StratifiedShuffleSplit(target, n_iter = 4, test_size = 0.2):
    print train_indices, test_indices

[0 0 0 0 0 1 1 1 1 1]
[6 8 4 2 1 3 9 5] [7 0]
[1 6 4 7 0 8 3 9] [2 5]
[6 5 9 7 4 2 0 3] [1 8]
[3 5 6 7 2 4 8 1] [0 9]


#### Leave-One-Out

In [15]:
for train_indices, test_index in cross_validation.LeaveOneOut(10):
    print train_indices, test_index

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators