# Cross-validation
## sklearn.model_selection

In [1]:
from sklearn import datasets, model_selection
import numpy as np

In [2]:
# Load the iris dataset; the cells below read its .data and .target fields.
iris = datasets.load_iris()

In [3]:
# Hold out 30% of the iris objects as a test set.
# train_test_split shuffles the data before splitting, so random_state is fixed
# to make the split identical on every Restart & Run All.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

In [4]:
# Check that the test set really makes up 0.3 of all the data.
# "/" is true division in Python 3, so the result is already a float.
len(test_data) / len(iris.data)

0.3

In [5]:
# Report how many objects ended up in each part of the split.
n_train, n_test = len(train_data), len(test_data)
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(n_train, n_test))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
# Preview the first five objects of each subset instead of dumping whole arrays.
train_preview = train_data[:5]
test_preview = test_data[:5]

print('Обучающая выборка:\n', train_preview)
print('\n')  # visual gap between the two previews
print('Тестовая выборка:\n', test_preview)

Обучающая выборка:
 [[5.5 2.3 4.  1.3]
 [5.4 3.  4.5 1.5]
 [7.7 2.6 6.9 2.3]
 [6.4 3.1 5.5 1.8]
 [5.1 3.7 1.5 0.4]]


Тестовая выборка:
 [[6.1 2.9 4.7 1.4]
 [5.1 3.5 1.4 0.3]
 [4.8 3.4 1.9 0.2]
 [6.3 3.3 6.  2.5]
 [5.7 2.9 4.2 1.3]]


In [7]:
# Print the class labels of the training subset, then of the test subset.
labelled_subsets = (
    ('Метки классов на обучающей выборке:\n', train_labels),
    ('Метки классов на тестовой выборке:\n', test_labels),
)
for position, (caption, labels) in enumerate(labelled_subsets):
    if position > 0:
        print('\n')  # visual gap between the two listings
    print(caption, labels)

Метки классов на обучающей выборке:
 [1 1 2 2 0 1 0 1 0 2 2 0 0 1 0 1 0 1 2 2 0 1 0 0 2 1 0 2 2 2 0 1 1 2 2 1 1
 2 1 0 2 0 2 1 0 1 1 2 1 1 2 1 2 1 2 0 2 0 0 2 0 0 0 1 2 1 2 2 2 0 0 0 0 2
 2 0 1 1 1 0 1 0 2 1 1 1 1 2 1 0 2 1 2 2 0 2 1 2 2 0 0 2 2 1 0]


Метки классов на тестовой выборке:
 [1 0 0 2 1 0 2 1 2 0 0 0 0 1 2 0 0 1 1 0 2 1 1 2 0 2 2 2 2 0 0 1 1 0 1 1 2
 1 0 1 0 0 2 1 2]


## Стратегии проведения кросс-валидации

### KFold

In [8]:
# Toy "dataset": the ten indices 0..9 (also reused by the splitter cells below).
x = range(10)

# Five folds over shuffled indices; random_state pins the shuffle.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=1)

for train_indices, test_indices in kf.split(x):
    print(train_indices, test_indices)

[0 1 3 4 5 6 7 8] [2 9]
[0 1 2 3 5 7 8 9] [4 6]
[1 2 4 5 6 7 8 9] [0 3]
[0 2 3 4 5 6 8 9] [1 7]
[0 1 2 3 4 6 7 9] [5 8]


### StratifiedKFold

In [9]:
# Alternating binary labels for the ten toy objects: 0, 1, 0, 1, ...
target = np.array([0, 1] * 5)
print(target)

# Three shuffled folds, stratified by the labels in target; random_state pins the shuffle.
skf = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=2)

for train_indices, test_indices in skf.split(x, target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[0 1 2 5 6 7] [3 4 8 9]
[0 3 4 7 8 9] [1 2 5 6]
[1 2 3 4 5 6 8 9] [0 7]


### ShuffleSplit

In [10]:
# Two random train/test splits of x with 30% of the objects held out in each.
# random_state is fixed (as in the KFold examples) so that re-running the cell
# reproduces the same splits instead of drawing new ones every time.
ss = model_selection.ShuffleSplit(n_splits=2, test_size=0.3, random_state=0)

for train_indices, test_indices in ss.split(x):
    print(train_indices, test_indices)

[8 0 4 5 9 7 3] [6 2 1]
[0 4 5 6 3 2 9] [1 8 7]


### StratifiedShuffleSplit

In [11]:
# Binary labels in two contiguous groups: five objects of class 0, then five of class 1.
target = np.array([0] * 5 + [1] * 5)
print(target)

# Three random splits with 30% held out, stratified by the labels in target.
# random_state is fixed (as in the KFold examples) so that re-running the cell
# reproduces the same splits instead of drawing new ones every time.
sss = model_selection.StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)

for train_indices, test_indices in sss.split(x, target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[7 4 5 1 3 2 8] [0 9 6]
[6 5 0 9 3 8 2] [7 1 4]
[5 3 7 0 8 2 4] [1 9 6]


### Leave-One-Out

In [12]:
# Leave-One-Out: every split holds out exactly one object of x as the test set.
loo = model_selection.LeaveOneOut()
for train_indices, test_index in loo.split(x):
    print(train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]
