# Sklearn

## sklearn.model_selection

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets
import pandas as pd
import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [3]:
iris = datasets.load_iris()

In [6]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [7]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [8]:
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [9]:
print('Обучающая выборка:\n', train_data[:5])
print('\n')
print('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[5.8 2.7 5.1 1.9]
 [5.3 3.7 1.5 0.2]
 [5.8 2.6 4.  1.2]
 [5.1 3.7 1.5 0.4]
 [5.5 2.4 3.8 1.1]]


Тестовая выборка:
 [[5.2 3.4 1.4 0.2]
 [7.1 3.  5.9 2.1]
 [7.6 3.  6.6 2.1]
 [7.2 3.6 6.1 2.5]
 [6.3 2.7 4.9 1.8]]


In [7]:
print('Метки классов на обучающей выборке:\n', train_labels)
print('\n')
print('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [1 0 0 2 1 2 1 2 1 0 2 2 2 0 0 1 0 1 1 0 2 1 0 2 2 0 2 1 1 0 0 0 0 2 0 0 1
 2 2 1 1 0 2 2 2 1 1 2 0 0 0 1 2 0 2 1 1 0 1 0 2 1 2 1 2 1 2 2 0 2 0 1 0 1
 1 1 0 2 2 1 1 0 1 2 0 2 1 1 0 0 2 2 1 1 0 1 1 0 0 1 2 2 2 2 1]


Метки классов на тестовой выборке:
 [1 2 1 0 2 0 0 1 0 1 2 0 2 1 0 2 0 2 2 0 2 2 2 0 0 1 2 1 2 0 0 1 0 1 2 1 0
 0 1 0 0 2 2 1 1]


### Стратегии проведения кросс-валидации

In [10]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(0,10)
pd_iris = pd.DataFrame(iris.data, columns= iris.feature_names)

#### KFold

In [14]:
kf = model_selection.KFold(n_splits = 5)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [19]:
kf = model_selection.KFold(n_splits = 5, shuffle = True)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[0 1 2 3 4 7 8 9] [5 6]
[0 1 3 4 5 6 8 9] [2 7]
[0 2 4 5 6 7 8 9] [1 3]
[1 2 3 5 6 7 8 9] [0 4]
[0 1 2 3 4 5 6 7] [8 9]


In [21]:
kf = model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [25]:
y = np.array([0] * 5 + [1] * 5)
print(y)

skf = model_selection.StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
for train_indices, test_indices in skf.split(X, y):
    print(train_indices, test_indices, y[test_indices])

[0 0 0 0 0 1 1 1 1 1]
[0 1 3 4 5 6 8 9] [2 7] [0 1]
[1 2 3 4 6 7 8 9] [0 5] [0 1]
[0 2 3 4 5 7 8 9] [1 6] [0 1]
[0 1 2 4 5 6 7 9] [3 8] [0 1]
[0 1 2 3 5 6 7 8] [4 9] [0 1]


In [52]:
for train_indices, test_indices in skf.split(iris.data, iris.target):
    print(train_indices, test_indices)

[  0   1   3   5   6   7   8   9  12  13  14  15  16  17  18  19  20  21
  23  24  25  26  29  30  32  33  34  35  36  37  39  40  42  43  44  45
  46  47  48  49  50  51  53  55  56  57  58  59  62  63  64  65  66  67
  68  69  70  71  73  74  75  76  79  80  82  83  84  85  86  87  89  90
  92  93  94  95  96  97  98  99 100 101 103 105 106 107 108 109 112 113
 114 115 116 117 118 119 120 121 123 124 125 126 129 130 132 133 134 135
 136 137 139 140 142 143 144 145 146 147 148 149] [  2   4  10  11  22  27  28  31  38  41  52  54  60  61  72  77  78  81
  88  91 102 104 110 111 122 127 128 131 138 141]
[  0   1   2   3   4   5   6   8   9  10  11  12  13  15  16  17  19  20
  21  22  23  24  25  27  28  30  31  32  36  37  38  39  40  41  42  43
  44  46  47  49  50  51  52  53  54  55  56  58  59  60  61  62  63  65
  66  67  69  70  71  72  73  74  75  77  78  80  81  82  86  87  88  89
  90  91  92  93  94  96  97  99 100 101 102 103 104 105 106 108 109 110
 111 112 113 115 116 117

In [61]:
x_train = iris.data[train_indices]
y_train = iris.target[train_indices]

In [63]:
len(x_train) == len(y_train)

True

In [67]:
target = np.array([0, 1] * 5)
print(target)

skf = model_selection.StratifiedKFold(n_splits = 2,shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[0 4 5 9] [1 2 3 6 7 8]
[1 2 3 6 7 8] [0 4 5 9]


#### ShuffleSplit

In [68]:
ss = model_selection.ShuffleSplit(n_splits = 10, test_size = 0.2)

for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)

[2 6 8 0 3 4 1 7] [9 5]
[8 9 5 2 1 3 0 7] [6 4]
[3 5 6 0 1 9 7 2] [4 8]
[5 9 2 3 4 8 6 1] [0 7]
[8 7 1 0 5 2 4 9] [6 3]
[9 2 0 5 1 4 8 3] [6 7]
[5 3 8 4 0 2 9 7] [1 6]
[6 8 4 0 2 9 7 5] [3 1]
[3 8 7 0 4 6 2 1] [5 9]
[2 4 6 0 8 3 9 5] [1 7]


#### StratifiedShuffleSplit

In [28]:
target = np.array([0] * 5 + [1] * 5)
print(target)

sss = model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[8 0 5 3 4 7 9 2] [1 6]
[7 1 4 5 2 6 9 3] [8 0]
[6 3 8 7 4 2 0 5] [1 9]
[0 2 1 6 3 5 8 9] [4 7]


#### Leave-One-Out

In [72]:
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    print(train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators