# 数据划分方法

In [3]:
import numpy as np
# hold-out
from sklearn.model_selection import train_test_split

# K折交叉验证
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

# K折分布保持交叉验证
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

# 时间序列划分方法
from sklearn.model_selection import TimeSeriesSplit

# booststrap 采样
from sklearn.utils import resample

In [4]:
X = np.zeros((20, 5))
Y = np.array([1]*5 + [2]*5 + [3]*5 + [4]*5)
print(X, Y)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]] [1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4]


In [5]:
# 直接按照比例拆分 (留出法)
train_X, val_X, train_y, val_y = train_test_split(X, Y, test_size = 0.2)
print(train_y, val_y)

[4 2 2 2 2 1 3 4 1 3 4 1 2 3 3 1] [4 3 4 1]


In [6]:
# 按照比例 & 标签分布划分
train_X, val_X, train_y, val_y = train_test_split(X, Y, test_size = 0.2, stratify=Y)
print(train_y, val_y)

[4 2 2 4 4 2 1 2 3 1 4 1 1 3 3 3] [2 3 4 1]


In [7]:
# K 折交叉验证
kf = KFold(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [0 1 2 3]
Label [1 1 1 1]

[ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19] [4 5 6 7]
Label [1 2 2 2]

[ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19] [ 8  9 10 11]
Label [2 2 3 3]

[ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19] [12 13 14 15]
Label [3 3 3 4]

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] [16 17 18 19]
Label [4 4 4 4]



In [8]:
# 重复KFold方法多次
kf = RepeatedKFold(n_splits=5, n_repeats=2)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 0  1  2  3  4  7  8 10 11 12 13 14 16 17 18 19] [ 5  6  9 15]
Label [2 2 2 4]

[ 0  1  2  4  5  6  7  8  9 10 14 15 16 17 18 19] [ 3 11 12 13]
Label [1 3 3 3]

[ 0  1  2  3  5  6  7  8  9 10 11 12 13 15 16 19] [ 4 14 17 18]
Label [1 3 4 4]

[ 0  2  3  4  5  6  9 11 12 13 14 15 16 17 18 19] [ 1  7  8 10]
Label [1 2 2 3]

[ 1  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18] [ 0  2 16 19]
Label [1 1 4 4]

[ 0  1  2  3  4  5  6  9 10 11 12 14 15 17 18 19] [ 7  8 13 16]
Label [2 2 3 4]

[ 0  1  3  4  5  6  7  8 12 13 14 15 16 17 18 19] [ 2  9 10 11]
Label [1 2 3 3]

[ 0  1  2  3  4  5  7  8  9 10 11 13 14 16 17 19] [ 6 12 15 18]
Label [2 3 4 4]

[ 2  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [0 1 3 4]
Label [1 1 1 1]

[ 0  1  2  3  4  6  7  8  9 10 11 12 13 15 16 18] [ 5 14 17 19]
Label [2 3 4 4]



In [9]:
# StratifiedKFold用法类似Kfold，但是他是分层采样，确保训练集，测试集中各类别样本的比例与原始数据集中相同。
kf = StratifiedKFold(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 1  2  3  4  6  7  8  9 11 12 13 14 16 17 18 19] [ 0  5 10 15]
Label [1 2 3 4]

[ 0  2  3  4  5  7  8  9 10 12 13 14 15 17 18 19] [ 1  6 11 16]
Label [1 2 3 4]

[ 0  1  3  4  5  6  8  9 10 11 13 14 15 16 18 19] [ 2  7 12 17]
Label [1 2 3 4]

[ 0  1  2  4  5  6  7  9 10 11 12 14 15 16 17 19] [ 3  8 13 18]
Label [1 2 3 4]

[ 0  1  2  3  5  6  7  8 10 11 12 13 15 16 17 18] [ 4  9 14 19]
Label [1 2 3 4]



In [10]:
# 重复多次
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 0  1  3  4  5  6  8  9 10 11 12 14 15 16 17 19] [ 2  7 13 18]
Label [1 2 3 4]

[ 0  1  2  4  5  7  8  9 11 12 13 14 15 16 17 18] [ 3  6 10 19]
Label [1 2 3 4]

[ 0  1  2  3  5  6  7  9 10 12 13 14 15 16 18 19] [ 4  8 11 17]
Label [1 2 3 4]

[ 0  2  3  4  5  6  7  8 10 11 12 13 16 17 18 19] [ 1  9 14 15]
Label [1 2 3 4]

[ 1  2  3  4  6  7  8  9 10 11 13 14 15 17 18 19] [ 0  5 12 16]
Label [1 2 3 4]

[ 1  2  3  4  6  7  8  9 10 11 12 13 15 16 17 18] [ 0  5 14 19]
Label [1 2 3 4]

[ 0  1  2  3  5  6  7  9 10 12 13 14 16 17 18 19] [ 4  8 11 15]
Label [1 2 3 4]

[ 0  1  3  4  5  7  8  9 11 12 13 14 15 16 18 19] [ 2  6 10 17]
Label [1 2 3 4]

[ 0  1  2  4  5  6  8  9 10 11 13 14 15 16 17 19] [ 3  7 12 18]
Label [1 2 3 4]

[ 0  2  3  4  5  6  7  8 10 11 12 14 15 17 18 19] [ 1  9 13 16]
Label [1 2 3 4]



In [9]:
# test集里的数据时间上必须是在train 集数据后面的，所以基于索引（时间）顺序
kf = TimeSeriesSplit(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[0 1 2 3 4] [5 6 7]
Label [2 2 2]

[0 1 2 3 4 5 6 7] [ 8  9 10]
Label [2 2 3]

[ 0  1  2  3  4  5  6  7  8  9 10] [11 12 13]
Label [3 3 3]

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13] [14 15 16]
Label [3 4 4]

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16] [17 18 19]
Label [4 4 4]



In [10]:
# 重采样
train_X, train_Y = resample(X, Y, n_samples=16)
val_X, val_Y = resample(X, Y, n_samples=4)
print(train_Y, val_Y)

[3 1 2 1 1 3 1 4 3 1 3 1 2 3 4 1] [2 2 3 3]


# 阅读链接

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://lightgbm.readthedocs.io/en/latest/Python-API.html
- https://xgboost.readthedocs.io/en/latest//python/index.html


- https://github.com/fmfn/BayesianOptimization