# Chapter04 模型选择与评估

## 4.1 数据集划分方法

- K折交叉验证
    - KFold
    - GroupKFold
    - StratifiedKFold(分层K折交叉验证)
- 留一法
    - LeaveOneOut(LOO)
    - LeaveOneGroupOut
    - LeavePOut
    - LeavePGroupsOut
- 随机划分
    - ShuffleSplit
    - GroupShuffleSplit
    - StratifiedShuffleSplit(分层随机划分)

In [5]:
# KFold

# import packages
import numpy as np
from sklearn.model_selection import KFold

# Toy dataset: six samples with two features each, and one label per sample.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

# KFold parameters:
#   n_splits     - how many folds the samples are partitioned into.
#   shuffle      - if True, the sample order is shuffled before the folds are
#                  cut; if False, each fold is a consecutive slice of the data.
#   random_state - seed for that shuffle, so it only matters when shuffle=True
#                  (NOTE: recent scikit-learn releases are expected to reject a
#                  seed combined with shuffle=False - confirm for your version).
# Shuffling is switched on so the folds are not just the first and second half
# of the data. No seed is given, so the exact indices vary from run to run.
kf = KFold(n_splits=2, shuffle=True)
kf.get_n_splits(X)  # returns the number of splits; the value is not used here
print(kf)

# Iterate over the splits and print the index arrays of each one.
for train_index, test_index in kf.split(X):
    print('Train_index:', train_index, ", Test_index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# "K-fold" means the data is partitioned into K parts (folds); every fold is
# used exactly once as the test set while the other K-1 folds form the
# training set. With n_splits=2 the two parts simply happen to be halves.

KFold(n_splits=2, random_state=None, shuffle=True)
Train_index: [0 3 5] , Test_index: [1 2 4]
Train_index: [1 2 4] , Test_index: [0 3 5]


In [12]:
# GroupKFold
# 设置groups，同一个groups的样本划分到同一个fold中，不能拆开

from sklearn.model_selection import GroupKFold

# Toy dataset: six samples with two features each, and one label per sample.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

# One group id per sample. Ids must repeat for grouping to be visible: here
# samples (0, 1), (2, 3) and (4, 5) form three groups of two. GroupKFold never
# places samples of the same group on both sides of a split.
groups = np.array([1, 1, 2, 2, 3, 3])
group_kf = GroupKFold(n_splits=2)
group_kf.get_n_splits(X, y, groups)  # returns the number of splits; unused here
print(group_kf)

# Iterate over the splits and print the index arrays of each one; paired
# indices (0/1, 2/3, 4/5) always show up together in either train or test.
for train_index, test_index in group_kf.split(X, y, groups):
    print('Train_index:', train_index, ", Test_index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


GroupKFold(n_splits=2)
Train_index: [2 3] , Test_index: [0 1 4 5]
Train_index: [0 1 4 5] , Test_index: [2 3]


In [15]:
# StratifiedKFold(分层K折交叉验证)
# 根据目标变量的取值分布，进行分层划分，保证训练集测试集的分布与总体一致

from sklearn.model_selection import StratifiedKFold

# Toy dataset: rows [1, 2] .. [11, 12]; the first three samples belong to
# class 1 and the last three to class 2.
X = np.arange(1, 13).reshape(6, 2)
y = np.repeat([1, 2], 3)

# Three stratified folds over the two classes.
skf = StratifiedKFold(n_splits=3)
skf.get_n_splits(X, y)
print(skf)

# Walk through the splits, print the indices and slice out the subsets.
for train_index, test_index in skf.split(X, y):
    print("Train_index:", train_index, ", Test_index:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

# In effect, stratified K-fold runs a K-fold inside every class (stratum) and
# then combines the matching folds of all classes into one train/test split.


StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
Train_index: [1 2 4 5] , Test_index: [0 3]
Train_index: [0 2 3 5] , Test_index: [1 4]
Train_index: [0 1 3 4] , Test_index: [2 5]


In [17]:
# LeaveOneOut

from sklearn.model_selection import LeaveOneOut

# Toy dataset: rows [1, 2] .. [11, 12] with labels 1 .. 6.
X = np.arange(1, 13).reshape(6, 2)
y = np.arange(1, 7)

loo = LeaveOneOut()
loo.get_n_splits(X)
print(loo)

# Walk through the splits, print the indices and slice out the subsets.
for train_index, test_index in loo.split(X, y):
    print("Train_index:", train_index, ", Test_index:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

# Each split holds out exactly one sample as the test set, so there are as
# many splits as samples. Easy to understand, but rarely practical because
# fitting one model per sample is computationally expensive.


LeaveOneOut()
Train_index: [1 2 3 4 5] , Test_index: [0]
Train_index: [0 2 3 4 5] , Test_index: [1]
Train_index: [0 1 3 4 5] , Test_index: [2]
Train_index: [0 1 2 4 5] , Test_index: [3]
Train_index: [0 1 2 3 5] , Test_index: [4]
Train_index: [0 1 2 3 4] , Test_index: [5]


In [19]:
# LeavePOut

from sklearn.model_selection import LeavePOut

# Toy dataset: rows [1, 2] .. [11, 12] with labels 1 .. 6.
X = np.arange(1, 13).reshape(6, 2)
y = np.arange(1, 7)

# Hold out three samples per split.
lpo = LeavePOut(p=3)
lpo.get_n_splits(X)
print(lpo)

# Walk through the splits, print the indices and slice out the subsets.
for train_index, test_index in lpo.split(X, y):
    print("Train_index:", train_index, "Test_index:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

# A variant of leave-one-out: p samples are held out as the test set, and the
# splits enumerate every possible combination of p samples.


LeavePOut(p=3)
Train_index: [3 4 5] Test_index: [0 1 2]
Train_index: [2 4 5] Test_index: [0 1 3]
Train_index: [2 3 5] Test_index: [0 1 4]
Train_index: [2 3 4] Test_index: [0 1 5]
Train_index: [1 4 5] Test_index: [0 2 3]
Train_index: [1 3 5] Test_index: [0 2 4]
Train_index: [1 3 4] Test_index: [0 2 5]
Train_index: [1 2 5] Test_index: [0 3 4]
Train_index: [1 2 4] Test_index: [0 3 5]
Train_index: [1 2 3] Test_index: [0 4 5]
Train_index: [0 4 5] Test_index: [1 2 3]
Train_index: [0 3 5] Test_index: [1 2 4]
Train_index: [0 3 4] Test_index: [1 2 5]
Train_index: [0 2 5] Test_index: [1 3 4]
Train_index: [0 2 4] Test_index: [1 3 5]
Train_index: [0 2 3] Test_index: [1 4 5]
Train_index: [0 1 5] Test_index: [2 3 4]
Train_index: [0 1 4] Test_index: [2 3 5]
Train_index: [0 1 3] Test_index: [2 4 5]
Train_index: [0 1 2] Test_index: [3 4 5]


In [30]:
# ShuffleSplit

from sklearn.model_selection import ShuffleSplit

# Toy dataset: rows [1, 2] .. [11, 12]; the first three samples belong to
# class 1 and the last three to class 2.
X = np.arange(1, 13).reshape(6, 2)
y = np.repeat([1, 2], 3)

# Settings shared by both splitters below.
shared_params = dict(n_splits=3, test_size=0.25, random_state=0)

# First run: only test_size is given, so train_size keeps its default.
ss = ShuffleSplit(**shared_params)
ss.get_n_splits(X)
print(ss)

# Print the indices of every random split.
for train_index, test_index in ss.split(X, y):
    print("Train_index:", train_index, ", Test_index:", test_index)

print('=============================================')

# Second run: train_size is set explicitly as well. train_size + test_size
# must not exceed 1, but the sum may be smaller than 1.
ss = ShuffleSplit(train_size=0.5, **shared_params)
print(ss)
for train_index, test_index in ss.split(X, y):
    print("Train_index:", train_index, ", Test_index:", test_index)


ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
Train_index: [1 3 0 4] , Test_index: [5 2]
Train_index: [4 0 2 5] , Test_index: [1 3]
Train_index: [1 2 4 0] , Test_index: [3 5]
=============================================
ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=0.5)
Train_index: [1 3 0] , Test_index: [5 2]
Train_index: [4 0 2] , Test_index: [1 3]
Train_index: [1 2 4] , Test_index: [3 5]


In [27]:
# StratifiedShuffleSplit

from sklearn.model_selection import StratifiedShuffleSplit

# Toy dataset: rows [1, 2] .. [11, 12]; the first three samples belong to
# class 1 and the last three to class 2.
X = np.arange(1, 13).reshape(6, 2)
y = np.repeat([1, 2], 3)

# Five seeded random splits that keep the class proportions in each side.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
print(sss)

# Print the indices of every split.
for train_index, test_index in sss.split(X, y):
    print("Train_index:", train_index, "Test_index:", test_index)

# Stratified shuffle-split is looser than stratified K-fold: every split is
# drawn independently, so test sets may overlap or even repeat across splits.
# K-fold, even with shuffle=True, still partitions the data so that each
# sample is used for testing exactly once.

StratifiedShuffleSplit(n_splits=5, random_state=0, test_size=0.25,
            train_size=None)
Train_index: [2 5 1 3] Test_index: [0 4]
Train_index: [0 4 3 1] Test_index: [2 5]
Train_index: [0 4 3 1] Test_index: [2 5]
Train_index: [5 4 1 2] Test_index: [0 3]
Train_index: [1 5 2 4] Test_index: [0 3]
