## 数据集划分方法

### k 折交叉验证及其变体

In [4]:
import  numpy as np
from sklearn.model_selection import KFold

# Toy dataset: six samples with two features each (values 1..12), plus labels.
data = np.arange(1, 13).reshape(6, 2)
target = np.repeat([1, 2], 3)

# Plain k-fold: split() is given only the samples, the labels are not passed.
kf = KFold(n_splits=3)

for train_index, test_index in kf.split(data):
    # Show which row indices land in the training and test part of each fold.
    print("train_index: ", train_index, "test_index: ", test_index)

train_index:  [2 3 4 5] test_index:  [0 1]
train_index:  [0 1 4 5] test_index:  [2 3]
train_index:  [0 1 2 3] test_index:  [4 5]


In [5]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Toy dataset: six samples with two features each, three samples per class.
data = np.arange(1, 13).reshape(6, 2)
target = np.repeat([1, 2], 3)

# Stratified k-fold: split() receives the labels as its second argument.
sfk = StratifiedKFold(n_splits=3)

for train_index, test_index in sfk.split(data, target):
    # Show which row indices land in the training and test part of each fold.
    print("train_index: ", train_index, "test_index: ", test_index)

train_index:  [1 2 4 5] test_index:  [0 3]
train_index:  [0 2 3 5] test_index:  [1 4]
train_index:  [0 1 3 4] test_index:  [2 5]


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

# Toy dataset: six samples, six distinct targets.
data = np.arange(1, 13).reshape(6, 2)
target = np.arange(1, 7)
# One group id per sample; note that all six ids are different here.
groups = np.array([0, 1, 2, 4, 5, 6])

group_kfold = GroupKFold(n_splits=3)

for train_index, test_index in group_kfold.split(data, target, groups):
    # Print, in order: train features, train targets, test features, test targets.
    for part in (data[train_index, :], target[train_index],
                 data[test_index, :], target[test_index]):
        print(pd.DataFrame(part))
    print("------------------------------------------------")


   0   1
0  1   2
1  3   4
2  7   8
3  9  10
   0
0  1
1  2
2  4
3  5
    0   1
0   5   6
1  11  12
   0
0  3
1  6
------------------------------------------------
    0   1
0   1   2
1   5   6
2   7   8
3  11  12
   0
0  1
1  3
2  4
3  6
   0   1
0  3   4
1  9  10
   0
0  2
1  5
------------------------------------------------
    0   1
0   3   4
1   5   6
2   9  10
3  11  12
   0
0  2
1  3
2  5
3  6
   0  1
0  1  2
1  7  8
   0
0  1
1  4
------------------------------------------------


### 留1/P法及其变体

In [7]:
import numpy as np
from sklearn.model_selection import LeaveOneOut

# Toy dataset: six samples with two features each, six distinct targets.
data = np.arange(1, 13).reshape(6, 2)
target = np.arange(1, 7)

loo = LeaveOneOut()
# Query the number of splits for this dataset; the returned value is not used.
loo.get_n_splits(data)

for train_index, test_index in loo.split(data):
    # Show which row indices land in the training and test part of each split.
    print("train_index: ", train_index, "test_index: ", test_index)


train_index:  [1 2 3 4 5] test_index:  [0]
train_index:  [0 2 3 4 5] test_index:  [1]
train_index:  [0 1 3 4 5] test_index:  [2]
train_index:  [0 1 2 4 5] test_index:  [3]
train_index:  [0 1 2 3 5] test_index:  [4]
train_index:  [0 1 2 3 4] test_index:  [5]


### 随机划分及其变体

In [8]:
import numpy as np
from sklearn.model_selection import ShuffleSplit

# Toy dataset: six samples with two features each, six distinct targets.
data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
target = np.array([1, 2, 3, 4, 5, 6])

# Three independent random train/test partitions (2/3 train, 1/3 test).
# random_state is fixed so that re-running the cell yields the same splits.
ss = ShuffleSplit(n_splits=3,
                  test_size=1/3,
                  train_size=2/3,
                  random_state=42)

for train_index, test_index in ss.split(data):
    # Show which row indices land in the training and test part of each split.
    print("train_index: ", train_index, "test_index: ", test_index)


train_index:  [0 1 4 3] test_index:  [2 5]
train_index:  [2 0 5 1] test_index:  [4 3]
train_index:  [5 0 3 1] test_index:  [2 4]


## 超参数优化方法

In [3]:
import numpy as np
import pandas as pd
from time import time
from scipy.stats import randint as hp_randint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def report(results, n_top=3):
    """Print the ``n_top`` best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping or fitted search object
        Either a ``cv_results_``-style mapping (column name -> values) or an
        object that exposes such a mapping as its ``cv_results_`` attribute.
        It must provide the columns ``rank_test_score``, ``std_test_score``,
        ``mean_test_score`` and ``params``.
    n_top : int, default 3
        Number of best-ranked rows to show.

    Returns
    -------
    None
        The summary table is printed transposed (one column per candidate).
    """
    # Use the argument that was passed in rather than a notebook-global search
    # object, so the function works regardless of what globals exist.
    cv_results = getattr(results, 'cv_results_', results)
    ranked = pd.DataFrame(cv_results).sort_values(by=['rank_test_score']).head(n_top)
    # reset_index() keeps the original candidate position as an 'index' column.
    summary = ranked.loc[:, ['rank_test_score', 'std_test_score', 'mean_test_score', 'params']].reset_index()
    print(summary.T)



data, target = load_digits(return_X_y=True)

# Hold out a third of the samples for the final accuracy check.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.33,
    random_state=42)

# Seeded so that the forest itself is reproducible across runs.
rf = RandomForestClassifier(random_state=42)

# Search space. scipy's randint(low, high) draws integers uniformly from
# [low, high): the upper bound is excluded.
pararmeters = {
    'criterion': ["gini", "entropy"],
    'max_features': hp_randint(1, 10),  # uniform over 1..9
    'max_depth': hp_randint(1, 4),      # uniform over 1..3
}

# Evaluate 10 randomly sampled parameter settings; random_state fixes which
# settings are drawn from the distributions above.
clf = RandomizedSearchCV(rf,
                         pararmeters,
                         n_iter=10,
                         n_jobs=4,
                         random_state=42)

clf.fit(data_train, target_train)

# predict() on the fitted search uses the estimator refitted with the best
# parameters found; score those predictions on the held-out data.
accuracy_score(target_test, clf.predict(data_test))



0.81144781144781142

## 模型验证方法

### 对每个输入点（一个超参数每个取值）产生交叉验证计算指标

In [3]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

data, target = load_digits(return_X_y=True)

# np.logspace(-10, -1, 10) yields 10^-10, 10^-9, 10^-8, ..., 10^-2, 10^-1
for i, c in enumerate(np.logspace(-10, -1, 10), start=1):
    clf = SVC(C=c, kernel='linear')
    # Request 3 folds explicitly: the default number of folds depends on the
    # scikit-learn version (3 in old releases, 5 since 0.22). With an integer
    # cv and a classifier, the folds are stratified (StratifiedKFold).
    print(i, cross_val_score(clf, data, target, cv=3, n_jobs=4))

1 [ 0.1013289   0.10183639  0.26342282]


2 [ 0.1013289   0.10183639  0.26342282]


3 [ 0.1013289   0.10183639  0.26342282]


4 [ 0.1013289   0.10183639  0.26342282]


5 [ 0.1013289   0.10183639  0.26342282]


6 [ 0.71594684  0.7245409   0.7147651 ]


7 [ 0.93355482  0.94824708  0.91778523]


8 [ 0.94518272  0.96994992  0.93959732]


9 [ 0.93521595  0.95826377  0.93791946]


10 [ 0.93521595  0.95826377  0.93791946]


### 对每个输入点（一个超参数每个取值）产生交叉验证预测结果

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict

data, target = load_digits(return_X_y=True)

# cross_val_predict() is the counterpart of cross_val_score(): instead of a
# score per fold it returns the predicted label for every sample.
# Taking 3-fold cross-validation as an example:
#   - train on parts 1 and 2, predict part 3;
#   - train on parts 1 and 3, predict part 2;
#   - train on parts 2 and 3, predict part 1;
# the three sets of predictions are then concatenated and returned.
for i, c in enumerate(np.logspace(-10, -1, 10), start=1):
    clf = SVC(kernel='linear', C=c)
    print(i, cross_val_predict(clf, data, target, n_jobs=4))


### 计算并绘制模型的验证曲线

In [4]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

digits = load_digits()
data, target = digits.data, digits.target

# Ten values of C, logarithmically spaced from 1e-10 to 1e-1.
param_range = np.logspace(-10, -1, 10)

# One row of scores per value of C, one column per fold (cv=3).
train_scores, test_scores = validation_curve(
    SVC(kernel='linear'),
    data,
    target,
    param_name='C',
    param_range=param_range,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# Training scores, a separator line, then the validation scores.
print(train_scores,
      '------------------------------------------------',
      test_scores,
      sep='\n')

[[ 0.10209205  0.10183639  0.28726062]
 [ 0.10209205  0.10183639  0.28726062]
 [ 0.10209205  0.10183639  0.28726062]
 [ 0.10209205  0.10183639  0.28726062]
 [ 0.10209205  0.10183639  0.28726062]
 [ 0.73138075  0.73539232  0.74771024]
 [ 0.96401674  0.95993322  0.97502082]
 [ 0.98995816  0.99081803  0.99500416]
 [ 0.99832636  1.          1.        ]
 [ 1.          1.          1.        ]]
------------------------------------------------
[[ 0.1013289   0.10183639  0.26342282]
 [ 0.1013289   0.10183639  0.26342282]
 [ 0.1013289   0.10183639  0.26342282]
 [ 0.1013289   0.10183639  0.26342282]
 [ 0.1013289   0.10183639  0.26342282]
 [ 0.71594684  0.7245409   0.7147651 ]
 [ 0.93355482  0.94824708  0.91778523]
 [ 0.94518272  0.96994992  0.93959732]
 [ 0.93521595  0.95826377  0.93791946]
 [ 0.93521595  0.95826377  0.93791946]]


### 计算并绘制学习曲线

In [34]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.datasets import load_iris
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold

# The iris data is used here instead of digits.
# Calling learning_curve() on the unshuffled iris data raised
# "ValueError: The number of classes has to be greater than one": with the
# default relative train sizes (np.linspace(0.1, 1.0, 5)) the first 10 % of the
# training samples all share the same target. Hence the shuffle below.
iris = load_iris()
data, target = iris.data, iris.target
data_target = pd.concat(
    [pd.DataFrame(data, columns=iris.feature_names),
     pd.DataFrame(target, columns=['target'])],
    axis=1)

# Shuffle the rows: sample(frac=1) draws every row in random order, and
# reset_index(drop=True) renumbers them without keeping the old index as a
# column. random_state is fixed so the shuffle is the same on every run.
data_target = data_target.sample(frac=1, random_state=42).reset_index(drop=True)

svc = SVC(gamma=0.001)

# Notes from experimenting with StratifiedKFold(n_splits=3, shuffle=True) on
# these 150 samples: the observed train/test sizes were 99/51, 99/51 and
# 102/48 rather than exactly 100/50. Because of that, absolute sizes such as
# train_sizes=[20, 40, 60, 80, 100] were rejected with
#   "train_sizes has been interpreted as absolute numbers of training samples
#    and must be within (0, 99], but is within [20, 100]"
# so the default (relative) train_sizes are used instead.
#
# cv=3 is passed explicitly because the default number of folds depends on the
# scikit-learn version (3 in old releases, 5 since 0.22).
train_sizes, train_scores, test_scores = learning_curve(
    estimator=svc,
    X=data_target.loc[:, iris.feature_names],
    y=data_target.loc[:, 'target'],
    cv=3,
    n_jobs=-1)

print('train_sizes: ', train_sizes)
print('train_scores: ')
print(train_scores)
print('test_scores: ')
print(test_scores)

train_sizes:  [ 9 32 54 76 99]
train_scores: 
[[ 0.55555556  0.88888889  0.88888889]
 [ 0.46875     0.40625     0.40625   ]
 [ 0.35185185  0.35185185  0.74074074]
 [ 0.63157895  0.61842105  0.39473684]
 [ 0.92929293  0.8989899   0.34343434]]
test_scores: 
[[ 0.33333333  0.66666667  0.66666667]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.66666667]
 [ 0.64705882  0.60784314  0.33333333]
 [ 0.8627451   0.92156863  0.33333333]]


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
