## k 折交叉验证及其变体

In [4]:
import  numpy as np
from sklearn.model_selection import KFold

# Toy dataset: six samples with two features each, plus a binary label.
data = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 10],
    [11, 12],
])
target = np.array([1, 1, 1, 2, 2, 2])

# Plain k-fold with three folds; only the samples are passed to split(),
# so the labels play no part in how the folds are formed.
kf = KFold(n_splits=3)

for train_index, test_index in kf.split(X=data):
    print("train_index: ", train_index, "test_index: ", test_index)

train_index:  [2 3 4 5] test_index:  [0 1]
train_index:  [0 1 4 5] test_index:  [2 3]
train_index:  [0 1 2 3] test_index:  [4 5]


In [5]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Toy dataset: six samples with two features each; the first three samples
# belong to class 1 and the last three to class 2.
data = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 10],
    [11, 12],
])
target = np.array([1, 1, 1, 2, 2, 2])

# Stratified k-fold takes the labels as well as the samples so that the
# folds can be balanced by class.
sfk = StratifiedKFold(n_splits=3)

for train_index, test_index in sfk.split(X=data, y=target):
    print("train_index: ", train_index, "test_index: ", test_index)

train_index:  [1 2 4 5] test_index:  [0 3]
train_index:  [0 2 3 5] test_index:  [1 4]
train_index:  [0 1 3 4] test_index:  [2 5]


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

# Toy dataset: six samples with two features each and one target per sample.
data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
target = np.array([1, 2, 3, 4, 5, 6])
# One group id per sample. Every id is distinct here, so each sample forms
# its own group.
groups = np.array([0, 1, 2, 4, 5, 6])

group_kfold = GroupKFold(n_splits=3)

# For every fold, show the training features/targets followed by the test
# features/targets, each wrapped in a DataFrame for tabular printing.
for train_index, test_index in group_kfold.split(data, target, groups):
    print(pd.DataFrame(data[train_index, :]))
    print(pd.DataFrame(target[train_index]))
    print(pd.DataFrame(data[test_index, :]))
    print(pd.DataFrame(target[test_index]))
    print("------------------------------------------------")


   0   1
0  1   2
1  3   4
2  7   8
3  9  10
   0
0  1
1  2
2  4
3  5
    0   1
0   5   6
1  11  12
   0
0  3
1  6
------------------------------------------------
    0   1
0   1   2
1   5   6
2   7   8
3  11  12
   0
0  1
1  3
2  4
3  6
   0   1
0  3   4
1  9  10
   0
0  2
1  5
------------------------------------------------
    0   1
0   3   4
1   5   6
2   9  10
3  11  12
   0
0  2
1  3
2  5
3  6
   0  1
0  1  2
1  7  8
   0
0  1
1  4
------------------------------------------------


## 留1/P法及其变体

In [7]:
import numpy as np
from sklearn.model_selection import LeaveOneOut

# Toy dataset: six samples with two features each and one target per sample.
data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
target = np.array([1, 2, 3, 4, 5, 6])

# Leave-one-out: each split holds out a single sample as the test set.
loo = LeaveOneOut()

for train_index, test_index in loo.split(data):
    print("train_index: ", train_index, "test_index: ", test_index)


train_index:  [1 2 3 4 5] test_index:  [0]
train_index:  [0 2 3 4 5] test_index:  [1]
train_index:  [0 1 3 4 5] test_index:  [2]
train_index:  [0 1 2 4 5] test_index:  [3]
train_index:  [0 1 2 3 5] test_index:  [4]
train_index:  [0 1 2 3 4] test_index:  [5]


## 随机划分及其变体


In [8]:
import numpy as np
from sklearn.model_selection import ShuffleSplit

# Toy dataset: six samples with two features each and one target per sample.
data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
target = np.array([1, 2, 3, 4, 5, 6])

# Three independent random splits, each using two thirds of the samples for
# training and one third for testing. No random_state is given, so the
# printed indices are expected to differ from run to run.
ss = ShuffleSplit(n_splits=3, test_size=1/3, train_size=2/3)

for train_index, test_index in ss.split(data):
    print("train_index: ", train_index, "test_index: ", test_index)


train_index:  [0 1 4 3] test_index:  [2 5]
train_index:  [2 0 5 1] test_index:  [4 3]
train_index:  [5 0 3 1] test_index:  [2 4]


## 超参数优化方法

In [3]:
import numpy as np
import pandas as pd
from time import time
from scipy.stats import randint as hp_randint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def report(results, n_top=3):
    """Print the ``n_top`` best-ranked candidates of a hyper-parameter search.

    Args:
        results: Either a ``cv_results_``-style mapping (column name ->
            sequence of values) or an object exposing such a mapping as its
            ``cv_results_`` attribute (e.g. a fitted search object).
        n_top: Number of best-ranked rows to display.

    The table is printed transposed, one column per candidate, with the
    rows ``index`` (position in the original results), ``rank_test_score``,
    ``std_test_score``, ``mean_test_score`` and ``params``.
    """
    # Work from the argument instead of a module-level search object so the
    # helper can be used with any search result.
    cv_results = getattr(results, "cv_results_", results)

    shown_columns = ['rank_test_score', 'std_test_score', 'mean_test_score', 'params']
    ranked = pd.DataFrame(cv_results).sort_values(by=['rank_test_score'])
    top = ranked.head(n_top).loc[:, shown_columns].reset_index()
    print(top.T)



# Load the digits dataset as a (features, labels) pair.
data, target = load_digits(return_X_y=True)

rf = RandomForestClassifier()

# Search space for the randomized search. List entries are candidate values;
# scipy's randint(low, high) draws integers uniformly from low to high - 1
# (the upper bound is exclusive).
pararmeters = {
    'criterion': ["gini", "entropy"],
    'max_features': hp_randint(1, 10),  # uniform over the integers 1..9
    'max_depth': hp_randint(1, 4),  # uniform over the integers 1..3
}

# Hold out 33% of the samples for the final evaluation; the fixed seed makes
# this split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.33,
    random_state=42,
)

# Try 10 sampled parameter settings, using 4 parallel jobs.
clf = RandomizedSearchCV(rf, pararmeters, n_iter=10, n_jobs=4)

clf.fit(data_train, target_train)

# Call predict on the estimator with the best found parameters (once; the
# predictions are reused for scoring).
target_pred = clf.predict(data_test)

# Left as the last expression so a notebook displays the held-out accuracy.
accuracy_score(target_test, target_pred)



0.81144781144781142