#### Cross Validation
- K-Fold

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Load the iris dataset and separate the feature matrix from the class labels.
iris = load_iris()
features, labels = iris.data, iris.target

# Splitter and classifier used by the cross-validation loop; the classifier's
# random_state is fixed at 156.
kfold = KFold(n_splits=5)
dt_clf = DecisionTreeClassifier(random_state=156)

# Collects one accuracy score per fold.
cv_accuracy = list()

print(f"iris dataset size : {features.shape[0]}")
print(kfold)

iris dataset size : 150
KFold(n_splits=5, random_state=None, shuffle=False)


In [26]:
# Start from an empty score list on every run of this cell. Without this,
# re-running the cell (or running it after `kfold` has been rebound to a
# different splitter) would mix scores from earlier runs into the average.
cv_accuracy = []

# `n_iter` is the 1-based fold number used in the printed report.
for n_iter, (train_index, test_index) in enumerate(kfold.split(features), start=1):
    # Slice out this fold's training and validation rows.
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Fit on the training rows, then predict the held-out rows.
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Accuracy on the held-out rows, rounded to 4 decimals. The rounded value
    # is also what gets averaged below.
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print(f"# {n_iter} K Fold accurancy score {accuracy} \n#Training data size {train_size} \n#Testing data size {test_size}")
    print(f"# {n_iter} validation index \n{test_index}")
    print(f"# Training index \n{train_index}")

    cv_accuracy.append(accuracy)

# Mean of the per-fold (rounded) accuracies.
print(f"average accurancy validation {np.mean(cv_accuracy)}")

# 1 K Fold accurancy score 1.0 
#Training data size 120 
#Testing data size 30
# 1 validation index 
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
# Training index 
[ 30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
# 2 K Fold accurancy score 0.9667 
#Training data size 120 
#Testing data size 30
# 2 validation index 
[30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
 54 55 56 57 58 59]
# Training index 
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20

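##### Stratified K-Fold

Plain `KFold` without shuffling cuts the rows into consecutive blocks. Because the iris rows are ordered by class (50 rows each of labels 0, 1 and 2), a 3-fold split puts exactly one class in each test fold and leaves that class out of the training data, as the label counts below show.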

In [27]:
# Build a DataFrame of the iris features plus a "label" column holding the
# class id of each row, so the class balance can be inspected with pandas.
iris = load_iris()

iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df["label"] = iris.target

# Number of rows per class (displayed as the cell's output).
iris_df["label"].value_counts()

0    50
1    50
2    50
Name: label, dtype: int64

In [33]:
# Split the iris frame into 3 folds with a plain KFold and report, for each
# fold, how many rows of every class end up in the training and testing parts.
kfold = KFold(n_splits=3)

# `n_iter` is the 1-based fold number shown in the report.
for n_iter, (train_index, test_index) in enumerate(kfold.split(iris_df), start=1):
    # Positional selection of this fold's labels.
    label_train = iris_df["label"].iloc[train_index]
    label_test = iris_df["label"].iloc[test_index]

    print("=" * 30)
    print(f"Cross validation {n_iter}th")
    print(f"Distribution of label data for training \n{label_train.value_counts()}")
    print(f"Distribution of label data for testing \n{label_test.value_counts()}")

Cross validation 1th
Distribution of label data for training 
1    50
2    50
Name: label, dtype: int64
Distribution of label data for testing 
0    50
Name: label, dtype: int64
Cross validation 2th
Distribution of label data for training 
0    50
2    50
Name: label, dtype: int64
Distribution of label data for testing 
1    50
Name: label, dtype: int64
Cross validation 3th
Distribution of label data for training 
0    50
1    50
Name: label, dtype: int64
Distribution of label data for testing 
2    50
Name: label, dtype: int64
