# Stratified K-Fold Cross-Validation

In [7]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the Breast Cancer Wisconsin dataset straight from the UCI repository.
# The file is read with header=None, so columns are addressed by integer label.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

# Column 1 is used as the class label and is encoded to integers;
# columns 2 onward form the feature matrix.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)
X = df.loc[:, 2:].values

# Hold out 20% of the samples as a test set. stratify=y keeps the class
# proportions the same in both parts, and random_state=1 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=1
)



# Model under evaluation: standardize the features, project them onto two
# principal components, then fit a logistic regression classifier.
pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(),
)

# Ten stratified folds over the training set only. Note that .split() returns
# a one-shot generator, so `kfold` is exhausted once the loop below finishes.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []

for k, (train, test) in enumerate(kfold):
    # `train` / `test` are index arrays into X_train / y_train for this fold.
    # fit() returns the fitted pipeline, so it can be scored in the same expression.
    score = pipeline.fit(X_train[train], y_train[train]).score(
        X_train[test], y_train[test]
    )
    scores.append(score)
    print(f'Fold {k+1}, Class distribution: {np.bincount(y_train[train])} Acc: {score}')

# Summarize the per-fold accuracies (np.std defaults to the population form, ddof=0).
mean_acc, std_acc = np.mean(scores), np.std(scores)
print(f'Mean accuracy score: {mean_acc:.3f} +/- {std_acc:.3f}')

Fold 1, Class distribution: [256 153] Acc: 0.9347826086956522
Fold 2, Class distribution: [256 153] Acc: 0.9347826086956522
Fold 3, Class distribution: [256 153] Acc: 0.9565217391304348
Fold 4, Class distribution: [256 153] Acc: 0.9565217391304348
Fold 5, Class distribution: [256 153] Acc: 0.9347826086956522
Fold 6, Class distribution: [257 153] Acc: 0.9555555555555556
Fold 7, Class distribution: [257 153] Acc: 0.9777777777777777
Fold 8, Class distribution: [257 153] Acc: 0.9333333333333333
Fold 9, Class distribution: [257 153] Acc: 0.9555555555555556
Fold 10, Class distribution: [257 153] Acc: 0.9555555555555556
Mean accuracy score: 0.950 +/- 0.014


# Stratified K-Fold Cross-Validation with Scikit-Learn's `cross_val_score`

In [9]:
# Repeat the 10-fold evaluation with scikit-learn's cross_val_score helper
# (imported in the notebook's import cell) instead of a hand-written loop.
# The result is an array holding one accuracy value per fold.
# NOTE(review): per the scikit-learn docs, an integer `cv` with a classifier
# selects stratified folds, which is why the numbers are expected to match
# the manual StratifiedKFold loop.
scores = cross_val_score(pipeline, X_train, y_train, cv=10, n_jobs=1)

print(f'CV accuracy scores: {scores}')

print(f'Mean accuracy score: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

CV accuracy scores: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]
Mean accuracy score: 0.950 +/- 0.014
