## K-fold Cross Validation

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits

In [2]:
digits = load_digits()

In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(digits.data,digits.target,test_size=0.3,random_state=52)

### Training different models simply on training dataset

In [4]:
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.9648148148148148

In [5]:
svm = SVC()
svm.fit(X_train,y_train)
svm.score(X_test,y_test)

0.9888888888888889

In [6]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test,y_test)

0.975925925925926

### Using K-fold

In [7]:
from sklearn.model_selection import KFold

In [8]:
kf = KFold(n_splits=5)
kf

KFold(n_splits=5, random_state=None, shuffle=False)

##### what actually we are trying to do 

In [9]:
for train_index, test_index in kf.split(np.arange(100,115)):
    print(train_index,test_index)
# notice that it splits indices and not the values of the dataset provided.

[ 3  4  5  6  7  8  9 10 11 12 13 14] [0 1 2]
[ 0  1  2  6  7  8  9 10 11 12 13 14] [3 4 5]
[ 0  1  2  3  4  5  9 10 11 12 13 14] [6 7 8]
[ 0  1  2  3  4  5  6  7  8 12 13 14] [ 9 10 11]
[ 0  1  2  3  4  5  6  7  8  9 10 11] [12 13 14]


In [10]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    return model.score(X_test,y_test)

#### StratifiedKFold
StratifiedKFold takes the cross validation one step further. The class distribution in the dataset is preserved in the training and test splits.

In [11]:
from sklearn.model_selection import StratifiedKFold

In [12]:
skf = StratifiedKFold(n_splits=5)
skf

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [13]:
scores_lr = []
scores_svm = []
scores_rf = []

for train_index, test_index in kf.split(digits.data):
    X_train,X_test,y_train,y_test = digits.data[train_index],digits.data[test_index],digits.target[train_index],digits.target[test_index]
    
    scores_lr.append(get_score(lr,X_train,X_test,y_train,y_test))
    scores_svm.append(get_score(svm,X_train,X_test,y_train,y_test))    
    scores_rf.append(get_score(rf,X_train,X_test,y_train,y_test))  

In [14]:
print('LogisticRegression =', scores_lr)
print('\nSVC =',scores_svm)
print('\nRandomForestClassifier =',scores_lr)

LogisticRegression = [0.9305555555555556, 0.8777777777777778, 0.9415041782729805, 0.9387186629526463, 0.8997214484679665]

SVC = [0.9694444444444444, 0.9472222222222222, 0.9832869080779945, 0.9888579387186629, 0.9415041782729805]

RandomForestClassifier = [0.9305555555555556, 0.8777777777777778, 0.9415041782729805, 0.9387186629526463, 0.8997214484679665]


In [15]:
print('LogisticRegression =', np.mean(scores_lr))
print('\nSVC =',np.mean(scores_svm))
print('\nRandomForestClassifier =',np.mean(scores_lr))

LogisticRegression = 0.9176555246053854

SVC = 0.9660631383472609

RandomForestClassifier = 0.9176555246053854


#### Now doing the above work with bulit-in library 'cross_val_score' in sklearn

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
cross_val_score(lr,digits.data,digits.target)

array([0.925     , 0.87777778, 0.93871866, 0.93314763, 0.89693593])

In [18]:
print('LogisticRegression =', np.mean(cross_val_score(lr,digits.data,digits.target)))
print('\nSVC =',np.mean(cross_val_score(svm,digits.data,digits.target)))
print('\nRandomForestClassifier =',np.mean(cross_val_score(rf,digits.data,digits.target)))

LogisticRegression = 0.9143160012380068

SVC = 0.9632838130609718

RandomForestClassifier = 0.9382466728567007
