In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits

digits = load_digits()

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(digits.data,digits.target,test_size = 0.3)

In [14]:
len(X_train)

1257

In [15]:
len(X_test)

540

In [16]:
lr = LogisticRegression(solver='lbfgs', max_iter=10000)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9703703703703703

In [17]:
svm = SVC()
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.9907407407407407

In [18]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9833333333333333

<b>The problem with train_test_split is that it may not divide data properly and hence is not very consistent </b>

In [19]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [21]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


<b> We can see that each time it splits the data differently for training and testing the model. Hence we can use mupltiple kind of training and testing samples to make sure there is no bias towards certain kind of data and increase randomness </b>

In [22]:
 def get_score(model, X_train, X_test, y_train, y_test):
        model.fit(X_train, y_train)
        return model.score(X_test, y_test)

<b> KFold is a cross-validator that divides the dataset into k folds. Stratified is to ensure that each fold of dataset has the same proportion of observations with a given label. </b>

In [24]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

In [32]:
scores_lr = []
scores_svm = []
scores_rf = []

for train_index, test_index in kf.split(digits.data):
    X_train,X_test,y_train,y_test = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]
    scores_lr.append(get_score(LogisticRegression(max_iter=10000), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))

In [33]:
scores_lr

[0.9282136894824707, 0.9415692821368948, 0.9165275459098498]

In [34]:
scores_svm

[0.9666110183639399, 0.9816360601001669, 0.9549248747913188]

In [35]:
scores_rf

[0.9382303839732888, 0.9449081803005008, 0.9165275459098498]

<b> SciKitLearn library offers us methods to do all this with lesser amount of code </b>

In [36]:
from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(max_iter=10000),digits.data,digits.target)

array([0.925     , 0.87777778, 0.93871866, 0.93314763, 0.89693593])