In [1]:
# It helps in selecting the best model and hyperparameters while minimizing overfitting.
# Cross-validation is commonly used in techniques like Grid Search or Random Search to find the best hyperparameters for a model
# Cross-validation helps compare multiple models and choose the one with the best generalization ability.
# Preventing Overfitting & Underfitting

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.datasets import load_digits

digits = load_digits()

In [78]:
from sklearn.model_selection import train_test_split

# you can also create dataframe(df) and then into X,y for your visibilty
X_train,X_test,y_train,y_test = train_test_split(digits.data,digits.target, test_size=.30)

### Logistic Regression



In [81]:
lr = LogisticRegression() ## giving error as: TOTAL NO. of ITERATIONS REACHED LIMIT.
lr = LogisticRegression(solver='saga',max_iter=1000)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.9666666666666667

### SVM

In [84]:
svm = SVC()
svm.fit(X_train,y_train)
svm.score(X_test,y_test)

0.9888888888888889

### Random Forest



In [89]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
rf.score(X_test,y_test)

0.9740740740740741

# KFold cross validation

Basic example


In [92]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [94]:
for train_index,test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index,test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [96]:
def get_score(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    return model.score(X_test,y_test)

In [98]:
# get_score(LogisticRegression(solver='saga',max_iter=1000),X_train,X_test,y_train,y_test)
# get_score(SVC(),X_train,X_test,y_train,y_test)
get_score(RandomForestClassifier(),X_train,X_test,y_train,y_test)

0.9703703703703703

In [100]:
from sklearn.model_selection import StratifiedKFold
folds =StratifiedKFold(n_splits=3)

In [130]:
'''

score_l =[]
score_svm =[]
score_rf =[]

for train_index, test_index in kf.split(digits.data):
    X_train,X_test,y_train,y_test = digits.data[train_index],digits.data[test_index], \
                                    digits.target[train_index], digits.target[test_index]
    score_l.append(get_score(LogisticRegression(solver='saga',max_iter=1000),X_train,X_test,y_train,y_test))
    score_svm.append(get_score(SVC(),X_train,X_test,y_train,y_test))
    score_rf.append(get_score(RandomForestClassifier(),X_train,X_test,y_train,y_test))

'''

"\nscore_l =[]\nscore_svm =[]\nscore_rf =[]\n\nfor train_index, test_index in kf.split(digits.data):\n    X_train,X_test,y_train,y_test = digits.data[train_index],digits.data[test_index],                                     digits.target[train_index], digits.target[test_index]\n    score_l.append(get_score(LogisticRegression(solver='saga',max_iter=1000),X_train,X_test,y_train,y_test))\n    score_svm.append(get_score(SVC(),X_train,X_test,y_train,y_test))\n    score_rf.append(get_score(RandomForestClassifier(),X_train,X_test,y_train,y_test))\n"

In [124]:
score_l

[0.9248747913188647, 0.9432387312186978, 0.9148580968280468]

In [118]:
score_svm

[0.9666110183639399, 0.9816360601001669, 0.9549248747913188]

In [120]:
score_rf

[0.9398998330550918, 0.9549248747913188, 0.9131886477462438]

# cross_val_score function

In [166]:
# you can useinbuild function for above code.

from sklearn.model_selection import cross_val_score
import numpy as np

In [139]:
cross_val_score(SVC(),digits.data,digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [156]:
cross_val_score(RandomForestClassifier(),digits.data,digits.target)


array([0.93611111, 0.91666667, 0.95821727, 0.95821727, 0.91364903])

# Parameter tunning using k fold cross validation

In [168]:
score_1 = cross_val_score(RandomForestClassifier(n_estimators=5),digits.data,digits.target)
np.average(score_1)

0.8575796966883319

In [170]:
score_2 = cross_val_score(RandomForestClassifier( n_estimators = 20),digits.data,digits.target)
np.average(score_2)

0.9232358402971215

In [172]:
score_3 = cross_val_score(RandomForestClassifier( n_estimators = 40),digits.data,digits.target)
np.average(score_3)

0.9376880222841226