# <span style="color:Purple">**KFold Cross Validation**</span>

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
digits=load_digits()


In [3]:
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [4]:
X=digits.data
y=digits.target

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [18]:
len(X_train), len(X_test)

(1257, 540)

## **Logistic Regression**

In [19]:
model_lr=LogisticRegression(solver='liblinear', max_iter=100, multi_class='ovr')
model_lr.fit(X_train, y_train)
model_lr.score(X_test, y_test)

0.9537037037037037

## **Support Vector Machine**

In [20]:
model_svm=SVC(C=1, kernel='poly', gamma='auto')
model_svm.fit(X_train, y_train)
model_svm.score(X_test, y_test)

0.9833333333333333

## **Decision Tree Classifier**

In [37]:
model_tree=DecisionTreeClassifier(criterion='gini', splitter='best')
model_tree.fit(X_train, y_train)
model_tree.score(X_test, y_test)

0.8351851851851851

## **Random Forest Classifier**

In [22]:
model_rf=RandomForestClassifier(n_estimators=60, criterion='gini')
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.9703703703703703

## <span style="color:Yellow">**KFold cross validation**</span>

Basic example

In [23]:
from sklearn.model_selection import KFold
kf=KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [27]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)  

# Here the first 3 elements, then next 3 are used for testing and remaining for training, in each iteration. The iterations will be based on n_splits=3

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


### **KFold functioning**
This bascially shows how KFold works, the same results can be achieved using cross val score

In [None]:
# define a function to perform the model split and training
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [None]:
#Logistic Regression using the funtion gives similar output
get_score(LogisticRegression(solver='liblinear', max_iter=100, multi_class='ovr'), X_train, X_test, y_train, y_test)

0.9537037037037037

In [32]:
# SVM
get_score(SVC(C=1, kernel='poly', gamma='auto'), X_train, X_test, y_train, y_test)

0.9833333333333333

In [38]:
# Decision Tree
get_score(DecisionTreeClassifier(criterion='gini', splitter='best'), X_train, X_test, y_train, y_test)

0.8314814814814815

In [42]:
# Random Forest Classifier
get_score(RandomForestClassifier(n_estimators=60, criterion='gini'), X_train, X_test, y_train, y_test)

0.9740740740740741

## **Using KFold for our digits dataset**

In [55]:
from sklearn.model_selection import StratifiedKFold     #StratifiedKFold has uniform splits for the folds
folds=StratifiedKFold(n_splits=3)

scores_logistic=[]      
scores_svm=[]
scores_tree=[]
scores_rf=[]

for train_index, test_index in folds.split(digits.data, digits.target):
    X_train, X_test, y_train, y_test=digits.data[train_index], digits.data[test_index], \
                                     digits.target[train_index], digits.target[test_index]
    scores_logistic.append(get_score(LogisticRegression(solver='liblinear', multi_class='ovr'), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_tree.append(get_score(DecisionTreeClassifier(criterion='gini', splitter='best'), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=90, criterion='gini'), X_train, X_test, y_train, y_test))

print('Logistic Regression Scores:', scores_logistic)
print('SVM scores:', scores_svm)
print('Decision Tree scores:', scores_tree)
print('Random Forest Scores:', scores_rf)


Logistic Regression Scores: [0.8948247078464107, 0.9532554257095158, 0.9098497495826378]
SVM scores: [0.3806343906510851, 0.41068447412353926, 0.5125208681135225]
Decision Tree scores: [0.7312186978297162, 0.8196994991652755, 0.7612687813021702]
Random Forest Scores: [0.9465776293823038, 0.9599332220367279, 0.9298831385642737]


## <span style="color:Green">**cross_val_score function**</span>

- The same output as the above function can be achieved using cross_val_score function

In [57]:
from sklearn.model_selection import cross_val_score

In [58]:
# Logistic Regression model performance using cross_val_score

cross_val_score(LogisticRegression(solver='liblinear', multi_class='ovr'), X, y, cv=3)

array([0.89482471, 0.95325543, 0.90984975])

In [61]:
# SVM model performance using SVM

cross_val_score(SVC(gamma='auto'), X, y, cv=3)

array([0.38063439, 0.41068447, 0.51252087])

In [62]:
# Decision Tree using cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='gini', splitter='best'), X, y, cv=3)

array([0.75626043, 0.83639399, 0.75792988])

In [64]:
# Random Forest using cross_val_score

cross_val_score(RandomForestClassifier(n_estimators=100, criterion='gini'), X, y, cv=3)

array([0.94323873, 0.95993322, 0.92821369])

### <span style="color:yellow">**Parameter tunning using k fold cross validation**</span>

In [70]:
scores1=cross_val_score(RandomForestClassifier(n_estimators=50), X, y, cv=10)
np.average(scores1)

0.9493513345747981

In [71]:
scores2=cross_val_score(RandomForestClassifier(n_estimators=60), X, y, cv=5)
np.average(scores2)

0.9371355617455895

In [72]:
scores3=cross_val_score(RandomForestClassifier(n_estimators=70), X, y, cv=10)
np.average(scores3)

0.9476908752327746

In [73]:
scores4=cross_val_score(RandomForestClassifier(n_estimators=100), X, y, cv=10)
np.average(scores4)

0.9487957790192427

- Here we used cross_val_score to fine tune our random forest classifier and figured that having around 50 trees in random forest gives best result.