In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
#scikit-learn ships with a few small built-in datasets.
#So, instead of sourcing and cleaning an external dataset, we use the built-in digits dataset for practice.
from sklearn.datasets import load_digits
digits = load_digits()

In [5]:
#Hold out 20% of the samples as a test set; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

x, y = digits.data, digits.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [6]:
#Next, we import the cross_val_score function to cross-validate the model on the data.
#We also import the KNeighborsClassifier class to implement nearest-neighbour classification.
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [7]:
#Create a KNeighborsClassifier object with its default settings (no arguments are passed).
#NOTE(review): `k` names the estimator object here, not the number of neighbours.
k = KNeighborsClassifier()

In [10]:
#Compute the cross-validation scores of the classifier on the training split.
#The cross_val_score() call below is given 4 arguments:
#   - k: the KNeighborsClassifier object,
#   - x_train, y_train: the training features and labels,
#   - cv=5: the number of folds used during cross-validation.
#It returns an array with one score per fold.

cross_val_score(
    k,
    x_train,
    y_train,
    cv=5,
)

array([0.99652778, 0.97916667, 0.97560976, 0.98954704, 0.97212544])

In [11]:
#As we can see, the cross-validation scores for this dataset range from about 0.9721 to 0.9965.
#This is pretty good (roughly 97% to 99.7% accuracy on each fold).

In [12]:
#Instead of an integer, the cv parameter also accepts a splitter object that controls how the data is divided into folds.
#Two splitter classes we can use for cv are 'KFold' and 'RepeatedStratifiedKFold'.
from sklearn.model_selection import KFold,RepeatedStratifiedKFold

In [13]:
#Use an explicit KFold splitter: 10 folds, shuffled before splitting,
#with a fixed random_state so the fold assignment is reproducible.
cross_val_score(
    k,
    x_train,
    y_train,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
)

array([0.97916667, 0.99305556, 0.98611111, 0.98611111, 0.97916667,
       0.98611111, 0.99305556, 0.99300699, 0.95104895, 0.98601399])

In [14]:
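#With 10 shuffled KFold splits the scores range from about 0.9510 to 0.9931 (all but one fold score 0.979 or higher).
#Note that a different splitter changes how accuracy is estimated, not the model itself.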

In [15]:
#Use a RepeatedStratifiedKFold splitter: 10 stratified folds, repeated 10 times,
#seeded with random_state=42. This yields 10 x 10 = 100 scores.
cross_val_score(
    k,
    x_train,
    y_train,
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42),
)

array([0.98611111, 0.98611111, 0.97916667, 0.98611111, 0.99305556,
       0.97222222, 1.        , 0.98601399, 0.98601399, 0.97902098,
       0.97916667, 0.97916667, 0.99305556, 0.98611111, 0.99305556,
       0.97916667, 0.99305556, 0.96503497, 1.        , 1.        ,
       0.97916667, 0.96527778, 0.99305556, 0.99305556, 0.97222222,
       1.        , 0.97916667, 1.        , 0.98601399, 0.99300699,
       0.97916667, 0.98611111, 0.99305556, 0.98611111, 0.99305556,
       0.98611111, 1.        , 0.97902098, 0.97902098, 0.97902098,
       1.        , 0.97916667, 0.99305556, 0.98611111, 0.98611111,
       0.97916667, 0.97916667, 0.98601399, 0.97202797, 0.98601399,
       0.98611111, 0.98611111, 0.97222222, 0.97916667, 1.        ,
       0.98611111, 0.97916667, 0.99300699, 0.96503497, 1.        ,
       0.99305556, 0.97916667, 0.99305556, 0.97916667, 0.99305556,
       0.97916667, 0.99305556, 0.98601399, 0.99300699, 0.97202797,
       1.        , 0.97916667, 0.99305556, 0.99305556, 1.     

In [16]:
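#As we can see, with RepeatedStratifiedKFold several individual folds score 1.0,
#which means 100% accuracy on those folds.
#The splitter does not make the model more accurate; repeating the folds gives more score samples, so compare the mean and spread of the scores.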

In [17]:
#Next, let's look at the SVM module for Support Vector Machines (another algorithm for multi-class classification).
#We'll also look at grid search using the GridSearchCV class, which tunes hyperparameters automatically using cross-validation.
