In [None]:
# The normal approach to calculate the accuracy score for different models on the same dataset is this -
# splitting the data using train_test_split and then use training data to train and then testing the model out on the testing data.
# example ----------------->

In [3]:
import pandas as pd
import sklearn
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digit_data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digit_data.data, digit_data.target, test_size=0.2)

# logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(" logistic regression accuracy : ", lr.score(X_test, y_test))

# SVM
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
print(" \nSVM accuracy : ", svc.score(X_test, y_test))

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print(" \nRandom Forest accuracy : ", rf.score(X_test, y_test))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 logistic regression accuracy :  0.9666666666666667
 
SVM accuracy :  0.9805555555555555
 
Random Forest accuracy :  0.9722222222222222


<h2>The problem with this is that, we get different accuracy for each model, everytime we run the code as the training and testing data changes, so then the model's accuracy also changes.<br>
So the training and testing data is not uniform.<br>
To overcome this, lets use K-Fold Cross Validation<h2>

In [5]:
# demo example for k fold on array

from sklearn.model_selection import KFold
kf = KFold(n_splits=4)

for train_index, test_index in kf.split([1,2,3,4,5,6,7,8]):
    print(train_index, test_index)

[2 3 4 5 6 7] [0 1]
[0 1 4 5 6 7] [2 3]
[0 1 2 3 6 7] [4 5]
[0 1 2 3 4 5] [6 7]


this K-Fold has splitted the data into 4 splits (as instructed) into training and testing data,
So this is better technique to use in place of train-test-split
Lets try this on our digits dataset

In [6]:
from sklearn.model_selection import StratifiedKFold
# stratified kfold splits the data more uniformly

fold = StratifiedKFold(n_splits=10)

In [14]:
# now splitting the data
# the thing to notice is that, after 10 splits, we would get 10 result from each model, so we have to get the average acuuracy, so we would just take
#  the mean of the 10 split results

# firstly create a function to get the score of the model
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


# lets create empty array to store the indivisual model's result
LR_score = []
SVM_score = []
RF_score = []

for train_index, test_index in fold.split(digit_data.data, digit_data.target):
    X_train, X_test, y_train, y_test = digit_data.data[train_index], digit_data.data[test_index], digit_data.target[train_index], digit_data.target[test_index]

    LR_score.append(get_score(lr, X_train, X_test, y_train, y_test)) # logistic regression
    SVM_score.append(get_score(svc, X_train, X_test, y_train, y_test)) # SVM
    RF_score.append(get_score(rf, X_train, X_test, y_train, y_test)) # random forests

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
import numpy as np

print("logistic regression accuracy mean : ", np.array(LR_score).mean())
print("SVM accuracy mean : ", np.array(SVM_score).mean())
print("Random Forest accuracy mean : ", np.array(RF_score).mean())


logistic regression accuracy mean :  0.928193668528864
SVM accuracy mean :  0.9699503414028554
Random Forest accuracy mean :  0.9526877715704531


In [25]:
# there is an easier way to all of this
# using the cross val score in sklearn

from sklearn.model_selection import cross_val_score

lr_score = cross_val_score(LogisticRegression(), digit_data.data, digit_data.target, cv=8 ) # cv is the number of folds
svm_score = cross_val_score(SVC(), digit_data.data, digit_data.target, cv=8 ) 
rf_score = cross_val_score(RandomForestClassifier(n_estimators= 10), digit_data.data, digit_data.target, cv=8 ) 

# printing the scores
print("logistic regression accuracy mean : ", lr_score.mean())
print("SVM accuracy mean : ", svm_score.mean())
print("Random Forest accuracy mean : ", rf_score.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic regression accuracy mean :  0.92875
SVM accuracy mean :  0.9682688492063491
Random Forest accuracy mean :  0.9154092261904762
