In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits

In [2]:
# Load the digits dataset that ships with scikit-learn, then list the
# attributes available on the returned object.
digits = load_digits()
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [3]:
# Feature matrix and label vector taken from the loaded digits object.
X = digits['data']
y = digits['target']
# Hold out 30% of the samples for testing. random_state is fixed so that the
# split -- and therefore every score computed from it below -- is the same on
# every Restart & Run All instead of changing with each execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [4]:
# Train logistic regression on the training split, then show its score on the
# held-out test split.
# NOTE(review): max_iter=5000 is presumably there to let the solver converge
# on this data -- confirm.
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9629629629629629

In [5]:
# Train a support vector classifier with default settings on the training
# split, then show its score on the held-out test split.
svm = SVC()
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.987037037037037

In [6]:
# Train a 40-tree random forest on the training split and show its score on
# the held-out test split. random_state is fixed because forest construction
# is randomized; without it the score differs from run to run.
rf = RandomForestClassifier(n_estimators=40, random_state=42).fit(X_train, y_train)
rf.score(X_test, y_test)

0.9777777777777777

In [7]:
#-------K-Fold Cross Validation----------#

In [8]:
from sklearn.model_selection import KFold
# Build a 3-fold splitter; the bare name on the last line displays its repr.
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [9]:
# Show what the splitter yields on nine dummy samples: each iteration prints
# the training indices followed by the test indices. There is one line per
# fold, i.e. three lines because kf was built with n_splits=3.
for train_index, test_index in kf.split(list(range(1, 10))):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [10]:
def get_score(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place by this call.
    X_train, y_train : training features and labels, passed to ``model.fit``.
    X_test, y_test : evaluation features and labels, passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [11]:
# Fit a fresh logistic regression on the current train/test split through the
# get_score helper and show its test score.
get_score(LogisticRegression(max_iter=5000),X_train,X_test,y_train,y_test)

0.9629629629629629

In [12]:
from sklearn.model_selection import StratifiedKFold
# Stratified splitter with 3 folds for this walkthrough; the author's note
# suggests 10 folds as good practice.
folds = StratifiedKFold(n_splits=3)
folds

StratifiedKFold(n_splits=3, random_state=None, shuffle=False)

In [13]:
# Per-fold test scores for each model, appended in fold order.
score_log = []
score_svc = []
score_rf = []
# Manual stratified cross-validation over the full digits data.
# Fold-local names are used for each split so that the hold-out split created
# earlier (X_train / X_test / y_train / y_test) is NOT overwritten; otherwise
# re-running any earlier cell after this loop would silently use the last fold.
for train_index, test_index in folds.split(digits['data'], digits['target']):
    X_fold_train = digits['data'][train_index]
    X_fold_test = digits['data'][test_index]
    y_fold_train = digits['target'][train_index]
    y_fold_test = digits['target'][test_index]
    score_log.append(
        get_score(LogisticRegression(max_iter=5000),
                  X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    score_svc.append(
        get_score(SVC(), X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    # random_state fixed so the forest's per-fold scores are reproducible.
    score_rf.append(
        get_score(RandomForestClassifier(n_estimators=40, random_state=42),
                  X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )

In [14]:
# Per-fold logistic regression scores collected by the manual CV loop.
score_log

[0.9248747913188647, 0.9398998330550918, 0.9232053422370617]

In [15]:
# Per-fold SVC scores collected by the manual CV loop.
score_svc

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [16]:
# Per-fold random forest scores collected by the manual CV loop.
score_rf

[0.9282136894824707, 0.9432387312186978, 0.9282136894824707]

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [33]:
# cross_val_score automates the manual fold loop written earlier. No cv
# argument is passed, so scikit-learn's default of 5 folds applies -- hence
# five scores here rather than the three produced by the 3-fold loop.
cross_val_score(LogisticRegression(max_iter=5000),digits['data'],digits['target'])

array([0.925     , 0.87777778, 0.93871866, 0.93593315, 0.89693593])

In [34]:
# Exercise: use the iris flower dataset from the sklearn library and run cross_val_score against each of the following models to measure its performance. At the end, determine which model performs best:
# 
# Logistic Regression
# SVM
# Decision Tree
# Random Forest

In [35]:
from sklearn.datasets import load_iris
# Load the iris dataset that ships with scikit-learn, then list the
# attributes available on the returned object.
iris = load_iris()
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [36]:
# Cross-validated scores for logistic regression on iris (default cv).
# Note: this rebinds score_log, which previously held the digits fold scores.
score_log = cross_val_score(LogisticRegression(max_iter=2000),iris['data'],iris['target'])
score_log

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [37]:
# Two equivalent ways to take the arithmetic mean of the fold scores.
# Only the value of the last expression is displayed by the notebook; the
# result of the first line is computed and discarded.
score_log.mean()
np.average(score_log)
# Both expressions yield the same number.

0.9733333333333334

In [38]:
# Cross-validated scores for a default SVC on iris (default cv).
score_SVM = cross_val_score(SVC(),iris['data'],iris['target'])
score_SVM

array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])

In [42]:
# Cross-validated scores for a decision tree on iris (default cv).
# random_state is fixed because tree construction is randomized; without it
# the fold scores can change between runs, which makes the model comparison
# below unstable.
score_tree = cross_val_score(DecisionTreeClassifier(random_state=42), iris['data'], iris['target'])
score_tree

array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])

In [46]:
# Cross-validated scores for a 40-tree random forest on iris (default cv).
# random_state is fixed because forest construction is randomized; without it
# the fold scores change between runs, which makes the model comparison below
# unstable.
# Note: this rebinds score_rf, which previously held the digits fold scores.
score_rf = cross_val_score(
    RandomForestClassifier(n_estimators=40, random_state=42),
    iris['data'],
    iris['target'],
)
score_rf

array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [47]:
# Print the mean cross-validated iris score of each model, one line per model,
# so they can be compared side by side.
for model_label, fold_scores in (
    ("Logistic", score_log),
    ("SVC", score_SVM),
    ("Tree", score_tree),
    ("RandomForest", score_rf),
):
    print(f"Score of {model_label} {fold_scores.mean()}")

Score of Logistic 0.9733333333333334
Score of SVC 0.9666666666666666
Score of Tree 0.9533333333333334
Score of RandomForest 0.96
