In this notebook I've tested and compared some classification algorithms in task of handwritten digits classifying.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

Import of ML algorithms

In [2]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
from sklearn import model_selection

In [4]:
df = pd.read_csv("Data\\train.csv")

In [10]:
print(f"Shape: {df.shape}")
df.head(10)

Shape: (42000, 785)


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
seed = 0
X = df.iloc[:, 1:].copy().values
y = df['label'].copy().values

X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.2, random_state=seed)


### Models comparison

In [None]:
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
svc.score(X_val, y_val)

0.9747619047619047

In [None]:
models = [
    SVC(),
    KNeighborsClassifier(),
    GaussianNB(),
    DecisionTreeClassifier()]

results = []
names = ['SVC', 'KNN', 'NB', 'TREE']
scoring = 'accuracy'
for i, model in enumerate(models):
	cv_results = model_selection.cross_val_score(model, X_train, y_train, scoring=scoring)
	results.append(cv_results)
	msg = f"{names[i]}: Mean = {cv_results.mean()}. STD Dev = {cv_results.std()}"
	print(msg)


SVC: Mean = 0.9732440476190476. STD Dev = 0.001992936790807538
KNN: Mean = 0.9628869047619049. STD Dev = 0.003036006055072995
NB: Mean = 0.5647321428571429. STD Dev = 0.009514038886878858
TREE: Mean = 0.8467261904761905. STD Dev = 0.004935453557076489


SVC and KNN alghorithms seem ok for further investigating

### SVC hyperparameters tuning

In [22]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = model_selection.GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train[:5000], y_train[:5000])


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   5.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   6.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   5.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   5.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   6.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.6s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.6s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.6s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.6s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.6s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   3.5s
[CV] END .....................C=0.1, gamma=1, k

In [23]:
print(grid.best_score_)
print(grid.best_params_)

0.9352
{'C': 0.1, 'gamma': 1, 'kernel': 'poly'}


In [24]:
svc = SVC()
svc.set_params(**grid.best_params_)

svc.fit(X_train, y_train)

train_score = svc.score(X_train, y_train)
val_score = svc.score(X_val, y_val)

print(f"SVC model. Train score: {train_score}. Validation score = {val_score}")

SVC model. Train score: 1.0. Validation score = 0.974047619047619


In [28]:
from sklearn.metrics import confusion_matrix
y_pred = svc.predict(X_val)
val_score = svc.score(X_val, y_val)
confusion_matrix(y_val, y_pred)

array([[797,   0,   3,   0,   1,   2,   5,   0,   5,   0],
       [  0, 954,   3,   2,   0,   0,   0,   1,   1,   0],
       [  5,   1, 841,   5,   1,   1,   0,   3,   3,   0],
       [  0,   0,  11, 826,   0,  11,   0,   4,   9,   2],
       [  2,   3,   1,   0, 806,   1,   4,   2,   0,   8],
       [  1,   0,   2,   5,   0, 732,   8,   0,   7,   1],
       [  1,   0,   0,   0,   2,   3, 834,   0,   1,   0],
       [  0,   4,   5,   1,   1,   1,   0, 880,   2,   5],
       [  2,   5,   2,   9,   2,   3,   3,   1, 739,   2],
       [  6,   5,   1,   3,   7,   2,   0,  14,   1, 773]], dtype=int64)

### KNN

In [29]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15], 'weights' : ['uniform','distance'], 'metric' : ['minkowski','euclidean','manhattan']}
grid = model_selection.GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 2, refit=True)
grid.fit(X_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   3.7s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   3.6s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   3.5s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   3.6s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   3.7s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   3.5s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   3.5s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   3.4s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   3.4s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   3.5s
[CV] END ...metric=minkowski, n_neighbors=7, weights=uniform; total time=   3.5s
[CV] END ...metric=minkowski, n_neighbors=7, we

In [30]:
print(grid.best_score_)
print(grid.best_params_)

0.9644047619047619
{'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'}


In [31]:
knn = KNeighborsClassifier()
knn.set_params(**grid.best_params_)

knn.fit(X_train, y_train)

train_score = knn.score(X_train, y_train)
val_score = knn.score(X_val, y_val)

print(f"KNN model. Train score: {train_score}. Validation score = {val_score}")

KNN model. Train score: 1.0. Validation score = 0.9703571428571428


In [32]:
from sklearn.metrics import confusion_matrix
y_pred = knn.predict(X_val)
val_score = knn.score(X_val, y_val)
confusion_matrix(y_val, y_pred)

array([[804,   0,   2,   0,   0,   2,   4,   0,   1,   0],
       [  0, 956,   2,   1,   0,   0,   0,   1,   1,   0],
       [  9,   5, 829,   3,   0,   0,   1,  10,   3,   0],
       [  1,   0,   3, 836,   0,   6,   1,   6,   5,   5],
       [  1,   7,   0,   0, 794,   0,   4,   1,   0,  20],
       [  2,   0,   0,   8,   0, 736,   7,   0,   1,   2],
       [  0,   1,   0,   0,   1,   2, 837,   0,   0,   0],
       [  0,   9,   4,   0,   1,   0,   0, 878,   0,   7],
       [  2,   7,   0,  10,   3,  17,   6,   2, 712,   9],
       [  4,   2,   1,   3,   9,   4,   2,  18,   0, 769]], dtype=int64)

### Sources
https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

https://towardsdatascience.com/the-best-machine-learning-algorithm-for-handwritten-digits-recognition-2c6089ad8f09

https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/