# Classification of MNIST dataset digits
## Labels = 10
## Training Samples =  60,000
## Testing Samples =   10,000

In [1]:
# Modules
from PIL import Image
import dhash
import numpy as np
import scipy
from keras.datasets import mnist
import sklearn.preprocessing as preproc
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline
np.random.seed(123)  # for reproducibility


Using Theano backend.


## 2. Load MNIST data from keras

In [2]:
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [3]:
X_train.shape

(60000, 28, 28)

In [4]:
# Label encoder for future use
le = preproc.LabelEncoder()
labels = [0,1,2,3,4,5,6,7,8,9,'a','b','c','d','e','f']
le.fit(labels)

LabelEncoder()

## 3. Image Hashing on rows and columns  for training set

In [5]:
%%time
# size of image and output format
size = 8
format_ = '0' + str(size**2) + 'b'
X = X_train

#preallocate
X_train_hashed = np.zeros((X.shape[0], size**2 * 2));

for idx , Img in enumerate(X):
    row, col = dhash.dhash_row_col( Image.fromarray(Img) , size = size)
    hash_ = format(row, format_) + format(col, format_)
    # hash_ is string
    for colidx,num in enumerate(hash_):
        X_train_hashed[idx,colidx] = int(num)

CPU times: user 11.6 s, sys: 70.8 ms, total: 11.6 s
Wall time: 11.6 s


## 4. Image Hashing on rows and columns  for training set

In [6]:
%%time

X = X_test

X_test_hashed = np.zeros((X.shape[0], size**2 * 2));

for idx , Img in enumerate(X):
    row, col = dhash.dhash_row_col( Image.fromarray(Img) , size = size)
    hash_ = format(row, format_) + format(col, format_)
    
    # hash_ is string
    for col,num in enumerate(hash_):
        X_test_hashed[idx,col] = int(num)

CPU times: user 1.86 s, sys: 10.6 ms, total: 1.87 s
Wall time: 1.87 s


## 5. Classification Tree

In [7]:
%%time 
from sklearn import tree
from sklearn import metrics
# cols and rows
clasifier = tree.DecisionTreeClassifier()
clasifier = clasifier.fit(X_train_hashed, Y_train)
y_pred = clasifier.predict(X_test_hashed)


CPU times: user 1.87 s, sys: 32.5 ms, total: 1.9 s
Wall time: 1.91 s


In [8]:
print("Classification Report:\n%s" % metrics.classification_report(Y_test, y_pred))
print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, y_pred))
print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, y_pred))

Classification Report:
             precision    recall  f1-score   support

          0       0.92      0.91      0.92       980
          1       0.95      0.96      0.95      1135
          2       0.84      0.81      0.83      1032
          3       0.79      0.80      0.80      1010
          4       0.83      0.84      0.83       982
          5       0.79      0.79      0.79       892
          6       0.89      0.87      0.88       958
          7       0.85      0.85      0.85      1028
          8       0.76      0.75      0.76       974
          9       0.80      0.82      0.81      1009

avg / total       0.84      0.84      0.84     10000

Cohen kappa Score:
0.825149360018
Hamming Loss Score:
0.1573


## 6. Random Tree

In [9]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(n_estimators=10)
RFC = RFC.fit(X_train_hashed, Y_train)
y_pred = RFC.predict(X_test_hashed)

In [10]:
Yreal = Y_test
print("Classification Report:\n%s" % metrics.classification_report(Yreal, y_pred))

Classification Report:
             precision    recall  f1-score   support

          0       0.92      0.98      0.95       980
          1       0.97      0.98      0.98      1135
          2       0.91      0.93      0.92      1032
          3       0.88      0.91      0.89      1010
          4       0.92      0.91      0.91       982
          5       0.91      0.88      0.89       892
          6       0.95      0.95      0.95       958
          7       0.94      0.92      0.93      1028
          8       0.90      0.86      0.88       974
          9       0.92      0.88      0.90      1009

avg / total       0.92      0.92      0.92     10000



In [16]:
param_grid = {'n_estimators':sp_randint(2, 10)}
RFC = RandomForestClassifier()
clf = RandomizedSearchCV(RFC, param_grid, n_jobs=-1, verbose=1, cv = 3, n_iter = 100)
clf.fit(X_train_hashed, Y_train);

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   58.6s finished


In [17]:
y_pred = clf.predict(X_test_hashed)
print("Classification Report:\n%s" % metrics.classification_report(Y_test, y_pred))
print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, y_pred))
print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, y_pred))

Classification Report:
             precision    recall  f1-score   support

          0       0.91      0.97      0.94       980
          1       0.97      0.98      0.98      1135
          2       0.90      0.93      0.91      1032
          3       0.88      0.92      0.90      1010
          4       0.91      0.91      0.91       982
          5       0.90      0.89      0.89       892
          6       0.95      0.94      0.94       958
          7       0.94      0.91      0.92      1028
          8       0.92      0.84      0.88       974
          9       0.92      0.89      0.90      1009

avg / total       0.92      0.92      0.92     10000

Cohen kappa Score:
0.910293193524
Hamming Loss Score:
0.0807
