# Classification of MNIST dataset digits
## Labels = 10
## Training Samples =  60,000
## Testing Samples =   10,000

In [1]:
# Modules
from PIL import Image
import dhash
import numpy as np
import scipy
from keras.datasets import mnist
import sklearn.preprocessing as preproc
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline
np.random.seed(123)  # for reproducibility


Using Theano backend.


## 2. Load MNIST data from keras

In [2]:
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [3]:
X_train.shape

(60000, 28, 28)

In [4]:
# Label encoder for future use
le = preproc.LabelEncoder()
labels = [0,1,2,3,4,5,6,7,8,9,'a','b','c','d','e','f']
le.fit(labels)

LabelEncoder()

## 3. Image Hashing on rows and columns  for training set

In [5]:
%%time
# size of image and output format
size = 8
format_ = '0' + str(size**2) + 'b'
X = X_train

#preallocate
X_train_hashed = np.zeros((X.shape[0], size**2 * 2));

for idx , Img in enumerate(X):
    row, col = dhash.dhash_row_col( Image.fromarray(Img) , size = size)
    hash_ = format(row, format_) + format(col, format_)
    # hash_ is string
    for colidx,num in enumerate(hash_):
        X_train_hashed[idx,colidx] = int(num)

CPU times: user 14.6 s, sys: 135 ms, total: 14.7 s
Wall time: 16.3 s


## 4. Image Hashing on rows and columns  for training set

In [6]:
%%time

X = X_test

X_test_hashed = np.zeros((X.shape[0], size**2 * 2));

for idx , Img in enumerate(X):
    row, col = dhash.dhash_row_col( Image.fromarray(Img) , size = size)
    hash_ = format(row, format_) + format(col, format_)
    
    # hash_ is string
    for col,num in enumerate(hash_):
        X_test_hashed[idx,col] = int(num)

CPU times: user 2.02 s, sys: 15 ms, total: 2.03 s
Wall time: 2.03 s


## 5. Classification Tree

In [7]:
%%time 
from sklearn import tree
from sklearn import metrics
# cols and rows
clasifier = tree.DecisionTreeClassifier()
clasifier = clasifier.fit(X_train_hashed, Y_train)
y_pred = clasifier.predict(X_test_hashed)


CPU times: user 1.94 s, sys: 35.5 ms, total: 1.97 s
Wall time: 1.99 s


In [8]:
print("Classification Report:\n%s" % metrics.classification_report(Y_test, y_pred))
print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, y_pred))
print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, y_pred))

Classification Report:
             precision    recall  f1-score   support

          0       0.92      0.91      0.92       980
          1       0.95      0.96      0.95      1135
          2       0.84      0.81      0.83      1032
          3       0.79      0.80      0.80      1010
          4       0.83      0.84      0.83       982
          5       0.79      0.79      0.79       892
          6       0.89      0.87      0.88       958
          7       0.85      0.85      0.85      1028
          8       0.76      0.75      0.76       974
          9       0.80      0.82      0.81      1009

avg / total       0.84      0.84      0.84     10000

Cohen kappa Score:
0.825149360018
Hamming Loss Score:
0.1573


In [9]:
from keras.utils import to_categorical 
num_classes = 10
# convert class vectors to binary class matrices
Y_train_hot = to_categorical(Y_train, num_classes)
Y_test_hot = to_categorical(Y_test, num_classes)

## 6. Random Tree

In [10]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(n_estimators=10)
RFC = RFC.fit(X_train_hashed, Y_train_hot)
y_pred = RFC.predict(X_test_hashed)

In [11]:
Yreal = Y_test_hot
print("Classification Report:\n%s" % metrics.classification_report(Yreal, y_pred))

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.92      0.96       980
          1       0.99      0.95      0.97      1135
          2       0.98      0.79      0.88      1032
          3       0.99      0.78      0.87      1010
          4       0.99      0.77      0.86       982
          5       0.98      0.74      0.84       892
          6       0.99      0.87      0.93       958
          7       0.98      0.83      0.90      1028
          8       0.98      0.67      0.80       974
          9       0.97      0.77      0.86      1009

avg / total       0.98      0.81      0.89     10000



In [14]:
param_grid = {'n_estimators':sp_randint(2, 10)}
RFC = RandomForestClassifier()
clf = RandomizedSearchCV(RFC, param_grid, n_jobs=-1, verbose=1, cv = 3)
clf.fit(X_train_hashed, Y_train_hot)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   15.5s finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10899a400>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [15]:
y_pred = clf.predict(X_test_hashed)
print("Classification Report:\n%s" % metrics.classification_report(Y_test, y_pred))
print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, y_pred))
print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, y_pred))

ValueError: Mix type of y not allowed, got types {'multiclass', 'multilabel-indicator'}