# Classification of MNIST dataset digits
## Labels = 10
## Training Samples =  60,000
## Testing Samples =   10,000

In [10]:
# Modules
from PIL import Image
import dhash
import numpy as np
import scipy
from keras.datasets import mnist
import sklearn.preprocessing as preproc
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from keras.utils import to_categorical 


%matplotlib inline
np.random.seed(123)  # for reproducibility


## 2. Load MNIST data from keras

In [27]:
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [49]:
num_classes = 10
# convert class vectors to binary class matrices
Y_train_one_hot = to_categorical(Y_train, num_classes)
Y_test_one_hot = to_categorical(Y_test, num_classes)

def one_hot_to_class(categorical_matrix):
    ypred_number = np.zeros( ( categorical_matrix.shape[0], 1))
    for idx, binary_code in enumerate(categorical_matrix):
        ypred_number[idx,0] = np.argmax(binary_code)
    return ypred_number
    

## 3. Image Hashing on rows and columns  for training set

In [5]:
%%time
# size of image and output format
size = 8
format_ = '0' + str(size**2) + 'b'
X = X_train

#preallocate
X_train_hashed = np.zeros((X.shape[0], size**2 * 2));

for idx , Img in enumerate(X):
    row, col = dhash.dhash_row_col( Image.fromarray(Img) , size = size)
    hash_ = format(row, format_) + format(col, format_)
    # hash_ is string
    for colidx,num in enumerate(hash_):
        X_train_hashed[idx,colidx] = int(num)

CPU times: user 11 s, sys: 47.7 ms, total: 11 s
Wall time: 11 s


## 4. Image Hashing on rows and columns  for training set

In [6]:
%%time

X = X_test

X_test_hashed = np.zeros((X.shape[0], size**2 * 2));

for idx , Img in enumerate(X):
    row, col = dhash.dhash_row_col( Image.fromarray(Img) , size = size)
    hash_ = format(row, format_) + format(col, format_)
    
    # hash_ is string
    for col,num in enumerate(hash_):
        X_test_hashed[idx,col] = int(num)

CPU times: user 1.89 s, sys: 12.6 ms, total: 1.91 s
Wall time: 1.9 s


## 5. Classification Tree

In [50]:
%%time 
from sklearn import tree
from sklearn import metrics
# cols and rows
clasifier = tree.DecisionTreeClassifier()
clasifier = clasifier.fit(X_train_hashed, Y_train_one_hot)
y_pred = clasifier.predict(X_test_hashed)
y_pred2 = one_hot_to_class(y_pred)

CPU times: user 3.43 s, sys: 22.7 ms, total: 3.45 s
Wall time: 3.45 s


In [52]:

print("Classification Report:\n%s" % metrics.classification_report(Y_test_one_hot, y_pred))
#print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, y_pred))
#print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, y_pred))

Classification Report:
             precision    recall  f1-score   support

          0       0.91      0.92      0.91       980
          1       0.95      0.97      0.96      1135
          2       0.83      0.82      0.82      1032
          3       0.80      0.81      0.80      1010
          4       0.83      0.84      0.84       982
          5       0.81      0.79      0.80       892
          6       0.90      0.86      0.88       958
          7       0.86      0.85      0.85      1028
          8       0.76      0.76      0.76       974
          9       0.79      0.82      0.80      1009

avg / total       0.84      0.84      0.84     10000



In [54]:
# using original encoding for the output
print("Classification Report:\n%s" % metrics.classification_report(Y_test, y_pred2))
print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, y_pred2))
print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, y_pred2))

Classification Report:
             precision    recall  f1-score   support

          0       0.91      0.92      0.91       980
          1       0.95      0.97      0.96      1135
          2       0.83      0.82      0.82      1032
          3       0.80      0.81      0.80      1010
          4       0.83      0.84      0.84       982
          5       0.81      0.79      0.80       892
          6       0.90      0.86      0.88       958
          7       0.86      0.85      0.85      1028
          8       0.76      0.76      0.76       974
          9       0.79      0.82      0.80      1009

avg / total       0.84      0.84      0.84     10000

Cohen kappa Score:
0.827032057074
Hamming Loss Score:
0.1556


## 6. Random Tree

In [55]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(n_estimators=10)
RFC = RFC.fit(X_train_hashed, Y_train_one_hot)
y_pred = RFC.predict(X_test_hashed)

In [57]:
Yreal = Y_test
Ypred = one_hot_to_class(y_pred)
print("Classification Report:\n%s" % metrics.classification_report(Y_test, Ypred))
print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, Ypred))
print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, Ypred))

Classification Report:
             precision    recall  f1-score   support

          0       0.37      1.00      0.54       980
          1       1.00      0.96      0.98      1135
          2       0.98      0.80      0.88      1032
          3       0.98      0.78      0.87      1010
          4       0.99      0.76      0.86       982
          5       0.98      0.74      0.84       892
          6       0.99      0.88      0.93       958
          7       0.98      0.81      0.89      1028
          8       0.98      0.69      0.81       974
          9       0.97      0.76      0.85      1009

avg / total       0.92      0.82      0.85     10000

Cohen kappa Score:
0.799953495856
Hamming Loss Score:
0.18


In [71]:
prob = RFC.predict_proba(X_test_hashed)
prob[3].shape

(10000, 2)

In [42]:
ypred_number

array([[ 7.],
       [ 0.],
       [ 1.],
       ..., 
       [ 4.],
       [ 0.],
       [ 6.]])

In [None]:
y_pred.shape

In [None]:
param_grid = {'n_estimators':sp_randint(2, X_train_hashed.shape[1])}
RFC = RandomForestClassifier()
clf = RandomizedSearchCV(RFC, param_grid, n_jobs=-1, verbose=1, cv = 2)
clf.fit(X_train_hashed, Y_train_hot)

In [None]:
y_pred = clf.predict(X_test_hashed)
print("Classification Report:\n%s" % metrics.classification_report(Y_test, y_pred))
print("Cohen kappa Score:\n%s" % metrics.cohen_kappa_score(Y_test, y_pred))
print("Hamming Loss Score:\n%s" % metrics.hamming_loss(Y_test, y_pred))