# Master Colloquium - Character Recognition

In [1]:
# One-time setup: uncomment and run to install the emnist package and populate its data cache.
#!pip install emnist
#!python -c "import emnist; emnist.ensure_cached_data()"

In [2]:
from emnist import extract_training_samples, extract_test_samples, list_datasets
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import pickle as pk

## Data Sets

In [3]:
# Load the EMNIST splits used below; every call is unpacked into images and labels.
# 'digits' split: training pair first, then test pair
(dig_im_train, dig_y_train), (dig_im_test, dig_y_test) = (
    extract_training_samples('digits'),
    extract_test_samples('digits'),
)
# 'letters' split: training pair first, then test pair
(let_im_train, let_y_train), (let_im_test, let_y_test) = (
    extract_training_samples('letters'),
    extract_test_samples('letters'),
)

Reshape each image into a flat feature vector, giving one row per sample.

In [4]:
def flatten_images(images):
    """Flatten a stack of images into one feature vector per image.

    Parameters
    ----------
    images : numpy.ndarray
        Array whose first axis indexes the samples. All remaining axes are
        collapsed into a single feature axis.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_features)``.
    """
    # Multiply out every per-sample axis rather than squaring shape[1], so the
    # feature count is also right for non-square images. The product is
    # computed explicitly (instead of passing -1) so an empty stack still works.
    n_features = int(np.prod(images.shape[1:]))
    return images.reshape(images.shape[0], n_features)


# digit vectors
dig_X_train = flatten_images(dig_im_train)
dig_X_test = flatten_images(dig_im_test)
print(dig_X_train.shape, dig_X_test.shape)
# letter vectors
let_X_train = flatten_images(let_im_train)
let_X_test = flatten_images(let_im_test)
print(let_X_train.shape, let_X_test.shape)

(240000, 784) (40000, 784)
(124800, 784) (20800, 784)


## Classifiers

In [5]:
# Multi-class MLP classifier for the digits, fitted on the full digit training set.
# The seed is fixed so repeated runs start from the same random state.
# NOTE(review): no feature-scaling step is applied to dig_X_train before fitting;
# scikit-learn recommends scaling inputs for MLPs - confirm this is intended.
dig_MLP_clf = MLPClassifier(
    max_iter=300,
    random_state=1,
).fit(dig_X_train, dig_y_train)

In [None]:
# k-nearest-neighbours classifier with its neighbour count tuned by grid search
pipeline = Pipeline(steps=[('clf', KNeighborsClassifier())])
# Candidate neighbour counts: 1 through 9 (the 'clf__' prefix targets the pipeline step)
param_grid = {'clf__n_neighbors': range(1, 10)}
# 2-fold cross-validated search, fitted on the first 10000 training samples only
dig_KNN_clf = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=2,
    verbose=3,
).fit(dig_X_train[:10000], dig_y_train[:10000])

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV] clf__n_neighbors=1 ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................. clf__n_neighbors=1, score=0.933, total=  50.1s
[CV] clf__n_neighbors=1 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   50.0s remaining:    0.0s


[CV] .................. clf__n_neighbors=1, score=0.940, total=  47.6s
[CV] clf__n_neighbors=2 ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s


[CV] .................. clf__n_neighbors=2, score=0.918, total=  47.6s
[CV] clf__n_neighbors=2 ..............................................
[CV] .................. clf__n_neighbors=2, score=0.926, total=  46.7s
[CV] clf__n_neighbors=3 ..............................................
[CV] .................. clf__n_neighbors=3, score=0.932, total=  46.5s
[CV] clf__n_neighbors=3 ..............................................
[CV] .................. clf__n_neighbors=3, score=0.940, total=  44.1s
[CV] clf__n_neighbors=4 ..............................................
[CV] .................. clf__n_neighbors=4, score=0.926, total=  47.7s
[CV] clf__n_neighbors=4 ..............................................
[CV] .................. clf__n_neighbors=4, score=0.934, total=  45.8s
[CV] clf__n_neighbors=5 ..............................................
[CV] .................. clf__n_neighbors=5, score=0.928, total=  49.1s
[CV] clf__n_neighbors=5 ..............................................
[CV] .

In [None]:
# Show the best pipeline found by the grid search
best_knn_pipeline = dig_KNN_clf.best_estimator_
print(best_knn_pipeline)

Classification Report (MLP) - **Digits**

In [None]:
# Predict every digit test sample with the MLP and print the resulting report
dig_y_pred = dig_MLP_clf.predict(dig_X_test)
print(classification_report(y_true=dig_y_test, y_pred=dig_y_pred))

Classification Report (KNN) - **Digits** (first 500 test samples only)

In [None]:
# Score the tuned KNN on the first 500 digit test samples only.
# This overwrites dig_y_pred, which previously held the MLP predictions.
dig_y_pred = dig_KNN_clf.predict(dig_X_test[:500])
print(classification_report(y_true=dig_y_test[:500], y_pred=dig_y_pred))

Store classifiers

In [None]:
# Serialize both fitted classifiers to pickle files, one file per model, MLP first.
# Only unpickle these files when they come from a trusted source.
for pickle_path, fitted_clf in (
    ('dig_MLP_clf.pickle', dig_MLP_clf),
    ('dig_KNN_clf.pickle', dig_KNN_clf),
):
    with open(pickle_path, 'wb') as f:
        pk.dump(fitted_clf, f)