Imports

In [31]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from skimage.morphology import skeletonize
from sklearn.neighbors import KNeighborsClassifier
import os
from sklearn import svm, metrics
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from __future__ import division
from scipy.signal import convolve2d
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold,train_test_split, cross_val_score
from statistics import mean, stdev
tuning = False

Functions

In [32]:
def standardize_train(X):
    mu = np.mean(X,axis=0)
    sigma = np.std(X,axis=0)
    normalized_X = (X-mu)/sigma
    
    return normalized_X, mu, sigma

def standardize_test(X,mu,sigma):
    normalized_X = (X-mu)/sigma
    return normalized_X

In [33]:
## Reference: https://stackoverflow.com/questions/23548863/converting-a-specific-matlab-script-to-python/23575137
#Perform local Phase Quantization
def lpq(img, winSize = 3, freqestim = 1, mode = 'nh'):
    
    rho = 0.90
    STFTalpha = 1/winSize    # alpha in STFT approaches (for Gaussian derivative alpha=1)
    sigmaS = (winSize-1)/4   # Sigma for STFT Gaussian window (applied if freqestim==2)
    sigmaA = 8/(winSize-1)   # Sigma for Gaussian derivative quadrature filters (applied if freqestim==3)

    convmode = 'valid'   # Compute descriptor responses only on part that have full neigborhood. Use 'same' if all pixels are included (extrapolates np.image with zeros).

    img = np.float64(img)                # Convert np.image to double
    r = (winSize-1)/2                    # Get radius from window size
    x = np.arange(-r, r+1)[np.newaxis]   # Form spatial coordinates in window

    if freqestim == 1:  #  STFT uniform window
        # Basic STFT filters
        w0 = np.ones_like(x)
        w1 = np.exp(-2*np.pi*x*STFTalpha*1j)
        w2 = np.conj(w1)

    ## Run filters to compute the frequency response in the four points. Store np.real and np.imaginary parts separately
    # Run first filter
    filterResp1 = convolve2d(convolve2d(img,w0.T,convmode),w1,convmode)
    filterResp2 = convolve2d(convolve2d(img,w1.T,convmode),w0,convmode)
    filterResp3 = convolve2d(convolve2d(img,w1.T,convmode),w1,convmode)
    filterResp4 = convolve2d(convolve2d(img,w1.T,convmode),w2,convmode)

    # Initilize frequency domain matrix for four frequency coordinates (np.real and np.imaginary parts for each frequency).
    freqResp = np.dstack([filterResp1.real, filterResp1.imag,
                         filterResp2.real, filterResp2.imag,
                         filterResp3.real, filterResp3.imag,
                         filterResp4.real, filterResp4.imag])

    ## Perform quantization and compute LPQ codewords
    inds = np.arange(freqResp.shape[2])[np.newaxis,np.newaxis,:]
    LPQdesc = ((freqResp>0)*(2**inds)).sum(2)

    ## Switch format to uint8 if LPQ code np.image is required as output
    if mode =='im':
        LPQdesc = np.uint8(LPQdesc)

    ## Histogram if needed
    if mode == 'nh' or mode == 'h':
        LPQdesc = np.histogram(LPQdesc.flatten(),range(256))[0]

    ## Normalize histogram if needed
    if mode == 'nh':
        LPQdesc = LPQdesc/LPQdesc.sum()

    #print(LPQdesc)
    return LPQdesc

## Preprocessing

In [34]:
# 1. Perform Preprocessing
def preprocessing(image):
    ret2,th2 = cv2.threshold(image, 0, 1, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
    result = np.array(th2, dtype = 'float')
    return result

## Feature Extraction

In [35]:
# 2. Feature Extraction using Local Phase Quantization
def extractFeatures(image):
    return lpq(image, winSize = 5, mode ='nh')

## Classification

In [36]:
data = []
data_labels = []
# Read font names from file
fontFile = open("names.txt",'r')
fonts = np.loadtxt(fontFile, dtype='str')
for font in fonts:
    fontDir, fontName = font.split("___")
    print(fontDir, fontName)
    for file in os.listdir(fontDir):
        image = cv2.imread(fontDir+"/"+file,0)
        image_processed = preprocessing(image)
        data.append(image_processed)
        data_labels.append(fontName)

# Convert data to numpy array
data = np.asarray(data, dtype=np.ndarray)
data_labels = np.asarray(data_labels)

1 diwani
2 naskh
3 parsi
4 rekaa
5 thuluth
6 maghribi
7 kufi
8 mohakek
9 Squar-kufic


In [37]:
N = data.shape[0]
trainFeatures = np.zeros((N, 255))

for i in range(trainFeatures.shape[0]):
    trainFeatures[i] = extractFeatures(data[i])

standardized_train, mu, sigma = standardize_train(trainFeatures)
standardized_test = standardize_test(trainFeatures,mu,sigma)
y_train = data_labels

## Hyper-parameter Tuning

In [38]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
C_values = [2**(-5),2**(-3),2**(-1),2**1,2**3,2**5,2**7,2**9,2**10,2**11]
gamma_values = [2**(-15), 2**(-13), 2**(-11), 2**(-9), 2**(-7), 2**(-5), 2**(-3), 2**(-1), 2**(1),2**(3)]
kernels = ['rbf', 'poly', 'sigmoid']
maximum_acc = float('-inf')
maximum = []
stdv = 0
best_C = None
best_gamma = None
best_kernel = None
if tuning:
  for C_test in C_values:
    for gamma_test in gamma_values:
      for kernel_test in kernels:
        clf_accu_stratified = []
        clf_svm = svm.SVC(C=C_test, gamma=gamma_test, kernel=kernel_test)
        for train_index, test_index in skf.split(trainFeatures,y_train):
          x_train_fold, x_test_fold = standardized_train[train_index], standardized_test[test_index]
          y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
          clf_svm.fit(x_train_fold, y_train_fold)
          clf_accu_stratified.append(clf_svm.score(x_test_fold, y_test_fold))
        acc = mean(clf_accu_stratified)*100
        print('\nOverall Accuracy:', acc, '%')
        print('\nStandard Deviation is:', stdev(clf_accu_stratified))
        if acc >= maximum_acc:
          maximum_acc = acc
          best_C = C_test
          best_gamma = gamma_test
          best_kernel = kernel_test
          maximum.append((acc, best_C, best_gamma, best_kernel))
          stdv = stdev(clf_accu_stratified)


Overall Accuracy: 11.572700296735905 %

Standard Deviation is: 0.0

Overall Accuracy: 11.572700296735905 %

Standard Deviation is: 0.0

Overall Accuracy: 11.572700296735905 %

Standard Deviation is: 0.0

Overall Accuracy: 40.474777448071215 %

Standard Deviation is: 0.01320391422034836

Overall Accuracy: 11.572700296735905 %

Standard Deviation is: 0.0

Overall Accuracy: 11.572700296735905 %

Standard Deviation is: 0.0

Overall Accuracy: 41.66172106824926 %

Standard Deviation is: 0.017754629438162136

Overall Accuracy: 17.804154302670625 %

Standard Deviation is: 0.05013870993684214

Overall Accuracy: 38.21958456973294 %

Standard Deviation is: 0.011757633683180396

Overall Accuracy: 67.83382789317508 %

Standard Deviation is: 0.011415658196837587

Overall Accuracy: 25.341246290801188 %

Standard Deviation is: 0.04656957827324465

Overall Accuracy: 57.32937685459941 %

Standard Deviation is: 0.014747505803904026

Overall Accuracy: 85.4005934718101 %

Standard Deviation is: 0.02016931

In [39]:
if tuning:
  maximum.sort(reverse=True)
  print(maximum[0])
  print(maximum[1])
  print(maximum[2])
  print(maximum[3])
  print(maximum[4])

(99.46587537091989, 32, 0.001953125, 'rbf')
(99.28783382789318, 8, 0.001953125, 'rbf')
(98.57566765578635, 2, 8, 'poly')
(98.57566765578635, 2, 2, 'poly')
(98.57566765578635, 2, 0.5, 'poly')


## Combining Classifiers

In [40]:
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_svm1 = svm.SVC(C=32, gamma=0.001953125, kernel='rbf')
clf_svm2 = svm.SVC(C=8, gamma=0.001953125, kernel='rbf')
clf_svm3 = svm.SVC(C=2, gamma=8, kernel='poly')
clf_svm4 = svm.SVC(C=2, gamma=2, kernel='poly')

Majority vote using stratified cross validation k = 10

In [41]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
for train_index, test_index in skf.split(trainFeatures,y_train):
        x_train_fold, x_test_fold = standardized_train[train_index], standardized_test[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
        clf_max = VotingClassifier(estimators=[('knn', clf_knn),('svm1', clf_svm1),('svm2', clf_svm2),('svm3', clf_svm3),('svm4', clf_svm4)], voting='hard')

        clf_max.fit(x_train_fold, y_train_fold)
        y_pred_max = clf_max.predict(x_test_fold)
        print("Accuracy-Majority Vote:",metrics.accuracy_score(y_test_fold, y_pred_max)*100)
        print("F1 Score Micro-Majority Vote:",metrics.f1_score(y_test_fold, y_pred_max, average='micro')*100)
        print('--------------------------------------------------')


Accuracy-Majority Vote: 100.0
F1 Score Micro-Majority Vote: 100.0
--------------------------------------------------
Accuracy-Majority Vote: 99.40828402366864
F1 Score Micro-Majority Vote: 99.40828402366864
--------------------------------------------------
Accuracy-Majority Vote: 100.0
F1 Score Micro-Majority Vote: 100.0
--------------------------------------------------
Accuracy-Majority Vote: 99.40828402366864
F1 Score Micro-Majority Vote: 99.40828402366864
--------------------------------------------------
Accuracy-Majority Vote: 100.0
F1 Score Micro-Majority Vote: 100.0
--------------------------------------------------
Accuracy-Majority Vote: 98.21428571428571
F1 Score Micro-Majority Vote: 98.21428571428571
--------------------------------------------------
Accuracy-Majority Vote: 98.80952380952381
F1 Score Micro-Majority Vote: 98.80952380952381
--------------------------------------------------
Accuracy-Majority Vote: 100.0
F1 Score Micro-Majority Vote: 100.0
-------------------

Majority Vote using stratified cross validation k = 5

In [42]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for train_index, test_index in skf.split(trainFeatures,y_train):
        x_train_fold, x_test_fold = standardized_train[train_index], standardized_test[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
        clf_max = VotingClassifier(estimators=[('knn', clf_knn),('svm1', clf_svm1),('svm2', clf_svm2),('svm3', clf_svm3),('svm4', clf_svm4)], voting='hard')

        clf_max.fit(x_train_fold, y_train_fold)
        y_pred_max = clf_max.predict(x_test_fold)
        print("Accuracy-Majority Vote:",metrics.accuracy_score(y_test_fold, y_pred_max)*100)
        print("F1 Score Micro-Majority Vote:",metrics.f1_score(y_test_fold, y_pred_max, average='micro')*100)
        print('--------------------------------------------------')

Accuracy-Majority Vote: 100.0
F1 Score Micro-Majority Vote: 100.0
--------------------------------------------------
Accuracy-Majority Vote: 100.0
F1 Score Micro-Majority Vote: 100.0
--------------------------------------------------
Accuracy-Majority Vote: 99.10979228486647
F1 Score Micro-Majority Vote: 99.10979228486647
--------------------------------------------------
Accuracy-Majority Vote: 99.40652818991099
F1 Score Micro-Majority Vote: 99.40652818991099
--------------------------------------------------
Accuracy-Majority Vote: 98.81305637982196
F1 Score Micro-Majority Vote: 98.81305637982196
--------------------------------------------------


Majority Vote using normal cross validation k = 10

In [43]:

clf_max = VotingClassifier(estimators=[('knn', clf_knn),('svm1', clf_svm1),('svm2', clf_svm2),('svm3', clf_svm3),('svm4', clf_svm4)], voting='hard')
scores = cross_val_score(clf_max, standardized_train, y_train, cv=10, scoring='f1_macro')
print(scores) 
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.95279675 0.99399399 0.96261002 1.         1.         1.
 1.         1.         0.9755913  0.92782858]
0.98 accuracy with a standard deviation of 0.02


Majority Vote using normal cross validation k = 5

In [44]:
clf_max = VotingClassifier(estimators=[('knn', clf_knn),('svm1', clf_svm1),('svm2', clf_svm2),('svm3', clf_svm3),('svm4', clf_svm4)], voting='hard')
scores = cross_val_score(clf_max, standardized_train, y_train, cv=5, scoring='f1_macro')
print(scores) 
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.94824431 0.96991659 1.         0.99397639 0.93286553]
0.97 accuracy with a standard deviation of 0.03


## Soft Majority Vote (Sum)

In [45]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
clf_svm1 = svm.SVC(C=32, gamma=0.001953125, kernel='rbf',probability=True)
clf_svm2 = svm.SVC(C=8, gamma=0.001953125, kernel='rbf',probability=True)
clf_svm3 = svm.SVC(C=2, gamma=8, kernel='poly',probability=True)
clf_svm4 = svm.SVC(C=2, gamma=2, kernel='poly',probability=True)
for train_index, test_index in skf.split(trainFeatures,y_train):
        x_train_fold, x_test_fold = standardized_train[train_index], standardized_test[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
        clf_sum = VotingClassifier(estimators=[('knn', clf_knn),('svm1', clf_svm1),('svm2', clf_svm2),('svm3', clf_svm3),('svm4', clf_svm4)], voting='soft')
        clf_sum.fit(x_train_fold, y_train_fold)
        y_pred_max = clf_sum.predict(x_test_fold)
        print("Accuracy - Majority Vote:",metrics.accuracy_score(y_test_fold, y_pred_max)*100)
        print("F1 Score - Micro:",metrics.f1_score(y_test_fold, y_pred_max, average='micro')*100)
        print('_________________________________________')

Accuracy - Majority Vote: 100.0
F1 Score - Micro: 100.0
_________________________________________
Accuracy - Majority Vote: 99.40828402366864
F1 Score - Micro: 99.40828402366864
_________________________________________
Accuracy - Majority Vote: 100.0
F1 Score - Micro: 100.0
_________________________________________
Accuracy - Majority Vote: 100.0
F1 Score - Micro: 100.0
_________________________________________
Accuracy - Majority Vote: 100.0
F1 Score - Micro: 100.0
_________________________________________
Accuracy - Majority Vote: 98.80952380952381
F1 Score - Micro: 98.80952380952381
_________________________________________
Accuracy - Majority Vote: 98.80952380952381
F1 Score - Micro: 98.80952380952381
_________________________________________
Accuracy - Majority Vote: 100.0
F1 Score - Micro: 100.0
_________________________________________
Accuracy - Majority Vote: 99.40476190476191
F1 Score - Micro: 99.40476190476191
_________________________________________
Accuracy - Majority Vo