In [1]:
import os, sys
import matplotlib.pyplot as plt
import numpy as np
from sklearn import decomposition, manifold

% matplotlib notebook

In [2]:
def compute_distance(x,y):
    """Mean normalised nearest-row L1 distance from the rows of `x` to the rows of `y`.

    For every row of `x` the L1 (city-block) distance to every row of `y`
    is computed. The distances belonging to one row of `x` are then divided
    by their Euclidean norm, the smallest normalised distance is kept, and
    those minima are averaged over the rows of `x`.

    The measure is not symmetric: compute_distance(x, y) generally differs
    from compute_distance(y, x).

    Parameters
    ----------
    x, y : 2-D array-like with the same number of columns.
        Both are expected to contain at least one row.

    Returns
    -------
    float
        The averaged minimum normalised distance (0.0 when every row of
        `x` also appears in `y`).
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # m[i, j] holds the L1 distance between y[i] and x[j].
    m = np.empty([len(y), len(x)])
    for i in range(len(y)):
        m[i] = np.abs(x - y[i]).sum(axis=1)

    # Normalise each column (one row of x against all rows of y) by its
    # Euclidean norm. A zero norm means that row of x is identical to every
    # row of y, so its distance is defined as 0 instead of the NaN that a
    # plain 0/0 division would produce.
    norms = np.linalg.norm(m, axis=0)
    m = np.divide(m, norms, out=np.zeros_like(m), where=norms != 0)

    return np.min(m, axis=0).sum() / len(x)

In [3]:
def print_percentage(n, t):
    """Redraw a single-line text progress bar on stdout.

    Parameters
    ----------
    n : int
        Zero-based index of the item that has just been processed.
    t : int
        Total number of items.

    The line is rewritten in place using a carriage return. One is added to
    both the bar length and the percentage so that the last index (t - 1)
    shows a full bar; both are capped at 20 characters / 100%. A newline is
    written only when ``n == t``.
    """
    if t <= 0:
        # Nothing to report on; also avoids dividing by zero below.
        return

    # Floor division keeps the bar length an int on both Python 2 and 3
    # (true division would make '=' * <float> raise TypeError on Python 3).
    filled = min(n * 20 // t + 1, 20)
    percent = min(n * 100 // t + 1, 100)

    sys.stdout.write('\r')
    sys.stdout.write("[%-20s] %d%%" % ('=' * filled, percent))
    if n == t: sys.stdout.write('\n')
    sys.stdout.flush()
    

## Load files from directory
Load all of the numpy arrays from disk. The first array will be n by 4096, where n is the number of handwriting samples; each row holds the mean of that sample's patch features (one 4096-dimensional vector per sample). This array will be used to fit the PCA model, which in turn is used to reduce the dimensionality of each patch.

The second array will maintain the extracted features for each patch from each sample.

In [4]:
#feature_dir = '/fileserver/nmec-handwriting/localfeatures/first-pass/'
feature_dir = '/fileserver/nmec-handwriting/localfeatures/nmec_bw_cc_deNNiam_fiel657_min500'

# Keep only regular files, in sorted order. Filtering here (instead of
# skipping entries inside the loop) guarantees that files[i] is the file that
# produced mean_feats[i] / all_feats[i], which later cells rely on when they
# index `files` by sample position.
files = sorted(
    name for name in os.listdir(feature_dir)
    if os.path.isfile(os.path.join(feature_dir, name))
)

mean_feats = []
per_sample_feats = []
for i, filename in enumerate(files):
    path = os.path.join(feature_dir, filename)
    x = np.load(path)
    # Per-feature mean over axis 0 (the rows stored in this file).
    mean_feats.append(x.mean(axis=0))
    per_sample_feats.append(x)
    print_percentage(i, len(files))
sys.stdout.write('\n')
sys.stdout.flush()

mean_feats = np.array(mean_feats)

# The per-file arrays may have different row counts, so store them in a 1-D
# object array filled element by element; np.array() on a ragged list is an
# error on recent NumPy releases.
all_feats = np.empty(len(per_sample_feats), dtype=object)
for sample_idx, sample in enumerate(per_sample_feats):
    all_feats[sample_idx] = sample

print(mean_feats.shape)
print(all_feats[0].shape)

(708, 4096)
(76, 4096)


Create a PCA model based on feature means for all samples

In [5]:
# Fit a 128-component PCA on the per-sample mean feature vectors and keep
# the projected training data alongside the fitted model.
train = mean_feats
pca = decomposition.PCA(n_components=128)
train_reduced = pca.fit_transform(train)

Iterate over each sample, reducing dimensionality to 128 for all patches

In [6]:
# Project each sample's feature matrix with the fitted PCA. The results are
# stored in a 1-D object array filled element by element, because samples may
# have different numbers of rows and np.array() on a ragged list of matrices
# is an error on recent NumPy releases.
all_reduced = np.empty(len(all_feats), dtype=object)
for sample_idx, sample in enumerate(all_feats):
    all_reduced[sample_idx] = pca.transform(sample)

Calculate distance between samples

In [7]:
# Pairwise distance matrix: metric[i, j] = compute_distance(sample i, sample j).
# compute_distance is not symmetric, so the full matrix is evaluated.
n_samples = len(all_reduced)
metric = np.empty((n_samples, n_samples))
for i, image in enumerate(all_reduced):
    metric[i] = [compute_distance(image, other) for other in all_reduced]
    print_percentage(i, n_samples)
# Terminate the in-place progress line so later output starts on a new line.
sys.stdout.write('\n')
sys.stdout.flush()

# Similarity scores: larger means closer. The diagonal (a sample compared with
# itself) is set to -inf so that it always sorts last. -np.inf replaces
# -sys.maxint, which only exists on Python 2.
F = -metric
np.fill_diagonal(F, -np.inf)



In [8]:
k = 10        # number of best-scoring neighbours checked by the soft criterion
g = 8         # NOTE(review): not used in this cell - confirm whether it is still needed
max_top = 3   # number of best-scoring neighbours checked by the hard criterion

# Row q of F must describe files[q]; otherwise the filename comparisons below
# silently compare the wrong samples.
assert len(files) == len(F), \
    "files (%d) and F (%d) are misaligned" % (len(files), len(F))

soft_correct = 0
hard_correct = 0
total_num = 0

for query_idx, scores in enumerate(F):
    total_num += 1
    query_name = files[query_idx]

    # argsort is ascending, so the last k indices are the k highest scores.
    topk = scores.argsort()[-k:]

    # Soft criterion: any of the top k shares the first six filename
    # characters with the query.
    if query_name[:6] in (files[index][:6] for index in topk):
        soft_correct += 1

    # Hard criterion: the top max_top all carry the same filename[3:6] value
    # and it equals the query's.
    hardsample = [files[index][3:6] for index in topk[-max_top:]]
    if len(set(hardsample)) == 1 and hardsample[0] == query_name[3:6]:
        print("%s matched %s" % (query_name[3:10], hardsample))
        hard_correct += 1

print("%-30s" % ( "-" * 37 ))
print("SOFT CRITERIA: Top %d\t= %f" % (k, float(soft_correct) / total_num))
print("HARD CRITERIA: Top %d\t= %f" % (max_top, float(hard_correct) / total_num))

056-002 matched ['056', '056', '056']
-------------------------------------
SOFT CRITERIA: Top 10	= 0.186441
HARD CRITERIA: Top 3	= 0.001412
