In [1]:
import h5py
import numpy as np
import os
import glob


In [2]:
%matplotlib inline

In [3]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tarfile

# Helper Functions

In [120]:
def normalize_rows(data):
    '''
    Scale every row of a 2-D array to unit Euclidean (L2) length.

    data: 2-D array-like, one feature vector per row.

    Returns a new floating-point array with the same shape as `data`; the
    input array is not modified. Integer input is promoted to float so the
    scaled values are not truncated. Rows whose norm is zero are returned
    unchanged (all zeros) instead of being turned into NaN by a division
    by zero.
    '''
    data = np.asarray(data)
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(float)
    # keepdims=True gives shape (num_rows, 1) so the division broadcasts per row
    norms = np.linalg.norm(data, axis=1, keepdims=True)
    # Dividing a zero row by 1 leaves it as zeros
    norms[norms == 0] = 1
    return data / norms


# Pull content from tarfiles
plt.rcParams['figure.figsize'] = (10.0, 5.0)
def plot_image(fname, plotnum=None, tarfile_dir='/tar'):
    '''
    Read one image out of a tar archive and draw it on the current figure.

    fname: name of the member inside the archive. Its first three characters
        select the archive, i.e. <tarfile_dir>/<fname[:3]>.tar.
    plotnum: optional (rows, cols, index) triple passed to plt.subplot to pick
        the subplot to draw into; when falsy the current axes are used.
    tarfile_dir: directory that holds the .tar archives.

    Raises ValueError if the member exists but is not a regular file or link
    (tarfile's extractfile returns None in that case).
    '''
    if plotnum:
        plt.subplot(plotnum[0], plotnum[1], plotnum[2])
    prefix = fname[:3]
    tarfile_full_path = os.path.join(tarfile_dir, prefix + '.tar')
    # Use a context manager so the archive handle is closed even if reading
    # fails; the image must be decoded before the archive is closed.
    with tarfile.open(tarfile_full_path, 'r') as tfile:
        img_file = tfile.extractfile(fname)
        if img_file is None:
            raise ValueError('%s in %s is not a regular file' % (fname, tarfile_full_path))
        m = mpimg.imread(img_file)
    plt.imshow(m)
    plt.title(fname)


# Load Data

In [None]:
# Store All Data in Memory
# Builds all_data (one normalized feature row per image) and all_fnames
# (the matching filenames, in the same order). Both stay None when no file
# matches the glob.
all_data = None
all_fnames = None
datas = []  # per-file arrays, concatenated once after the loop
for fname in glob.glob('hdf5/*')[:20]:
    print('Loading %s' % fname)
    # Close each HDF5 file as soon as its contents have been copied into memory
    with h5py.File(fname, 'r') as raw_data:
        data = normalize_rows(np.array(raw_data['feats']).transpose())
        fnames = raw_data['filenames'][()].tolist()
    datas.append(data)
    if all_fnames is None:
        all_fnames = []
    all_fnames.extend(fnames)
if datas:
    # A single concatenate avoids re-copying the accumulated array on every file
    all_data = np.concatenate(datas)

In [144]:
# Add to Engine as you go
# Imported here so this cell does not depend on the later NearPy cell having
# been run first.
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

all_data = None
all_fnames = None
dimension = 4096 # Tie to size
rbp = RandomBinaryProjections('rbp', 100)
engine = Engine(dimension, lshashes=[rbp])
total_count = 0  # number of vectors stored in the engine so far

for fname in glob.glob('hdf5/*')[:3]:
    print('Loading %s' % fname)
    # Read everything needed from the file up front, then close it
    with h5py.File(fname, 'r') as raw_data:
        data = normalize_rows(np.array(raw_data['feats']).transpose())
        fnames = raw_data['filenames'][()].tolist()

    if all_fnames is not None:
        all_fnames.extend(fnames)
    else:
        all_fnames = fnames

    if all_data is None: # So we have some stuff to test with
        all_data = data

    # Use the in-memory filename list rather than one HDF5 read per row
    for i in range(data.shape[0]):
        engine.store_vector(data[i, :], fnames[i])
        total_count += 1

Loading hdf5/183.hdf5
Loading hdf5/530.hdf5
Loading hdf5/989.hdf5


# NearPy

In [153]:
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
# Index every in-memory feature row under its filename, using a 10-bit
# random binary projection hash.
rbp = RandomBinaryProjections('rbp', 10)
nearpy = Engine(dimension, lshashes=[rbp])
for row_idx in range(len(all_data)):
    feature_row = all_data[row_idx, :]
    nearpy.store_vector(feature_row, all_fnames[row_idx])

In [None]:
NUM_NEIGHBORS = 3
import time
for src_idx in range(200, 210):
    # One figure per query; the query image goes in slot 2 of the top row
    src_fname = all_fnames[src_idx]
    plt.figure()
    plot_image(src_fname, [2, NUM_NEIGHBORS, 2])

    # Take the stored filename (element 1) of each hit, drop the first hit
    # (presumably the query itself) and keep the next NUM_NEIGHBORS
    hits = nearpy.neighbours(all_data[src_idx, :])
    nns = [hit[1] for hit in hits][1:NUM_NEIGHBORS + 1]

    # Neighbours fill the bottom row, left to right
    for slot, nn_fname in enumerate(nns, NUM_NEIGHBORS + 1):
        plot_image(nn_fname, [2, NUM_NEIGHBORS, slot])


# Annoy

In [141]:
from annoy import AnnoyIndex
# 'angular' is the metric Annoy uses when none is given; naming it explicitly
# documents the distance in use and avoids the warning newer Annoy releases
# emit for an implicit metric.
annoy = AnnoyIndex(dimension, metric='angular')
# Item ids are the row positions in all_data, so they also index all_fnames
total_count = 0
for row in all_data:
    annoy.add_item(total_count, row)
    total_count += 1
annoy.build(10) # 10 trees
annoy.save('test.ann')

True

In [None]:
NUM_NEIGHBORS = 3

for src_idx in range(0, 5):
    # Get Source Image
    src_fname = all_fnames[src_idx]
    plt.figure()
    plot_image(src_fname, [2, NUM_NEIGHBORS, 2])

    # Get nearest neighbors. The query item is normally returned as its own
    # closest match, so ask for one extra result and drop the query; this
    # matches the NearPy cell, which also skips the query image.
    candidates = annoy.get_nns_by_item(src_idx, NUM_NEIGHBORS + 1)
    nns = [nn for nn in candidates if nn != src_idx][:NUM_NEIGHBORS]
    for i, nn in enumerate(nns):
        # Add to plot (bottom row, left to right)
        nn_fname = all_fnames[int(nn)]
        plot_image(nn_fname, [2, NUM_NEIGHBORS, NUM_NEIGHBORS + i + 1])

# MinHeap Implementation (brute-force nearest-neighbour search)

In [28]:
import heapq
class MinHeap:
    '''
    A quick and dirty bounded collector used to test out the brute force search.

    Keeps the `max_size` smallest (item, index) pairs seen so far in a plain
    list. Despite the name it is not a real heap: finding the current worst
    entry is a linear scan, which is fine for the small sizes used here.
    '''
    def __init__(self, max_size):
        ''' max_size: maximum number of (item, index) pairs to retain. '''
        self.max_size = max_size
        self.data = []               # retained (item, index) pairs, unordered
        self.max_acceptable = None   # largest retained item; None while empty

    def get_max(self):
        ''' Return the largest retained item (raises ValueError when empty). '''
        return self.data[self.get_max_index()][0]

    def get_max_index(self):
        ''' Return the position in self.data of the largest (item, index) pair. '''
        return self.data.index(max(self.data))

    def insert(self, item, index):
        '''
        Offer a pair to the collector.

        While there is room the pair is always kept. Once full, it replaces the
        current largest pair only if `item` is strictly smaller than the largest
        retained item; otherwise it is dropped.
        '''
        if len(self.data) < self.max_size:
            self.data.append((item, index))
        elif self.data and item < self.max_acceptable:
            del self.data[self.get_max_index()]
            self.data.append((item, index))
        else:
            # Nothing changed (also covers max_size <= 0, where data stays empty
            # and max_acceptable is still None)
            return
        self.max_acceptable = self.get_max()

    def get_result(self):
        '''
        Return the retained indices ordered from smallest to largest item.

        When max_size == 1 a single index is returned instead of a list, or
        None if nothing has been inserted.
        '''
        # Sort on the item only; ties keep their insertion order
        ranked = [v for (k, v) in sorted(self.data, key=lambda pair: pair[0])]
        if self.max_size == 1:
            return ranked[0] if ranked else None
        return ranked

def get_distance(v1, v2):
    ''' Euclidean distance: square root of the summed squared differences of v1 and v2. '''
    diff = np.subtract(v1, v2)
    squared_total = np.sum(np.square(diff))
    return np.sqrt(squared_total)

def cosine_similarity(v1, v2):
    '''
    Dot product of v1 and v2.

    This equals the cosine similarity only when both vectors are already
    normalized to unit length; no normalization is done here.
    '''
    similarity = np.dot(v1, v2)
    return similarity

def row_iteratble(matrix):
    ''' Generator yielding the rows of a 2-D array one at a time, top to bottom. '''
    num_rows = matrix.shape[0]
    row_idx = 0
    while row_idx < num_rows:
        yield matrix[row_idx, :]
        row_idx += 1

def get_min_distances_index(source, targets, skip_index, heap_size):
    '''
    Brute-force search for the rows of `targets` closest to `source`.

    Every row except `skip_index` is scored with get_distance and offered to a
    MinHeap of size `heap_size`; whatever the MinHeap's get_result() reports
    is returned.
    '''
    closest = MinHeap(heap_size)
    for row_idx in range(len(targets)):
        if row_idx == skip_index:
            continue
        candidate = targets[row_idx, :]
        closest.insert(get_distance(source, candidate), row_idx)
    return closest.get_result()

def find_nn_index(data, index, num_results):
    '''
    Find the nearest neighbours of row `index` among the other rows of `data`.

    Delegates to get_min_distances_index, excluding the row itself, and
    returns its result for `num_results` neighbours.
    '''
    return get_min_distances_index(data[index, :], data, index, num_results)

In [None]:
NUM_NEIGHBORS = 3
for src_idx in range(162, 170):
    # Brute-force search over every row held in memory
    nns = find_nn_index(all_data, src_idx, NUM_NEIGHBORS)

    # Query image goes in slot 2 of the top row
    src_fname = os.path.basename(all_fnames[src_idx])
    plt.figure()
    plot_image(src_fname, [2, NUM_NEIGHBORS, 2])

    # Neighbours fill the bottom row, left to right
    for slot, nn in enumerate(nns, NUM_NEIGHBORS + 1):
        nn_fname = os.path.basename(all_fnames[nn])
        plot_image(nn_fname, [2, NUM_NEIGHBORS, slot])