In [79]:
import cv, cv2
import numpy as np
import os
import skimage
import skimage.io as io

# Module-level SIFT detector / descriptor extractor, shared by
# compute_images_sift_features below.
# NOTE(review): cv2.SIFT() looks like the OpenCV 2.4-style constructor —
# confirm against the installed OpenCV version before upgrading.
sift = cv2.SIFT()

def compute_images_sift_features(images):
    """Compute the SIFT descriptors of every image.

    Parameters
    ----------
    images : iterable
        Images accepted by ``sift.detectAndCompute``.

    Returns
    -------
    list
        One entry per image, in input order: the second element of the
        ``sift.detectAndCompute(image, None)`` result (the descriptors).
        NOTE(review): OpenCV presumably yields ``None`` here for an image
        in which no keypoints were found — confirm.
    """
    # Build a real list (not a lazy ``map`` object) so that callers can take
    # its length and iterate over it more than once on any Python version.
    return [sift.detectAndCompute(image, None)[1] for image in images]

def train_classifier(images, number_of_clusters=1000):
    """Build a bag-of-visual-words vocabulary and per-image word histograms.

    The SIFT descriptors of all training images are pooled and clustered
    with k-means. Every image is then described by the normalised histogram
    of the clusters ("visual words") its own descriptors were assigned to.

    Parameters
    ----------
    images : sequence
        Training images; must support ``len()`` and iteration.
    number_of_clusters : int, optional
        Size of the visual vocabulary (``K`` passed to ``cv2.kmeans``).

    Returns
    -------
    centers : ndarray
        Cluster centres as returned by ``cv2.kmeans``.
    image_descriptors : ndarray, shape (len(images), number_of_clusters)
        Row ``i`` holds the word frequencies of image ``i``. An image that
        produced no descriptors gets an all-zero row.

    Raises
    ------
    ValueError
        If no descriptor was found in any of the images.
    """
    img_number = len(images)

    # Detect and compute SIFT descriptors in each image. Materialised as a
    # list because the features are traversed more than once below.
    features = list(compute_images_sift_features(images))

    # Number of descriptors per image; a missing descriptor array (None)
    # counts as zero descriptors.
    descriptors_per_image = [0 if feature is None else feature.shape[0]
                             for feature in features]

    non_empty_features = [feature
                          for feature, count in zip(features, descriptors_per_image)
                          if count > 0]
    if not non_empty_features:
        raise ValueError("no SIFT descriptors found in any training image")

    # For every pooled descriptor, the index of the image it came from.
    # Built in one call instead of growing an array with np.append in a loop.
    training_set_indexes = np.repeat(np.arange(len(features)), descriptors_per_image)

    # Put all the descriptors in one array to find clusters (bag of words)
    training_set = np.vstack(non_empty_features)

    # (termination type, max iterations, epsilon) for k-means
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)

    # The literal 10 is the number of k-means attempts.
    _, labels, centers = cv2.kmeans(training_set, number_of_clusters, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

    # Make the labels array one-dimensional (one word index per descriptor)
    labels = labels.ravel()

    image_descriptors = np.zeros((img_number, number_of_clusters))

    for current_img_number, current_img_words_number in enumerate(descriptors_per_image):

        # Nothing to count (and nothing to divide by): keep the zero row.
        if current_img_words_number == 0:
            continue

        # Get the words of the current image
        current_img_words = labels[training_set_indexes == current_img_number]

        current_histogram = np.bincount(current_img_words, minlength=number_of_clusters).astype(np.float32)

        # Normalise by the number of words so images with different
        # keypoint counts stay comparable.
        image_descriptors[current_img_number, :] = current_histogram / current_img_words_number

    return centers, image_descriptors

def get_images_filenames(images_folder, image_categories_folders,
                               amount_of_images_to_take=8, fetch_from_beginnig=True):
    """Collect image paths, and their category labels, from category folders.

    Parameters
    ----------
    images_folder : str
        Folder that contains one sub-folder per category.
    image_categories_folders : sequence of str
        Names of the category sub-folders to read, in label order.
    amount_of_images_to_take : int, optional
        How many files to take from each category folder.
    fetch_from_beginnig : bool, optional
        If true, take the first files of the sorted listing; otherwise take
        the last ones.

    Returns
    -------
    images_filenames_list : list of str
        Selected file paths, grouped by category in the given order.
    labels : ndarray of int
        One label per returned path: the position of the file's category
        in ``image_categories_folders``.
    """
    images_filenames_list = []
    labels = []

    for category_index, current_category_folder in enumerate(image_categories_folders):

        # Get the full path to current category folder
        current_category_folder_full_path = os.path.join(images_folder, current_category_folder)

        # List the folder in lexicographical order, so that names such as
        # 01.jpg, 02.jpg, 03.jpg come out in sequence.
        current_category_filenames = sorted(os.listdir(current_category_folder_full_path))

        # Take the images from the beginning or from the end.
        if amount_of_images_to_take == 0:
            # Explicit guard: names[-0:] would select the whole folder.
            images_filenames_to_add = []
        elif fetch_from_beginnig:
            images_filenames_to_add = current_category_filenames[:amount_of_images_to_take]
        else:
            images_filenames_to_add = current_category_filenames[-amount_of_images_to_take:]

        images_filenames_list.extend(
            os.path.join(current_category_folder_full_path, filename)
            for filename in images_filenames_to_add)

        # Label the files actually taken, so labels always line up with the
        # returned paths even when a folder holds fewer files than requested.
        labels.extend([category_index] * len(images_filenames_to_add))

    return images_filenames_list, np.asarray(labels, dtype=int)

def closest_cluster(feature_vector, clusters):
    """Return the row index of the cluster centre nearest to ``feature_vector``.

    Nearness is measured with the squared Euclidean distance; on a tie the
    lowest index wins (``np.argmin`` semantics).
    """
    offsets = feature_vector - clusters
    squared_distances = (offsets ** 2).sum(axis=1)
    return np.argmin(squared_distances)

def compute_bag_of_words_repr(feature_vect, clusters):
    """Describe one image as a normalised histogram of visual words.

    Parameters
    ----------
    feature_vect : ndarray or None
        Descriptors of one image, one descriptor per row. ``None`` or an
        empty array means the image has no descriptors.
    clusters : ndarray
        Cluster centres (the vocabulary), one centre per row.

    Returns
    -------
    ndarray of float32, shape (clusters.shape[0],)
        Fraction of the image's descriptors assigned to each cluster; all
        zeros when there are no descriptors.
    """
    number_of_clusters = clusters.shape[0]

    # Without descriptors there is nothing to count and nothing to divide
    # by, so return an empty histogram instead of failing.
    if feature_vect is None or len(feature_vect) == 0:
        return np.zeros(number_of_clusters, dtype=np.float32)

    img_words_number = feature_vect.shape[0]

    # Word index (nearest cluster) of every descriptor. A list comprehension
    # gives np.asarray a real sequence on every Python version.
    img_words = np.asarray([closest_cluster(descriptor, clusters)
                            for descriptor in feature_vect])

    bag_of_words_count = np.bincount(img_words, minlength=number_of_clusters).astype(np.float32)

    return bag_of_words_count / img_words_number

def compute_bag_of_words_repr_batch(feature_vectors, clusters):
    """Compute the bag-of-words histogram of every image in a batch.

    Parameters
    ----------
    feature_vectors : iterable
        One descriptor array per image, as accepted by
        ``compute_bag_of_words_repr``.
    clusters : ndarray
        Cluster centres (the vocabulary), one centre per row.

    Returns
    -------
    ndarray, shape (number of images, clusters.shape[0])
        One histogram per row, in input order. An empty batch yields an
        array with zero rows.
    """
    # Collect into a list: np.vstack needs a sequence, not a lazy iterator.
    histograms = [compute_bag_of_words_repr(feature_vect, clusters)
                  for feature_vect in feature_vectors]

    if not histograms:
        return np.empty((0, clusters.shape[0]), dtype=np.float32)

    return np.vstack(histograms)

def images_to_bag_of_words_histogram(images, clusters):
    """Convert images to bag-of-words histograms over a given vocabulary.

    Parameters
    ----------
    images : iterable
        Images to describe.
    clusters : ndarray
        Cluster centres (the vocabulary), one centre per row.

    Returns
    -------
    ndarray
        One histogram per image, as produced by
        ``compute_bag_of_words_repr_batch``.
    """
    images_features = compute_images_sift_features(images)

    # Use the vocabulary passed in by the caller, not a global name.
    images_histograms = compute_bag_of_words_repr_batch(images_features, clusters)

    return images_histograms



In [2]:
# Folder that holds one sub-folder of images per category.
images_folder = 'images'
# Larger configuration, currently disabled:
# image_categories_folders = ['buildings', 'cars', 'faces', 'food', 'people', 'trees']
# amount_of_first_images_to_take = 9
# Smaller configuration currently in use: two categories, one image each.
image_categories_folders = ['buildings', 'cars']
amount_of_first_images_to_take = 1

In [80]:
# get_images_filenames returns a (filenames, labels) pair, so unpack it:
# only the filenames may be handed to io.imread_collection.
# Training samples come from the start of each category folder ...
train_images_filenames, train_labels = get_images_filenames(images_folder, image_categories_folders,
                                                            amount_of_first_images_to_take)

# ... and test samples from the end of it.
test_images_filenames, test_labels = get_images_filenames(images_folder, image_categories_folders,
                                                          amount_of_first_images_to_take,
                                                          fetch_from_beginnig=False)

print(train_images_filenames)
print(test_images_filenames)

train_images = io.imread_collection(train_images_filenames)
test_images = io.imread_collection(test_images_filenames)

# Learn the visual vocabulary and the training histograms ...
cluster_centers, train_images_histograms = train_classifier(train_images)

# ... then describe the test images with the same vocabulary.
test_images_histograms = images_to_bag_of_words_histogram(test_images, cluster_centers)



['images/buildings/01.jpg', 'images/cars/01.jpg']
['images/buildings/11.jpg', 'images/cars/11.jpg']


In [88]:
# Scratch check: np.repeat repeats each element of np.arange(2) twice, in order.
np.repeat(np.arange(2), 2)

array([0, 0, 1, 1])