In [32]:
import glob
import numpy as np 
from PIL import Image
from itertools import combinations
import os

DATASET_PATH = './dev_dataset/'

In [2]:
images = np.array(sorted(glob.glob(DATASET_PATH+'*.jpg')))

In [3]:
indecies = np.array(list(combinations(range(len(images)),2)))

In [4]:
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])

In [10]:
def median_image_hashing(image_path, hash_dims=(8,8)):
    '''
    Median image hashing algorithm
    '''
    image_pil = Image.open(image_path)
    resized_im = np.array(image_pil.resize(hash_dims))
    gray_im = rgb2gray(resized_im)
    binarized = (gray_im > np.median(gray_im)).astype(np.uint8)
    average_hash = binarized.flatten()
    return average_hash

In [11]:
def hamming_distance(hash_pair):
    return sum((hash_pair[0]!=hash_pair[1]).astype(np.uint8))

In [12]:
image_hashes = np.array([median_image_hashing(path) for path in images])

In [13]:
hash_pairs = image_hashes[indecies]

In [14]:
distance_values = np.array(list(map(hamming_distance, hash_pairs)))

In [15]:
dup_indecies = np.argwhere(distance_values<=6)

In [16]:
images[indecies[dup_indecies]].reshape(-1,2)

array([['./dev_dataset/1.jpg', './dev_dataset/1_duplicate.jpg'],
       ['./dev_dataset/11.jpg', './dev_dataset/11_duplicate.jpg'],
       ['./dev_dataset/11.jpg', './dev_dataset/11_modification.jpg'],
       ['./dev_dataset/11_duplicate.jpg',
        './dev_dataset/11_modification.jpg'],
       ['./dev_dataset/15.jpg', './dev_dataset/15_modification.jpg']],
      dtype='<U33')

###  Hmm.  Not all of dups were found. Seems that  median hashing is note translation invariant. Let's try out comparing image color distribution. 

# Image histograms

In [17]:
def color_histogram(image_path, bins=20, dims = (50,50)):
    image_pil = Image.open(image_path)
    resized_im = np.array(image_pil.resize(dims))
    gray_im = rgb2gray(resized_im)
    return np.histogram(gray_im,bins=np.linspace(0,255,bins))[0] 

In [18]:
def histogram_intersection(hist_pair):
    minima = np.minimum(hist_pair[0], hist_pair[1])
    intersection = np.true_divide(np.sum(minima), np.sum(hist_pair[1]))
    return intersection 

In [19]:
color_distribution = np.array([color_histogram(path) for path in images])

In [20]:
dist_pairs = color_distribution[indecies]

In [21]:
intersection_values = np.array(list(map(histogram_intersection, dist_pairs)))

In [22]:
dup_indecies = np.argwhere(intersection_values>=.9)

In [23]:
images[indecies[dup_indecies]].reshape(-1,2)

array([['./dev_dataset/1.jpg', './dev_dataset/1_duplicate.jpg'],
       ['./dev_dataset/11.jpg', './dev_dataset/11_duplicate.jpg'],
       ['./dev_dataset/11.jpg', './dev_dataset/11_modification.jpg'],
       ['./dev_dataset/11_duplicate.jpg',
        './dev_dataset/11_modification.jpg'],
       ['./dev_dataset/15.jpg', './dev_dataset/15_modification.jpg'],
       ['./dev_dataset/4.jpg', './dev_dataset/4_similar.jpg'],
       ['./dev_dataset/6.jpg', './dev_dataset/6_similar.jpg']],
      dtype='<U33')

### Yay! All dups seem to be found. I'll combine both of the algorithms just to be on the safe side :)

In [35]:
def find_dups_from_dir(data_dir, hash_threshold=6, hist_threshold=.9):
    
    if not os.path.exists(data_dir):
        print('Bad boy')
        return  
    
    images_filenames = np.array(glob.glob(data_dir+'/*'))
    possible_ind_combs = np.array(list(combinations(range(len(images_filenames)),2)))
    image_hashes = np.array([median_image_hashing(path) for path in images_filenames])
    color_distribution = np.array([color_histogram(path) for path in images_filenames])
    hash_pairs, dist_pairs = image_hashes[possible_ind_combs], color_distribution[possible_ind_combs]
    hamming_distances = np.array(list(map(hamming_distance, hash_pairs)))
    histogram_intersections =  np.array(list(map(histogram_intersection, dist_pairs)))
    
    
    mask1 = hamming_distances<hash_threshold
    mask2 = histogram_intersections>hist_threshold
    final_mask = mask1 | mask2
    dup_ind = np.argwhere(final_mask)
    image_ind = possible_ind_combs[dup_ind]
    
    return np.squeeze(images_filenames[image_ind])

In [41]:
find_dups_from_dir('./dev_dataset/')

array([['./dev_dataset/11_duplicate.jpg', './dev_dataset/11.jpg'],
       ['./dev_dataset/11_duplicate.jpg',
        './dev_dataset/11_modification.jpg'],
       ['./dev_dataset/6_similar.jpg', './dev_dataset/6.jpg'],
       ['./dev_dataset/15.jpg', './dev_dataset/15_modification.jpg'],
       ['./dev_dataset/11.jpg', './dev_dataset/11_modification.jpg'],
       ['./dev_dataset/4.jpg', './dev_dataset/4_similar.jpg'],
       ['./dev_dataset/1.jpg', './dev_dataset/1_duplicate.jpg']],
      dtype='<U33')