Skip to content
b89333f Nov 12, 2017
10 contributors

### Users who have contributed to this file

279 lines (221 sloc) 8.3 KB
 """ Image hashing library ====================== Example: >>> from PIL import Image >>> import imagehash >>> hash = imagehash.average_hash(Image.open('test.png')) >>> print(hash) d879f8f89b1bbf >>> otherhash = imagehash.average_hash(Image.open('other.bmp')) >>> print(otherhash) ffff3720200ffff >>> print(hash == otherhash) False >>> print(hash - otherhash) 36 >>> for r in range(1, 30, 5): ... rothash = imagehash.average_hash(Image.open('test.png').rotate(r)) ... print('Rotation by %d: %d Hamming difference' % (r, hash - rothash)) ... Rotation by 1: 2 Hamming difference Rotation by 6: 11 Hamming difference Rotation by 11: 13 Hamming difference Rotation by 16: 17 Hamming difference Rotation by 21: 19 Hamming difference Rotation by 26: 21 Hamming difference >>> Copyright (c) 2013-2017, Johannes Buchner -- see attached LICENSE file https://github.com/JohannesBuchner/imagehash """ from __future__ import (absolute_import, division, print_function) from PIL import Image import numpy #import scipy.fftpack #import pywt import os.path __version__ = open(os.path.join(os.path.abspath( os.path.dirname(__file__)), 'VERSION')).read().strip() def _binary_array_to_hex(arr): """ internal function to make a hex string out of a binary array. """ bit_string = ''.join(str(b) for b in 1 * arr.flatten()) width = int(numpy.ceil(len(bit_string)/4)) return '{:0>{width}x}'.format(int(bit_string, 2), width=width) class ImageHash(object): """ Hash encapsulation. Can be used for dictionary keys and comparisons. """ def __init__(self, binary_array): self.hash = binary_array def __str__(self): return _binary_array_to_hex(self.hash.flatten()) def __repr__(self): return repr(self.hash) def __sub__(self, other): if other is None: raise TypeError('Other hash must not be None.') if self.hash.size != other.hash.size: raise TypeError('ImageHashes must be of the same shape.', self.hash.shape, other.hash.shape) return numpy.count_nonzero(self.hash.flatten() != other.hash.flatten()) def __eq__(self, other): if other is None: return False return numpy.array_equal(self.hash.flatten(), other.hash.flatten()) def __ne__(self, other): if other is None: return False return not numpy.array_equal(self.hash.flatten(), other.hash.flatten()) def __hash__(self): # this returns a 8 bit integer, intentionally shortening the information return sum([2**(i % 8) for i, v in enumerate(self.hash.flatten()) if v]) def hex_to_hash(hexstr): """ Convert a stored hash (hex, as retrieved from str(Imagehash)) back to a Imagehash object. Notes: 1. This algorithm assumes all hashes are bidimensional arrays with dimensions hash_size * hash_size. 2. This algorithm does not work for hash_size < 2. """ hash_size = int(numpy.sqrt(len(hexstr)*4)) binary_array = '{:0>{width}b}'.format(int(hexstr, 16), width = hash_size * hash_size) bit_rows = [binary_array[i:i+hash_size] for i in range(0, len(binary_array), hash_size)] hash_array = numpy.array([[bool(int(d)) for d in row] for row in bit_rows]) return ImageHash(hash_array) def old_hex_to_hash(hexstr, hash_size=8): """ Convert a stored hash (hex, as retrieved from str(Imagehash)) back to a Imagehash object. This method should be used for hashes generated by ImageHash up to version 3.7. For hashes generated by newer versions of ImageHash, hex_to_hash should be used instead. """ l = [] count = hash_size * (hash_size // 4) if len(hexstr) != count: emsg = 'Expected hex string size of {}.' raise ValueError(emsg.format(count)) for i in range(count // 2): h = hexstr[i*2:i*2+2] v = int("0x" + h, 16) l.append([v & 2**i > 0 for i in range(8)]) return ImageHash(numpy.array(l)) def average_hash(image, hash_size=8): """ Average Hash computation Implementation follows http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html Step by step explanation: https://www.safaribooksonline.com/blog/2013/11/26/image-hashing-with-python/ @image must be a PIL instance. """ if hash_size < 2: raise ValueError("Hash size must be greater than or equal to 2") # reduce size and complexity, then covert to grayscale image = image.convert("L").resize((hash_size, hash_size), Image.ANTIALIAS) # find average pixel value; 'pixels' is an array of the pixel values, ranging from 0 (black) to 255 (white) pixels = numpy.asarray(image) avg = pixels.mean() # create string of bits diff = pixels > avg # make a hash return ImageHash(diff) def phash(image, hash_size=8, highfreq_factor=4): """ Perceptual Hash computation. Implementation follows http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html @image must be a PIL instance. """ if hash_size < 2: raise ValueError("Hash size must be greater than or equal to 2") import scipy.fftpack img_size = hash_size * highfreq_factor image = image.convert("L").resize((img_size, img_size), Image.ANTIALIAS) pixels = numpy.asarray(image) dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1) dctlowfreq = dct[:hash_size, :hash_size] med = numpy.median(dctlowfreq) diff = dctlowfreq > med return ImageHash(diff) def phash_simple(image, hash_size=8, highfreq_factor=4): """ Perceptual Hash computation. Implementation follows http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html @image must be a PIL instance. """ import scipy.fftpack img_size = hash_size * highfreq_factor image = image.convert("L").resize((img_size, img_size), Image.ANTIALIAS) pixels = numpy.asarray(image) dct = scipy.fftpack.dct(pixels) dctlowfreq = dct[:hash_size, 1:hash_size+1] avg = dctlowfreq.mean() diff = dctlowfreq > avg return ImageHash(diff) def dhash(image, hash_size=8): """ Difference Hash computation. following http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html computes differences horizontally @image must be a PIL instance. """ # resize(w, h), but numpy.array((h, w)) if hash_size < 2: raise ValueError("Hash size must be greater than or equal to 2") image = image.convert("L").resize((hash_size + 1, hash_size), Image.ANTIALIAS) pixels = numpy.asarray(image) # compute differences between columns diff = pixels[:, 1:] > pixels[:, :-1] return ImageHash(diff) def dhash_vertical(image, hash_size=8): """ Difference Hash computation. following http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html computes differences vertically @image must be a PIL instance. """ # resize(w, h), but numpy.array((h, w)) image = image.convert("L").resize((hash_size, hash_size + 1), Image.ANTIALIAS) pixels = numpy.asarray(image) # compute differences between rows diff = pixels[1:, :] > pixels[:-1, :] return ImageHash(diff) def whash(image, hash_size = 8, image_scale = None, mode = 'haar', remove_max_haar_ll = True): """ Wavelet Hash computation. based on https://www.kaggle.com/c/avito-duplicate-ads-detection/ @image must be a PIL instance. @hash_size must be a power of 2 and less than @image_scale. @image_scale must be power of 2 and less than image size. By default is equal to max power of 2 for an input image. @mode (see modes in pywt library): 'haar' - Haar wavelets, by default 'db4' - Daubechies wavelets @remove_max_haar_ll - remove the lowest low level (LL) frequency using Haar wavelet. """ import pywt if image_scale is not None: assert image_scale & (image_scale - 1) == 0, "image_scale is not power of 2" else: image_natural_scale = 2**int(numpy.log2(min(image.size))) image_scale = max(image_natural_scale, hash_size) ll_max_level = int(numpy.log2(image_scale)) level = int(numpy.log2(hash_size)) assert hash_size & (hash_size-1) == 0, "hash_size is not power of 2" assert level <= ll_max_level, "hash_size in a wrong range" dwt_level = ll_max_level - level image = image.convert("L").resize((image_scale, image_scale), Image.ANTIALIAS) pixels = numpy.asarray(image) / 255 # Remove low level frequency LL(max_ll) if @remove_max_haar_ll using haar filter if remove_max_haar_ll: coeffs = pywt.wavedec2(pixels, 'haar', level = ll_max_level) coeffs = list(coeffs) coeffs *= 0 pixels = pywt.waverec2(coeffs, 'haar') # Use LL(K) as freq, where K is log2(@hash_size) coeffs = pywt.wavedec2(pixels, mode, level = dwt_level) dwt_low = coeffs # Substract median and compute hash med = numpy.median(dwt_low) diff = dwt_low > med return ImageHash(diff)
You can’t perform that action at this time.