In [1]:
from pathlib import Path
import imagehash
from PIL import Image
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor

In [2]:
# Worker count passed to ProcessPoolExecutor further down. Despite the name,
# these are separate processes, not threads.
THREADS = 24

In [3]:
# Root directory that is searched recursively for the files to hash.
images_dir = Path('/pool/genomics/triznam/nmah_dupes/dupe_images/nmah/DataScience')

In [4]:
# Gather every regular file below images_dir (directories are filtered out),
# then display how many were found.
files_list = list(filter(Path.is_file, images_dir.rglob('*')))
len(files_list)

9466

In [5]:
def compute_hashes(image_path):
    """Compute the size and four perceptual hashes for a single image file.

    The image is converted to grayscale and resized to 100x100 with the
    Lanczos filter before hashing.

    Parameters
    ----------
    image_path : str or pathlib.Path
        Location of the file to open.

    Returns
    -------
    dict
        Always contains ``'image'`` (the path as a string), ``'width'`` and
        ``'height'``. On success it also contains ``'av_hash'``, ``'phash'``,
        ``'dhash'`` and ``'whash'`` as strings. If the file cannot be opened
        or hashed, ``'width'`` and ``'height'`` are NaN and no hash keys are
        present.
    """
    return_dict = {'image': str(image_path)}
    try:
        with Image.open(image_path) as im:
            width, height = im.size
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name and also exists in older releases.
            resized = im.convert("L").resize((100, 100), Image.LANCZOS)
            hashes = {
                'av_hash': str(imagehash.average_hash(resized)),
                'phash': str(imagehash.phash(resized)),
                'dhash': str(imagehash.dhash(resized)),
                'whash': str(imagehash.whash(resized)),
            }
    except Exception:
        # Best-effort: an unreadable or non-image file must not abort the whole
        # batch, so it is recorded with NaN dimensions instead. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return_dict['width'], return_dict['height'] = np.nan, np.nan
    else:
        # Only merge results once everything succeeded, so a failure halfway
        # through hashing never leaves a row with some hashes but NaN size.
        return_dict['width'], return_dict['height'] = width, height
        return_dict.update(hashes)
    return return_dict

In [6]:
# Hash all files in parallel. Using the executor as a context manager shuts the
# worker processes down once every result has been collected, instead of
# leaving them alive until the executor object is garbage-collected.
# map() yields results in the same order as files_list.
with ProcessPoolExecutor(max_workers=THREADS) as executor:
    hash_results = list(executor.map(compute_hashes, files_list))

In [7]:
# One row per input file. Result dicts that lack the hash keys (files that
# could not be processed) end up with missing values in those columns.
hash_df = pd.DataFrame(hash_results)
# The non-null counts show how many files were hashed successfully.
hash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9466 entries, 0 to 9465
Data columns (total 7 columns):
image      9466 non-null object
width      9439 non-null float64
height     9439 non-null float64
av_hash    9439 non-null object
phash      9439 non-null object
dhash      9439 non-null object
whash      9439 non-null object
dtypes: float64(2), object(5)
memory usage: 517.8+ KB


In [8]:
# Preview the first few rows as a sanity check on the computed values.
hash_df.head()

Unnamed: 0,image,width,height,av_hash,phash,dhash,whash
0,/pool/genomics/triznam/nmah_dupes/dupe_images/...,2000.0,1961.0,00005434383000ff,c638b16f9736c4c8,7290b4cd61612382,1a587e7c3c3010ff
1,/pool/genomics/triznam/nmah_dupes/dupe_images/...,1024.0,794.0,ff8180818181d9f3,ff15909ec598c4c3,332f070347052b07,ff818181a181d9ff
2,/pool/genomics/triznam/nmah_dupes/dupe_images/...,1024.0,963.0,003c7e7efe7e7c18,c41c1f6f3832699d,71e0e0a0a2c2e870,003c7e7c7e7a7c10
3,/pool/genomics/triznam/nmah_dupes/dupe_images/...,1024.0,670.0,913978f2d0588ece,d06cbb6972f440bc,2373f4a6a6d01c18,91397cf2d0588ece
4,/pool/genomics/triznam/nmah_dupes/dupe_images/...,1024.0,672.0,001c7e7e3e181800,9cb26b4d63c3b294,33f4d4d4f4f0b231,107e7e7e7e3e1800


In [9]:
# Save the results as a tab-separated file without the index column; the
# relative path resolves against the current working directory.
hash_df.to_csv('dupe_test_hash_results.tsv', sep='\t', index=False)