In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Tuple
import imagehash
import numpy as n
from PIL import Image

    
def FindSignature(image:str,HashSize:int)-> n.ndarray:
    """Compute the flattened difference-hash (dHash) signature of one image file.

    Args:
        image: Path of the image file to hash.
        HashSize: Side length handed to ``imagehash.dhash``. The image is first
            reduced to a greyscale thumbnail of (HashSize+1) x HashSize pixels.

    Returns:
        The hash bit matrix flattened to one dimension.

    Raises:
        OSError: Whatever ``Image.open`` raises for an unreadable file propagates.
    """
    # Open the path given by the caller; the context manager releases the file
    # handle even when convert/resize fails.
    with Image.open(image) as SourceImage:
        # LANCZOS is the filter formerly aliased as ANTIALIAS (alias removed in Pillow 10).
        pilImage = SourceImage.convert("L").resize((HashSize+1,HashSize),Image.LANCZOS)
    dhash = imagehash.dhash(pilImage,HashSize)
    Signature = dhash.hash.flatten()
    pilImage.close()
    return Signature

def FindDuplicates(Directory:str,threshold:float,HashSize:int,bands:int) -> List[Tuple[str,str,float]]:
    """Find near-duplicate image pairs in a directory via banded LSH over dHash signatures.

    Every regular file directly inside ``Directory`` is hashed with
    ``FindSignature``. Each signature is cut into ``bands`` slices of equal
    width; files that agree on every bit of at least one slice become a
    candidate pair, and candidates are then compared over the full signature.

    Args:
        Directory: Folder to scan (not recursive).
        threshold: A pair is reported when its similarity is strictly greater than this.
        HashSize: dHash side length; signatures are expected to hold HashSize**2 bits.
        bands: Number of slices used for bucketing.

    Returns:
        ``(file_a, file_b, similarity)`` tuples sorted by ascending similarity,
        where similarity is the fraction of identical signature bits.

    Raises:
        ValueError: If ``bands`` is not between 1 and HashSize**2.
        OSError: If ``Directory`` cannot be listed.
    """
    TotalBits = HashSize**2
    if not 1 <= bands <= TotalBits:
        raise ValueError(f"bands must be between 1 and HashSize**2 ({TotalBits}), got {bands}")
    # Bits per band. Must be an integer because it is used as a slice bound;
    # bits left over after the last full band are simply not bucketed.
    rows:int = TotalBits // bands
    Signatures = dict()
    HashBucketsList: List[Dict[bytes, List[str]]] = [dict() for _ in range(bands)]

    #list of candidate files
    FileList = [join(Directory, fd) for fd in listdir(Directory) if isfile(join(Directory, fd))]
    #Check through all files
    for fl in FileList:
        try:
            Signature = FindSignature(fl, HashSize)
        except IOError:
            # Best effort: files that cannot be read as images are skipped.
            continue
        #Record Signatures (bit-packed to save memory)
        Signatures[fl] = n.packbits(Signature)

        # Locality Sensitive Hashing: one bucket table per band
        for i in range(bands):
            SBandBytes = Signature[i*rows:(i+1)*rows].tobytes()
            HashBucketsList[i].setdefault(SBandBytes, list()).append(fl)

    # Build candidates based on bucket
    Candidates = set()
    for HashBuckets in HashBucketsList:
        for Bucket in HashBuckets.values():
            if len(Bucket) > 1:
                # Sorting makes (a, b) and (b, a) collapse to one set entry.
                Members = sorted(Bucket)
                for l in range(len(Members)):
                    for m in range(l+1, len(Members)):
                        Candidates.add((Members[l], Members[m]))

    # Check candidates for similarity
    AlmostSimilar = list()
    for cpa, cpb in Candidates:
        # XOR the packed bytes, then count set bits. ndarray.sum() accumulates
        # in a wide integer, so the count cannot wrap the way uint8 scalars can.
        value = int(n.unpackbits(n.bitwise_xor(Signatures[cpa], Signatures[cpb])).sum())
        similarityValue = (TotalBits-value) / TotalBits
        if similarityValue > threshold:
            AlmostSimilar.append((cpa, cpb, similarityValue))

    # Sort and return
    AlmostSimilar.sort(key=lambda x:x[2], reverse=False)
    return AlmostSimilar

def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 19, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")


# In[ ]:








In [2]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Tuple
import imagehash
import numpy as n
from PIL import Image

    
def FindSignature(image:str,HashSize:int)-> n.ndarray:
    """Compute the flattened difference-hash (dHash) signature of one image file.

    Args:
        image: Path of the image file to hash.
        HashSize: Side length handed to ``imagehash.dhash``. The image is first
            reduced to a greyscale thumbnail of (HashSize+1) x HashSize pixels.

    Returns:
        The hash bit matrix flattened to one dimension.

    Raises:
        OSError: Whatever ``Image.open`` raises for an unreadable file propagates.
    """
    # Open the path given by the caller; the context manager releases the file
    # handle even when convert/resize fails.
    with Image.open(image) as SourceImage:
        # LANCZOS is the filter formerly aliased as ANTIALIAS (alias removed in Pillow 10).
        pilImage = SourceImage.convert("L").resize((HashSize+1,HashSize),Image.LANCZOS)
    dhash = imagehash.dhash(pilImage,HashSize)
    Signature = dhash.hash.flatten()
    pilImage.close()
    return Signature

def FindDuplicates(Directory:str,threshold:float,HashSize:int,bands:int) -> List[Tuple[str,str,float]]:
    """Find near-duplicate image pairs in a directory via banded LSH over dHash signatures.

    Every regular file directly inside ``Directory`` is hashed with
    ``FindSignature``. Each signature is cut into ``bands`` slices of equal
    width; files that agree on every bit of at least one slice become a
    candidate pair, and candidates are then compared over the full signature.

    Args:
        Directory: Folder to scan (not recursive).
        threshold: A pair is reported when its similarity is strictly greater than this.
        HashSize: dHash side length; signatures are expected to hold HashSize**2 bits.
        bands: Number of slices used for bucketing.

    Returns:
        ``(file_a, file_b, similarity)`` tuples sorted by ascending similarity,
        where similarity is the fraction of identical signature bits.

    Raises:
        ValueError: If ``bands`` is not between 1 and HashSize**2.
        OSError: If ``Directory`` cannot be listed.
    """
    TotalBits = HashSize**2
    if not 1 <= bands <= TotalBits:
        raise ValueError(f"bands must be between 1 and HashSize**2 ({TotalBits}), got {bands}")
    # Bits per band. Must be an integer because it is used as a slice bound;
    # bits left over after the last full band are simply not bucketed.
    rows:int = TotalBits // bands
    Signatures = dict()
    HashBucketsList: List[Dict[bytes, List[str]]] = [dict() for _ in range(bands)]

    #list of candidate files
    FileList = [join(Directory, fd) for fd in listdir(Directory) if isfile(join(Directory, fd))]
    #Check through all files
    for fl in FileList:
        try:
            Signature = FindSignature(fl, HashSize)
        except IOError:
            # Best effort: files that cannot be read as images are skipped.
            continue
        #Record Signatures (bit-packed to save memory)
        Signatures[fl] = n.packbits(Signature)

        # Locality Sensitive Hashing: one bucket table per band
        for i in range(bands):
            SBandBytes = Signature[i*rows:(i+1)*rows].tobytes()
            HashBucketsList[i].setdefault(SBandBytes, list()).append(fl)

    # Build candidates based on bucket
    Candidates = set()
    for HashBuckets in HashBucketsList:
        for Bucket in HashBuckets.values():
            if len(Bucket) > 1:
                # Sorting makes (a, b) and (b, a) collapse to one set entry.
                Members = sorted(Bucket)
                for l in range(len(Members)):
                    for m in range(l+1, len(Members)):
                        Candidates.add((Members[l], Members[m]))

    # Check candidates for similarity
    AlmostSimilar = list()
    for cpa, cpb in Candidates:
        # XOR the packed bytes, then count set bits. ndarray.sum() accumulates
        # in a wide integer, so the count cannot wrap the way uint8 scalars can.
        value = int(n.unpackbits(n.bitwise_xor(Signatures[cpa], Signatures[cpb])).sum())
        similarityValue = (TotalBits-value) / TotalBits
        if similarityValue > threshold:
            AlmostSimilar.append((cpa, cpb, similarityValue))

    # Sort and return
    AlmostSimilar.sort(key=lambda x:x[2], reverse=False)
    return AlmostSimilar

In [3]:
def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 19, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")

In [4]:
def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 19, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")

In [5]:
import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Tuple
import imagehash
import numpy as n
from PIL import Image

    
def FindSignature(image:str,HashSize:int)-> n.ndarray:
    """Compute the flattened difference-hash (dHash) signature of one image file.

    Args:
        image: Path of the image file to hash.
        HashSize: Side length handed to ``imagehash.dhash``. The image is first
            reduced to a greyscale thumbnail of (HashSize+1) x HashSize pixels.

    Returns:
        The hash bit matrix flattened to one dimension.

    Raises:
        OSError: Whatever ``Image.open`` raises for an unreadable file propagates.
    """
    # Open the path given by the caller; the context manager releases the file
    # handle even when convert/resize fails.
    with Image.open(image) as SourceImage:
        # Thumbnail size follows HashSize instead of a fixed 21x20.
        # LANCZOS is the filter formerly aliased as ANTIALIAS (alias removed in Pillow 10).
        pilImage = SourceImage.convert("L").resize((HashSize+1, HashSize),Image.LANCZOS)
    dhash = imagehash.dhash(pilImage,HashSize)
    Signature = dhash.hash.flatten()
    pilImage.close()
    return Signature

def FindDuplicates(Directory: str,threshold: float,HashSize:int,bands: int) -> List[Tuple[str, str, float]]:
    """Find near-duplicate image pairs in a directory via banded LSH over dHash signatures.

    Every regular file directly inside ``Directory`` is hashed with
    ``FindSignature``. Each signature is cut into ``bands`` slices of equal
    width; files that agree on every bit of at least one slice become a
    candidate pair, and candidates are then compared over the full signature.

    Args:
        Directory: Folder to scan (not recursive).
        threshold: A pair is reported when its similarity is strictly greater than this.
        HashSize: dHash side length; signatures are expected to hold HashSize**2 bits.
        bands: Number of slices used for bucketing.

    Returns:
        ``(file_a, file_b, similarity)`` tuples sorted by descending similarity,
        where similarity is the fraction of identical signature bits.

    Raises:
        ValueError: If ``bands`` is not between 1 and HashSize**2.
        OSError: If ``Directory`` cannot be listed.
    """
    TotalBits = HashSize**2
    if not 1 <= bands <= TotalBits:
        raise ValueError(f"bands must be between 1 and HashSize**2 ({TotalBits}), got {bands}")
    # Bits per band. Must be an integer because it is used as a slice bound;
    # bits left over after the last full band are simply not bucketed.
    rows:int = TotalBits // bands
    Signatures = dict()
    HashBucketsList: List[Dict[bytes, List[str]]] = [dict() for _ in range(bands)]

    #list of candidate files
    FileList = [join(Directory, fd) for fd in listdir(Directory) if isfile(join(Directory, fd))]
    #Check through all files
    for fl in FileList:
        try:
            Signature = FindSignature(fl, HashSize)
        except IOError:
            # Best effort: files that cannot be read as images are skipped.
            continue
        #Record Signatures (bit-packed to save memory)
        Signatures[fl] = n.packbits(Signature)

        # Locality Sensitive Hashing: one bucket table per band
        for i in range(bands):
            SignatureBandBytes = Signature[i*rows:(i+1)*rows].tobytes()
            HashBucketsList[i].setdefault(SignatureBandBytes, list()).append(fl)

    # Build candidates based on bucket membership
    Candidates = set()
    for HashBuckets in HashBucketsList:
        for HashBucket in HashBuckets.values():
            if len(HashBucket) > 1:
                # Sorting makes (a, b) and (b, a) collapse to one set entry.
                Members = sorted(HashBucket)
                for i in range(len(Members)):
                    for j in range(i+1, len(Members)):
                        Candidates.add((Members[i], Members[j]))

    # Check candidate pairs for similarity
    AlmostSimilar = list()
    for cpa, cpb in Candidates:
        # XOR the packed bytes, then count set bits. ndarray.sum() accumulates
        # in a wide integer, so the count cannot wrap the way uint8 scalars can.
        hd = int(n.unpackbits(n.bitwise_xor(Signatures[cpa], Signatures[cpb])).sum())
        similarity = (TotalBits - hd) / TotalBits
        if similarity > threshold:
            AlmostSimilar.append((cpa, cpb, similarity))

    # Sort near-duplicates by descending similarity and return
    AlmostSimilar.sort(key=lambda x:x[2], reverse=True)
    return AlmostSimilar

def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 1051, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")


In [6]:
import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Tuple
import imagehash
import numpy as n
from PIL import Image

    
def FindSignature(image:str,HashSize:int)-> n.ndarray:
    """Compute the flattened difference-hash (dHash) signature of one image file.

    Args:
        image: Path of the image file to hash.
        HashSize: Side length handed to ``imagehash.dhash``. The image is first
            reduced to a greyscale thumbnail of (HashSize+1) x HashSize pixels.

    Returns:
        The hash bit matrix flattened to one dimension.

    Raises:
        OSError: Whatever ``Image.open`` raises for an unreadable file propagates.
    """
    # Open the path given by the caller; the context manager releases the file
    # handle even when convert/resize fails.
    with Image.open(image) as SourceImage:
        # Thumbnail size follows HashSize instead of a fixed 21x20.
        # LANCZOS is the filter formerly aliased as ANTIALIAS (alias removed in Pillow 10).
        pilImage = SourceImage.convert("L").resize((HashSize+1, HashSize),Image.LANCZOS)
    dhash = imagehash.dhash(pilImage,HashSize)
    Signature = dhash.hash.flatten()
    pilImage.close()
    return Signature

def FindDuplicates(Directory: str,threshold: float,HashSize:int,bands: int) -> List[Tuple[str, str, float]]:
    """Find near-duplicate image pairs in a directory via banded LSH over dHash signatures.

    Every regular file directly inside ``Directory`` is hashed with
    ``FindSignature``. Each signature is cut into ``bands`` slices of equal
    width; files that agree on every bit of at least one slice become a
    candidate pair, and candidates are then compared over the full signature.

    Args:
        Directory: Folder to scan (not recursive).
        threshold: A pair is reported when its similarity is strictly greater than this.
        HashSize: dHash side length; signatures are expected to hold HashSize**2 bits.
        bands: Number of slices used for bucketing.

    Returns:
        ``(file_a, file_b, similarity)`` tuples sorted by descending similarity,
        where similarity is the fraction of identical signature bits.

    Raises:
        ValueError: If ``bands`` is not between 1 and HashSize**2.
        OSError: If ``Directory`` cannot be listed.
    """
    TotalBits = HashSize**2
    if not 1 <= bands <= TotalBits:
        raise ValueError(f"bands must be between 1 and HashSize**2 ({TotalBits}), got {bands}")
    # Bits per band. Must be an integer because it is used as a slice bound;
    # bits left over after the last full band are simply not bucketed.
    rows:int = TotalBits // bands
    Signatures = dict()
    HashBucketsList: List[Dict[bytes, List[str]]] = [dict() for _ in range(bands)]

    #list of candidate files
    FileList = [join(Directory, fd) for fd in listdir(Directory) if isfile(join(Directory, fd))]
    #Check through all files
    for fl in FileList:
        try:
            Signature = FindSignature(fl, HashSize)
        except IOError:
            # Best effort: files that cannot be read as images are skipped.
            continue
        #Record Signatures (bit-packed to save memory)
        Signatures[fl] = n.packbits(Signature)

        # Locality Sensitive Hashing: one bucket table per band
        for i in range(bands):
            SignatureBandBytes = Signature[i*rows:(i+1)*rows].tobytes()
            HashBucketsList[i].setdefault(SignatureBandBytes, list()).append(fl)

    # Build candidates based on bucket membership
    Candidates = set()
    for HashBuckets in HashBucketsList:
        for HashBucket in HashBuckets.values():
            if len(HashBucket) > 1:
                # Sorting makes (a, b) and (b, a) collapse to one set entry.
                Members = sorted(HashBucket)
                for i in range(len(Members)):
                    for j in range(i+1, len(Members)):
                        Candidates.add((Members[i], Members[j]))

    # Check candidate pairs for similarity
    AlmostSimilar = list()
    for cpa, cpb in Candidates:
        # XOR the packed bytes, then count set bits. ndarray.sum() accumulates
        # in a wide integer, so the count cannot wrap the way uint8 scalars can.
        hd = int(n.unpackbits(n.bitwise_xor(Signatures[cpa], Signatures[cpb])).sum())
        similarity = (TotalBits - hd) / TotalBits
        if similarity > threshold:
            AlmostSimilar.append((cpa, cpb, similarity))

    # Sort near-duplicates by descending similarity and return
    AlmostSimilar.sort(key=lambda x:x[2], reverse=True)
    return AlmostSimilar

def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 1051, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")


In [7]:



import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Tuple
import imagehash
import numpy as n
from PIL import Image

    
def FindSignature(image:str,HashSize:int)-> n.ndarray:
    """Compute the flattened difference-hash (dHash) signature of one image file.

    Args:
        image: Path of the image file to hash.
        HashSize: Side length handed to ``imagehash.dhash``. The image is first
            reduced to a greyscale thumbnail of (HashSize+1) x HashSize pixels.

    Returns:
        The hash bit matrix flattened to one dimension.

    Raises:
        OSError: Whatever ``Image.open`` raises for an unreadable file propagates.
    """
    # Open the path given by the caller; the context manager releases the file
    # handle even when convert/resize fails.
    with Image.open(image) as SourceImage:
        # LANCZOS is the filter formerly aliased as ANTIALIAS (alias removed in Pillow 10).
        pilImage = SourceImage.convert("L").resize((HashSize+1,HashSize),Image.LANCZOS)
    dhash = imagehash.dhash(pilImage,HashSize)
    Signature = dhash.hash.flatten()
    pilImage.close()
    return Signature

def FindDuplicates(Directory:str,threshold:float,HashSize:int,bands:int) -> List[Tuple[str,str,float]]:
    """Find near-duplicate image pairs in a directory via banded LSH over dHash signatures.

    Every regular file directly inside ``Directory`` is hashed with
    ``FindSignature``. Each signature is cut into ``bands`` slices of equal
    width; files that agree on every bit of at least one slice become a
    candidate pair, and candidates are then compared over the full signature.

    Args:
        Directory: Folder to scan (not recursive).
        threshold: A pair is reported when its similarity is strictly greater than this.
        HashSize: dHash side length; signatures are expected to hold HashSize**2 bits.
        bands: Number of slices used for bucketing.

    Returns:
        ``(file_a, file_b, similarity)`` tuples sorted by ascending similarity,
        where similarity is the fraction of identical signature bits.

    Raises:
        ValueError: If ``bands`` is not between 1 and HashSize**2.
        OSError: If ``Directory`` cannot be listed.
    """
    TotalBits = HashSize**2
    if not 1 <= bands <= TotalBits:
        raise ValueError(f"bands must be between 1 and HashSize**2 ({TotalBits}), got {bands}")
    # Bits per band. Must be an integer because it is used as a slice bound;
    # bits left over after the last full band are simply not bucketed.
    rows:int = TotalBits // bands
    Signatures = dict()
    HashBucketsList: List[Dict[bytes, List[str]]] = [dict() for _ in range(bands)]

    #list of candidate files
    FileList = [join(Directory, fd) for fd in listdir(Directory) if isfile(join(Directory, fd))]
    #Check through all files
    for fl in FileList:
        try:
            Signature = FindSignature(fl, HashSize)
        except IOError:
            # Best effort: files that cannot be read as images are skipped.
            continue
        #Record Signatures (bit-packed to save memory)
        Signatures[fl] = n.packbits(Signature)

        # Locality Sensitive Hashing: one bucket table per band
        for i in range(bands):
            SBandBytes = Signature[i*rows:(i+1)*rows].tobytes()
            HashBucketsList[i].setdefault(SBandBytes, list()).append(fl)

    # Build candidates based on bucket
    Candidates = set()
    for HashBuckets in HashBucketsList:
        for Bucket in HashBuckets.values():
            if len(Bucket) > 1:
                # Sorting makes (a, b) and (b, a) collapse to one set entry.
                Members = sorted(Bucket)
                for l in range(len(Members)):
                    for m in range(l+1, len(Members)):
                        Candidates.add((Members[l], Members[m]))

    # Check candidates for similarity
    AlmostSimilar = list()
    for cpa, cpb in Candidates:
        # XOR the packed bytes, then count set bits. ndarray.sum() accumulates
        # in a wide integer, so the count cannot wrap the way uint8 scalars can.
        value = int(n.unpackbits(n.bitwise_xor(Signatures[cpa], Signatures[cpb])).sum())
        similarityValue = (TotalBits-value) / TotalBits
        if similarityValue > threshold:
            AlmostSimilar.append((cpa, cpb, similarityValue))

    # Sort and return
    AlmostSimilar.sort(key=lambda x:x[2], reverse=False)
    return AlmostSimilar

def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 19, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")


# In[ ]:





In [8]:
import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Tuple
import imagehash
import numpy as n
from PIL import Image

In [9]:
def FindSignature(image:str,HashSize:int)-> n.ndarray:
    """Compute the flattened difference-hash (dHash) signature of one image file.

    Args:
        image: Path of the image file to hash.
        HashSize: Side length handed to ``imagehash.dhash``. The image is first
            reduced to a greyscale thumbnail of (HashSize+1) x HashSize pixels.

    Returns:
        The hash bit matrix flattened to one dimension.

    Raises:
        OSError: Whatever ``Image.open`` raises for an unreadable file propagates.
    """
    # Open the path given by the caller; the context manager releases the file
    # handle even when convert/resize fails.
    with Image.open(image) as SourceImage:
        # LANCZOS is the filter formerly aliased as ANTIALIAS (alias removed in Pillow 10).
        pilImage = SourceImage.convert("L").resize((HashSize+1,HashSize),Image.LANCZOS)
    dhash = imagehash.dhash(pilImage,HashSize)
    Signature = dhash.hash.flatten()
    pilImage.close()
    return Signature

In [10]:
def FindDuplicates(Directory:str,threshold:float,HashSize:int,bands:int) -> List[Tuple[str,str,float]]:
    """Find near-duplicate image pairs in a directory via banded LSH over dHash signatures.

    Every regular file directly inside ``Directory`` is hashed with
    ``FindSignature``. Each signature is cut into ``bands`` slices of equal
    width; files that agree on every bit of at least one slice become a
    candidate pair, and candidates are then compared over the full signature.

    Args:
        Directory: Folder to scan (not recursive).
        threshold: A pair is reported when its similarity is strictly greater than this.
        HashSize: dHash side length; signatures are expected to hold HashSize**2 bits.
        bands: Number of slices used for bucketing.

    Returns:
        ``(file_a, file_b, similarity)`` tuples sorted by ascending similarity,
        where similarity is the fraction of identical signature bits.

    Raises:
        ValueError: If ``bands`` is not between 1 and HashSize**2.
        OSError: If ``Directory`` cannot be listed.
    """
    TotalBits = HashSize**2
    if not 1 <= bands <= TotalBits:
        raise ValueError(f"bands must be between 1 and HashSize**2 ({TotalBits}), got {bands}")
    # Bits per band. Must be an integer because it is used as a slice bound;
    # bits left over after the last full band are simply not bucketed.
    rows:int = TotalBits // bands
    Signatures = dict()
    HashBucketsList: List[Dict[bytes, List[str]]] = [dict() for _ in range(bands)]

    #list of candidate files
    FileList = [join(Directory, fd) for fd in listdir(Directory) if isfile(join(Directory, fd))]
    #Check through all files
    for fl in FileList:
        try:
            Signature = FindSignature(fl, HashSize)
        except IOError:
            # Best effort: files that cannot be read as images are skipped.
            continue
        #Record Signatures (bit-packed to save memory)
        Signatures[fl] = n.packbits(Signature)

        # Locality Sensitive Hashing: one bucket table per band
        for i in range(bands):
            SBandBytes = Signature[i*rows:(i+1)*rows].tobytes()
            HashBucketsList[i].setdefault(SBandBytes, list()).append(fl)

    # Build candidates based on bucket
    Candidates = set()
    for HashBuckets in HashBucketsList:
        for Bucket in HashBuckets.values():
            if len(Bucket) > 1:
                # Sorting makes (a, b) and (b, a) collapse to one set entry.
                Members = sorted(Bucket)
                for l in range(len(Members)):
                    for m in range(l+1, len(Members)):
                        Candidates.add((Members[l], Members[m]))

    # Check candidates for similarity
    AlmostSimilar = list()
    for cpa, cpb in Candidates:
        # XOR the packed bytes, then count set bits. ndarray.sum() accumulates
        # in a wide integer, so the count cannot wrap the way uint8 scalars can.
        value = int(n.unpackbits(n.bitwise_xor(Signatures[cpa], Signatures[cpb])).sum())
        similarityValue = (TotalBits-value) / TotalBits
        if similarityValue > threshold:
            AlmostSimilar.append((cpa, cpb, similarityValue))

    # Sort and return
    AlmostSimilar.sort(key=lambda x:x[2], reverse=False)
    return AlmostSimilar

In [11]:

def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 19, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")

In [12]:

def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:location", 0.7, 19, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")

In [13]:
import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Tuple
import imagehash
import numpy as n
from PIL import Image

In [14]:
def FindSignature(image:str,HashSize:int)-> n.ndarray:
    """Compute the flattened difference-hash (dHash) signature of one image file.

    Args:
        image: Path of the image file to hash.
        HashSize: Side length handed to ``imagehash.dhash``. The image is first
            reduced to a greyscale thumbnail of (HashSize+1) x HashSize pixels.

    Returns:
        The hash bit matrix flattened to one dimension.

    Raises:
        OSError: Whatever ``Image.open`` raises for an unreadable file propagates.
    """
    # Open the path given by the caller; the context manager releases the file
    # handle even when convert/resize fails.
    with Image.open(image) as SourceImage:
        # LANCZOS is the filter formerly aliased as ANTIALIAS (alias removed in Pillow 10).
        pilImage = SourceImage.convert("L").resize((HashSize+1,HashSize),Image.LANCZOS)
    dhash = imagehash.dhash(pilImage,HashSize)
    Signature = dhash.hash.flatten()
    pilImage.close()
    return Signature

In [15]:
def FindDuplicates(Directory:str,threshold:float,HashSize:int,bands:int) -> List[Tuple[str,str,float]]:
    """Find near-duplicate image pairs in a directory via banded LSH over dHash signatures.

    Every regular file directly inside ``Directory`` is hashed with
    ``FindSignature``. Each signature is cut into ``bands`` slices of equal
    width; files that agree on every bit of at least one slice become a
    candidate pair, and candidates are then compared over the full signature.

    Args:
        Directory: Folder to scan (not recursive).
        threshold: A pair is reported when its similarity is strictly greater than this.
        HashSize: dHash side length; signatures are expected to hold HashSize**2 bits.
        bands: Number of slices used for bucketing.

    Returns:
        ``(file_a, file_b, similarity)`` tuples sorted by ascending similarity,
        where similarity is the fraction of identical signature bits.

    Raises:
        ValueError: If ``bands`` is not between 1 and HashSize**2.
        OSError: If ``Directory`` cannot be listed.
    """
    TotalBits = HashSize**2
    if not 1 <= bands <= TotalBits:
        raise ValueError(f"bands must be between 1 and HashSize**2 ({TotalBits}), got {bands}")
    # Bits per band. Must be an integer because it is used as a slice bound;
    # bits left over after the last full band are simply not bucketed.
    rows:int = TotalBits // bands
    Signatures = dict()
    HashBucketsList: List[Dict[bytes, List[str]]] = [dict() for _ in range(bands)]

    #list of candidate files
    FileList = [join(Directory, fd) for fd in listdir(Directory) if isfile(join(Directory, fd))]
    #Check through all files
    for fl in FileList:
        try:
            Signature = FindSignature(fl, HashSize)
        except IOError:
            # Best effort: files that cannot be read as images are skipped.
            continue
        #Record Signatures (bit-packed to save memory)
        Signatures[fl] = n.packbits(Signature)

        # Locality Sensitive Hashing: one bucket table per band
        for i in range(bands):
            SBandBytes = Signature[i*rows:(i+1)*rows].tobytes()
            HashBucketsList[i].setdefault(SBandBytes, list()).append(fl)

    # Build candidates based on bucket
    Candidates = set()
    for HashBuckets in HashBucketsList:
        for Bucket in HashBuckets.values():
            if len(Bucket) > 1:
                # Sorting makes (a, b) and (b, a) collapse to one set entry.
                Members = sorted(Bucket)
                for l in range(len(Members)):
                    for m in range(l+1, len(Members)):
                        Candidates.add((Members[l], Members[m]))

    # Check candidates for similarity
    AlmostSimilar = list()
    for cpa, cpb in Candidates:
        # XOR the packed bytes, then count set bits. ndarray.sum() accumulates
        # in a wide integer, so the count cannot wrap the way uint8 scalars can.
        value = int(n.unpackbits(n.bitwise_xor(Signatures[cpa], Signatures[cpb])).sum())
        similarityValue = (TotalBits-value) / TotalBits
        if similarityValue > threshold:
            AlmostSimilar.append((cpa, cpb, similarityValue))

    # Sort and return
    AlmostSimilar.sort(key=lambda x:x[2], reverse=False)
    return AlmostSimilar

In [1]:
def main():
    """Search one fixed directory for near-duplicate images and print a report.

    The directory, similarity threshold, hash size and band count are set
    below. An OSError raised anywhere in the body is reported as a failure to
    open the input directory.
    """
    # Run settings
    Directory, threshold, HashSize, bands = r"C:", 0.7, 19, 48

    try:
        AlmostSimilar = FindDuplicates(Directory, threshold, HashSize, bands)
        if not AlmostSimilar:
            print(f"No near-duplicates found in {Directory} (threshold {threshold:.2%})")
        else:
            print(f"Found {len(AlmostSimilar)} near-duplicate images in {Directory} (threshold {threshold:.2%})")
            # One output line per reported pair
            for a,b,s in AlmostSimilar:
                print(f"{s:.2%} similarity: file 1: {a} - file 2: {b}")
    except OSError:
        print(f"Couldn't open input directory {Directory}")