In [None]:
# Importing original and optimized FINd algorithms
from FINd import FINDHasher
from myFINd1 import FINDHasherr
from multiprocess import multiprocess

# importing imagehash library
import imagehash

# general import statements
from PIL import Image
import glob
import re
import numpy as np
import pandas as pd
import dataframe_image as dfi
from functools import reduce

In [None]:
# loading profiler
%load_ext line_profiler
%load_ext memory_profiler

# Creating subset of "meme generator" dataset

In [3]:
imgs = glob.glob('das_images/000*.jpg')
len(imgs)

3444

# Original FINd algorithm

In [4]:
# defining image hasher
findHasher = FINDHasher()

In [5]:
# Hashing subset of images
ex1 = findHasher.fromFile(imgs[0])
ex2 = findHasher.fromFile(imgs[1])
ex3 = findHasher.fromFile(imgs[2])

In [6]:
# Comparing image hashes
print(f"Difference between ex1 - ex2 = {ex1-ex2}")
print(f"Difference between ex1 - ex3 = {ex1-ex3}")
print(f"Difference between ex2 - ex3 = {ex2-ex3}")

Difference between ex1 - ex2 = 118
Difference between ex1 - ex3 = 32
Difference between ex2 - ex3 = 128


In [7]:
%lprun -f findHasher.fromFile -f findHasher.fromImage -f findHasher.fillFloatLumaFromBufferImage -f findHasher.findHash256FromFloatLuma -f findHasher.decimateFloat -f findHasher.boxFilter findHasher.fromFile(imgs[0])

Timer unit: 1e-06 s

Total time: 4.19272 s
File: /Users/mads/Desktop/SDS/DAS/Summative/FINd.py
Function: fromFile at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                           	def fromFile(self, filepath):
    40         1          6.0      6.0      0.0  		img = None
    41         1          1.0      1.0      0.0  		try:
    42         1        630.0    630.0      0.0  			img = Image.open(filepath)
    43                                           		except IOError as e:
    44                                           			raise e
    45         1    4192087.0 4192087.0    100.0  		return self.fromImage(img)

Total time: 4.19133 s
File: /Users/mads/Desktop/SDS/DAS/Summative/FINd.py
Function: fromImage at line 47

Line #      Hits         Time  Per Hit   % Time  Line Contents
    47                                           	def fromImage(self,img):
    48         1          1.0      1.0      0.0  		try:
    49         1      

In [8]:
%memit [findHasher.fromFile(i) for i in imgs[:100]]

peak memory: 127.33 MiB, increment: 4.71 MiB


In [9]:
%timeit findHasher.fromFile(imgs[0])

357 ms ± 9.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit [findHasher.fromFile(i) for i in imgs[:100]]

36.1 s ± 349 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Single-process optimization

In [11]:
# defining image hasher
findHasher_new = FINDHasherr()

In [12]:
# Hashing subset of images
ex1_new = findHasher_new.fromFile(imgs[0])
ex2_new = findHasher_new.fromFile(imgs[1])
ex3_new = findHasher_new.fromFile(imgs[2])

In [13]:
# Comparing image hashes
print(f"Difference between ex1 - ex2 = {ex1_new-ex2_new}")
assert ex1_new-ex2_new == ex1-ex2, "Difference in image hashes example 1 and 2 do not match with original output"

print(f"Difference between ex1 - ex3 = {ex1_new-ex3_new}")
assert ex1_new-ex3_new == ex1-ex3, "Difference in image hashes for example 1 and 3 do not match with original output"

print(f"Difference between ex2 - ex3 = {ex2_new-ex3_new}")
assert ex2_new-ex3_new == ex2-ex3, "Difference in image hashes for example 2 and 3 do not match with original output"

assert ex1_new == ex1, "Immage hashes for image example 1 do not match"
assert ex2_new == ex2, "Immage hashes for image example 2 do not match"
assert ex3_new == ex3, "Immage hashes for image example 3 do not match"

Difference between ex1 - ex2 = 118
Difference between ex1 - ex3 = 32
Difference between ex2 - ex3 = 128


In [14]:
%lprun -f findHasher_new.fromFile -f findHasher_new.fromImage -f findHasher_new.fillFloatLumaFromBufferImage -f findHasher_new.findHash256FromFloatLuma -f findHasher_new.boxFilter findHasher_new.fromFile(imgs[0])

Timer unit: 1e-06 s

Total time: 0.338709 s
File: /Users/mads/Desktop/SDS/DAS/Summative/myFINd1.py
Function: fromFile at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                           	def fromFile(self, filepath):
    40         1          1.0      1.0      0.0  		img = None
    41         1          1.0      1.0      0.0  		try:
    42         1        458.0    458.0      0.1  			img = Image.open(filepath)
    43                                           		except IOError as e:
    44                                           			raise e
    45         1     338249.0 338249.0     99.9  		return self.fromImage(img)

Total time: 0.33774 s
File: /Users/mads/Desktop/SDS/DAS/Summative/myFINd1.py
Function: fromImage at line 47

Line #      Hits         Time  Per Hit   % Time  Line Contents
    47                                           	def fromImage(self,img):
    48         1          0.0      0.0      0.0  		try:
    49          

In [15]:
%memit [findHasher_new.fromFile(i) for i in imgs[:100]]

peak memory: 142.47 MiB, increment: 6.79 MiB


In [16]:
%timeit findHasher_new.fromFile(imgs[0])

34.3 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
%timeit [findHasher_new.fromFile(i) for i in imgs[:10]]

343 ms ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%timeit [findHasher_new.fromFile(i) for i in imgs[:100]]

3.2 s ± 116 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%timeit [findHasher_new.fromFile(i) for i in imgs]

1min 47s ± 3.43 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Multi-process optimization

In [20]:
# defining image hasher
nh = multiprocess(filepath=imgs[:100])

In [21]:
# Hashing subset of images
ex1_multi = nh[imgs[0]]
ex2_multi = nh[imgs[1]]
ex3_multi = nh[imgs[2]]

In [22]:
# Comparing image hashes
print(f"Difference between ex1 - ex2 = {ex1_multi-ex2_multi}")
assert ex1_multi-ex2_multi == ex1-ex2, "Difference in image hashes example 1 and 2 do not match with original output"

print(f"Difference between ex1 - ex3 = {ex1_multi-ex3_multi}")
assert ex1_multi-ex3_multi == ex1-ex3, "Difference in image hashes for example 1 and 3 do not match with original output"

print(f"Difference between ex2 - ex3 = {ex2_multi-ex3_multi}")
assert ex2_multi-ex3_multi == ex2-ex3, "Difference in image hashes for example 2 and 3 do not match with original output"

assert ex1_multi == ex1, "Immage hashes for image example 1 do not match"
assert ex2_multi == ex2, "Immage hashes for image example 2 do not match"
assert ex3_multi == ex3, "Immage hashes for image example 3 do not match"

Difference between ex1 - ex2 = 118
Difference between ex1 - ex3 = 32
Difference between ex2 - ex3 = 128


In [23]:
%memit multiprocess(imgs[:100])

peak memory: 188.55 MiB, increment: 0.07 MiB


In [24]:
%timeit multiprocess(imgs[:10])

572 ms ± 84.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%timeit multiprocess(imgs[:100])

1.53 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
%timeit multiprocess(imgs)

34.2 s ± 813 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Imagehash library

In [27]:
# profiling average hashing
%timeit [imagehash.average_hash(Image.open(i)) for i in imgs]
%memit [imagehash.average_hash(Image.open(i)) for i in imgs]

3.45 s ± 44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 195.34 MiB, increment: 0.00 MiB


In [28]:
# profiling phashing
%timeit [imagehash.phash(Image.open(i)) for i in imgs]
%memit [imagehash.phash(Image.open(i)) for i in imgs]

3.87 s ± 120 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 195.75 MiB, increment: 0.00 MiB


# Algorithm classification

In [29]:
# creating image hashing result dictionaries for each algorithm

# for multi-process optimized FINd algorithm
nh = multiprocess(imgs)

# for average hashing form imagehash library
ahash = {}
for i in imgs:
    ahash[i] = imagehash.average_hash(Image.open(i))
    
# for phash form imagehash library
phash = {}
for i in imgs:
    phash[i] = imagehash.phash(Image.open(i))

In [50]:
# functions for running accuracy test

'''This section creates a representative image for each image family and
attaches the respective image hash created by the respective image hashing
algorithm. Additionally, it creates a dictionary of the true number of
images in each image family. It uses the representatie image to cateogrize
every other image into certain groups by substracting the hashes from the
hash of the representative image. The results of this operations are added
to a dictionary again. This dictionary contains all predicted images per
group in addition to their filename, which contains the "true" classification.
From this information, the image predictions will be classified as per
their "true" classifications and accuracy values calculated. The results
are put into a dataframe for better overview. The overall section takes
two arguments, the algorithm used (nh = multi-process optimization of
FINd algorithm; ahash = average_hash from imagehash library; phash = 
phash from image library) and the threshold set (int).'''

def rep_img(algo):
    '''creating a dictionary of representative image filenames
    and image hashes'''
    rep_dic = {}
    img_list = []
    
    for i in range(10):
        img_list.append(glob.glob(f'das_images/000{i}*.jpg'))
        rep_dic['rep{0}'.format(i)] = list([img_list[i][0]])

    for i in rep_dic.keys():
        rep_dic[f'{i}'].append(algo[rep_dic[f'{i}'][0]])
        
    rep_list = list(rep_dic.keys())
    
    return rep_dic, rep_list


def img_overview(rep_dic,rep_list):
    '''creating a dictionary with the true number of images
    per image family'''
    group_list = []

    for i in imgs:
        num = re.compile('(\d)_')
        group_list.append(num.findall(i))

    x = np.array(group_list)
    x = x.flatten().tolist()
    x = [int(i) for i in x] 

    group_temp_dic = dict((val,x.count(val)) for val in set(range(10)))
    group_dic = dict((rep_list[i],group_temp_dic[i]) for i in set(range(10)))
    
    return group_dic


def img_pred(rep_dic,algo,threshold):
    '''creating a dictionary with the predicted image names per group
    with certain threshold per image hashing algorithm'''
    result_dict = {}
    
    for i in rep_dic.items():
        result_dict["{0}".format(i[0])] = []
        
        for j in algo.items():
            if j[1]-i[1][1] < threshold:
                result_dict[i[0]].append(j[0])
                
    return result_dict  
            

def img_values(result_dict,group_dic,rep_list):
    '''Creating dictionary with prediction values per image group'''
    performance_dict = {}
    for i in result_dict.items():
        temp_list = []

        for j in i[1]:
            num = re.compile('(\d)_')
            temp_list.append(num.findall(j))

        temp_arr = np.array(temp_list)
        temp_list_int = temp_arr.flatten().tolist()
        temp_list_int = [int(i) for i in temp_list_int]

        temp_dic = dict((val,temp_list_int.count(val)) for val in set(range(10)))
        temp_dic = dict((rep_list[i],temp_dic[i]) for i in set(range(10)))

        temp_predicted = 0 # calculating the total number of predicted images for each group
        for k in rep_list:
            temp_predicted += temp_dic[k]

        temp_total = group_dic[f"{i[0]}"]

        temp_correct_predicted = temp_dic[f"{i[0]}"]
        temp_incorrect_predicted = temp_predicted - temp_correct_predicted
        temp_incorrect_not_predicted = temp_total - temp_correct_predicted
        temp_correct_not_predicted = len(imgs) - temp_total - temp_incorrect_predicted

        performance_dict[f"{i[0]}"] = []
        performance_dict[f"{i[0]}"].append(temp_total)
        performance_dict[f"{i[0]}"].append(temp_correct_predicted)
        performance_dict[f"{i[0]}"].append(temp_incorrect_predicted)
        performance_dict[f"{i[0]}"].append(temp_correct_not_predicted)
        performance_dict[f"{i[0]}"].append(temp_incorrect_not_predicted)
        
    return performance_dict
    
    
def df_create(performance_dict):
    '''generating dataframe from prediction results'''
    performance_df = pd.DataFrame(performance_dict).T
    performance_df.columns = ['Total',
                              'Correct Predicted',
                              'Incorrect Predicted',
                              'Correct NOT Predicted',
                              'Incorrect NOT Predicted']
    
    performance_df['Accuracy'] = (performance_df['Correct Predicted'] 
                                  + performance_df['Correct NOT Predicted'])/(performance_df['Correct Predicted'] 
                                                                              + performance_df['Correct NOT Predicted'] 
                                                                              + performance_df['Incorrect Predicted'] 
                                                                              + performance_df['Incorrect NOT Predicted'])
    
    performance_df['True Positive Rate'] = performance_df['Correct Predicted']/performance_df['Total']
    
    performance_df['True Negative Rate'] = performance_df['Correct NOT Predicted']/(performance_df['Incorrect Predicted'] 
                                                                                    + performance_df['Correct NOT Predicted'])
       
    performance_df['Precision'] = performance_df['Correct Predicted']/(performance_df['Correct Predicted']
                                                                       + performance_df['Incorrect Predicted'])
    
    performance_df['Prevalence'] = performance_df['Total']/(performance_df['Correct Predicted'] 
                                                            + performance_df['Correct NOT Predicted'] 
                                                            + performance_df['Incorrect Predicted'] 
                                                            + performance_df['Incorrect NOT Predicted'])
    
    performance_df = performance_df.round(3)
    performance_list = performance_df.columns.tolist()
    performance_list = performance_list[5:]
    
    return performance_df, performance_list


def accuracy_val(performance_df,performance_list):
    '''creating arrays for each performance metric 
    (plus prevalence for weighting)'''
    prevalence_arr = np.array(performance_df.loc[:,performance_list[-1]].tolist())
    acc_arr = np.array(performance_df.loc[:,performance_list[0]].tolist())
    TPR_arr = np.array(performance_df.loc[:,performance_list[1]].tolist())
    TNR_arr = np.array(performance_df.loc[:,performance_list[2]].tolist())
    pre_arr = np.array(performance_df.loc[:,performance_list[3]].tolist())

    accuracy = sum(acc_arr*prevalence_arr)
    TPR = sum(TPR_arr*prevalence_arr)
    TNR = sum(TNR_arr*prevalence_arr)
    precision = sum(pre_arr*prevalence_arr)

    overview_df = pd.DataFrame([accuracy,TPR,TNR,precision],
                               index=(performance_list[:-1]),columns=(['']))
    
    return overview_df

def main(algo,threshold):
    '''main statement to run the overvall accuracy test'''
    rep_dic,rep_list = rep_img(algo)
    group_dic = img_overview(rep_dic,rep_list)
    result_dict = img_pred(rep_dic,algo,threshold)
    performance_dict = img_values(result_dict,group_dic,rep_list)
    performance_df,performance_list = df_create(performance_dict)
    overview_df = accuracy_val(performance_df,performance_list)
    return overview_df,performance_df

In [51]:
# running accuracy test for multi-process optimization of FINd algorithm
algo = nh
threshold = 90

nh_overview_df,nh_performance_df = main(algo,threshold)

display(nh_performance_df)
display(nh_overview_df)

Unnamed: 0,Total,Correct Predicted,Incorrect Predicted,Correct NOT Predicted,Incorrect NOT Predicted,Accuracy,True Positive Rate,True Negative Rate,Precision,Prevalence
rep0,27,27,0,3417,0,1.0,1.0,1.0,1.0,0.008
rep1,48,48,0,3396,0,1.0,1.0,1.0,1.0,0.014
rep2,663,628,0,2781,35,0.99,0.947,1.0,1.0,0.193
rep3,136,136,0,3308,0,1.0,1.0,1.0,1.0,0.039
rep4,612,587,0,2832,25,0.993,0.959,1.0,1.0,0.178
rep5,365,364,0,3079,1,1.0,0.997,1.0,1.0,0.106
rep6,531,521,0,2913,10,0.997,0.981,1.0,1.0,0.154
rep7,469,455,0,2975,14,0.996,0.97,1.0,1.0,0.136
rep8,178,178,0,3266,0,1.0,1.0,1.0,1.0,0.052
rep9,415,409,2,3027,6,0.998,0.986,0.999,0.995,0.12


Unnamed: 0,Unnamed: 1
Accuracy,0.995578
True Positive Rate,0.973469
True Negative Rate,0.99988
Precision,0.9994


In [52]:
# running accuracy test for multi-process optimization of FINd algorithm
algo = ahash
threshold = 13

a_overview_df,a_performance_df = main(algo,threshold)

display(a_performance_df)
display(a_overview_df)

Unnamed: 0,Total,Correct Predicted,Incorrect Predicted,Correct NOT Predicted,Incorrect NOT Predicted,Accuracy,True Positive Rate,True Negative Rate,Precision,Prevalence
rep0,27,27,0,3417,0,1.0,1.0,1.0,1.0,0.008
rep1,48,48,0,3396,0,1.0,1.0,1.0,1.0,0.014
rep2,663,626,122,2659,37,0.954,0.944,0.956,0.837,0.193
rep3,136,135,1,3307,1,0.999,0.993,1.0,0.993,0.039
rep4,612,595,48,2784,17,0.981,0.972,0.983,0.925,0.178
rep5,365,365,0,3079,0,1.0,1.0,1.0,1.0,0.106
rep6,531,519,5,2908,12,0.995,0.977,0.998,0.99,0.154
rep7,469,364,0,2975,105,0.97,0.776,1.0,1.0,0.136
rep8,178,172,13,3253,6,0.994,0.966,0.996,0.93,0.052
rep9,415,351,1,3028,64,0.981,0.846,1.0,0.997,0.12


Unnamed: 0,Unnamed: 1
Accuracy,0.980259
True Positive Rate,0.929681
True Negative Rate,0.987966
Precision,0.949378


In [53]:
# running accuracy test for multi-process optimization of FINd algorithm
algo = phash
threshold = 20

p_overview_df,p_performance_df = main(algo,threshold)

display(p_performance_df)
display(p_overview_df)

Unnamed: 0,Total,Correct Predicted,Incorrect Predicted,Correct NOT Predicted,Incorrect NOT Predicted,Accuracy,True Positive Rate,True Negative Rate,Precision,Prevalence
rep0,27,27,0,3417,0,1.0,1.0,1.0,1.0,0.008
rep1,48,48,0,3396,0,1.0,1.0,1.0,1.0,0.014
rep2,663,637,4,2777,26,0.991,0.961,0.999,0.994,0.193
rep3,136,136,2,3306,0,0.999,1.0,0.999,0.986,0.039
rep4,612,527,10,2822,85,0.972,0.861,0.996,0.981,0.178
rep5,365,363,0,3079,2,0.999,0.995,1.0,1.0,0.106
rep6,531,520,0,2913,11,0.997,0.979,1.0,1.0,0.154
rep7,469,422,3,2972,47,0.985,0.9,0.999,0.993,0.136
rep8,178,177,0,3266,1,1.0,0.994,1.0,1.0,0.052
rep9,415,410,51,2978,5,0.984,0.988,0.983,0.889,0.12


Unnamed: 0,Unnamed: 1
Accuracy,0.988712
True Positive Rate,0.948615
True Negative Rate,0.99688
Precision,0.980642


In [54]:
# merging *_overview_df of all algorithms
overview_dfs = [nh_overview_df,a_overview_df,p_overview_df]

overview_df = reduce(lambda left,right: pd.merge(left,right,right_index=True,left_index=True), overview_dfs)

overview_df = overview_df.rename(columns={'_x':'multi-process','_y':'ahash','':'phash'})
overview_df = overview_df.round(4)

overview_df

# print(overview_df.to_latex())

Unnamed: 0,multi-process,ahash,phash
Accuracy,0.9956,0.9803,0.9887
True Positive Rate,0.9735,0.9297,0.9486
True Negative Rate,0.9999,0.988,0.9969
Precision,0.9994,0.9494,0.9806
