In [16]:
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.


In [11]:
import cv2 #For image processing
import numpy as np #For arrays and csv exporting
from pathlib import Path #for accessing all files in the directory
from tqdm import tqdm
import pandas as pd

In [28]:
def _color_histogram(image):
    """Return the 8x8x8 three-channel histogram of ``image``, min-max scaled to 0-255."""
    hist = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    # Scale the bin counts to a common range before histograms are compared.
    return cv2.normalize(hist, hist, 0, 255, cv2.NORM_MINMAX)


def compare_colors(path, key_image_name):
    """Compare the colour distribution of a key image to every other JPG in a directory.

    For each ``*.jpg`` file in ``path`` (in sorted order) the chi-square distance
    between its colour histogram and the key image's histogram is computed and
    rounded to two decimals. The result is written to
    ``<path>/<key image name without extension>_distance_data.csv`` with the
    columns ``filename,distance``. The key image itself is left out of the file.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory containing the images. A trailing separator is optional.
    key_image_name : str
        File name of the key image. It should be located in ``path``.

    Returns
    -------
    None
        The function only writes the CSV file.

    Raises
    ------
    FileNotFoundError
        If the key image is missing or cannot be decoded.
    """
    import warnings  # local import: only needed to report unreadable files

    directory = Path(path)
    key_path = directory / key_image_name  # works with or without a trailing separator

    # cv2.imread signals failure by returning None instead of raising.
    key_image = cv2.imread(str(key_path))
    if key_image is None:
        raise FileNotFoundError(f"Could not read key image: {key_path}")
    key_hist = _color_histogram(key_image)

    rows = []  # (filename, distance) pairs, distance already converted to text
    for file in tqdm(sorted(directory.glob("*.jpg"))):
        if file.name == key_image_name:
            continue  # the key image is never part of the output
        image = cv2.imread(str(file))
        if image is None:
            warnings.warn(f"Skipping unreadable image: {file}", stacklevel=2)
            continue
        calculated_distance = cv2.compareHist(
            key_hist, _color_histogram(image), cv2.HISTCMP_CHISQR
        )
        # Path.name strips the directory on every platform.
        rows.append((file.name, str(round(calculated_distance, 2))))

    # Explicit (n, 2) shape so a directory without other images still yields a
    # valid, header-only CSV.
    metadata = np.array(rows, dtype=str).reshape(len(rows), 2)

    # .stem removes only the final extension, so dots inside the name are kept.
    output_file = directory / f"{Path(key_image_name).stem}_distance_data.csv"

    np.savetxt(
        output_file,
        metadata,
        delimiter=",",
        header="filename,distance",
        # The array holds strings (file names next to numbers), so savetxt's
        # default numeric format cannot be applied; '%s' writes each cell as is.
        fmt="%s",
        comments="",  # no '#' prefix in front of the header row
    )

In [29]:
# Smoke test: compare the Matisse directory against its first image.
compare_colors(
    path="../data/raw/training/training/Matisse/",
    key_image_name="214162.jpg",
)


100%|██████████| 399/399 [01:35<00:00,  4.19it/s]


In [24]:
# Load the distance table that compare_colors wrote for key image 214162.
test = pd.read_csv(
    filepath_or_buffer="../data/raw/training/training/Matisse/214162_distance_data.csv"
)

In [25]:
# Show the 60 rows with the smallest distance to the key image.
test.sort_values(by="distance").head(n=60)

Unnamed: 0,filename,distance
169,214336.jpg,3999.71
186,290269.jpg,16236.06
393,9223372032559840825.jpg,19403.2
322,9223372032559821273.jpg,19442.69
183,290260.jpg,20720.77
190,311530.jpg,30811.07
305,9223372032559821216.jpg,31114.62
391,9223372032559840819.jpg,42310.99
188,290272.jpg,55130.39
10,213919.jpg,58165.74
