In [None]:
import os
from skimage import metrics
import cv2
import numpy as np

from skimage.metrics import structural_similarity as ssim
import matplotlib.pyplot as plt

## Mean Squared Error

In [None]:
def mse(imageA, imageB):
    """Return the Mean Squared Error between two images.

    The error is the average of the squared per-element differences, taken
    over every element of the arrays (all pixels and, for colour images, all
    channels). Lower values mean the images are more similar; 0 means they
    are identical.

    Parameters:
        imageA, imageB: numpy arrays of identical shape (2-D grayscale or
            3-D colour).

    Returns:
        The mean squared difference as a numpy float.

    Raises:
        ValueError: if the two arrays do not have exactly the same shape.
    """
    # Reject mismatched shapes explicitly: numpy would otherwise either fail
    # with a broadcasting error or, worse, silently broadcast (e.g. an
    # (H, W, 3) image against an (H, W, 1) one) and return a meaningless value.
    if imageA.shape != imageB.shape:
        raise ValueError(
            "images must have the same shape, got {} and {}".format(
                imageA.shape, imageB.shape
            )
        )

    # Work in float so uint8 subtraction cannot wrap around.
    diff = imageA.astype("float") - imageB.astype("float")

    # Average over every element; dividing by height * width alone would
    # inflate the result by the channel count for colour images.
    return np.mean(diff ** 2)

In [None]:
# cv2.imread returns None (it does not raise) when a file is missing or
# cannot be decoded, so check here and fail with a clear message instead of
# an obscure AttributeError in a later cell.
image1 = cv2.imread('datasets/popup_Br_308.jpg')
if image1 is None:
    raise FileNotFoundError("Could not read image: datasets/popup_Br_308.jpg")

image2 = cv2.imread('datasets/medium_Br_308.jpg')
if image2 is None:
    raise FileNotFoundError("Could not read image: datasets/medium_Br_308.jpg")

In [None]:
# Bring image2 to image1's size. cv2.resize wants (width, height), which is
# the first two entries of the numpy shape in reverse order.
image2 = cv2.resize(
    image2,
    image1.shape[1::-1],
    interpolation=cv2.INTER_AREA,
)

In [None]:
# Error between the popup image and the resized medium image (0 would mean identical pixels)
mse(image1, image2)

## Histogram similarity

In [None]:
# cv2.imread returns None (it does not raise) on a missing or unreadable
# file, so fail here with a clear message.
image1 = cv2.imread('datasets/popup_Br_308.jpg')
if image1 is None:
    raise FileNotFoundError("Could not read image: datasets/popup_Br_308.jpg")

In [None]:
# Joint 3-D histogram over the three colour channels, 256 bins per channel,
# each covering the value range [0, 256).
hist_img1 = cv2.calcHist(
    [image1],
    [0, 1, 2],
    None,
    [256] * 3,
    [0, 256] * 3,
)
# Zero the bin of pure white (255, 255, 255) so white pixels are ignored.
hist_img1[255, 255, 255] = 0

In [None]:
# Min-max scale the histogram to [0, 1] in place (source and destination are the same array)
cv2.normalize(hist_img1, hist_img1, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)

In [None]:
# Re-read image2 from disk (the MSE section overwrote it with a resized copy).
# cv2.imread returns None (it does not raise) on a missing or unreadable
# file, so fail here with a clear message.
image2 = cv2.imread('datasets/medium_Br_308.jpg')
if image2 is None:
    raise FileNotFoundError("Could not read image: datasets/medium_Br_308.jpg")

In [None]:
# Joint 3-D histogram over the three colour channels, 256 bins per channel,
# each covering the value range [0, 256).
hist_img2 = cv2.calcHist(
    [image2],
    [0, 1, 2],
    None,
    [256] * 3,
    [0, 256] * 3,
)
# Zero the bin of pure white (255, 255, 255) so white pixels are ignored.
hist_img2[255, 255, 255] = 0
# Min-max scale to [0, 1] in place (source and destination are the same array).
cv2.normalize(hist_img2, hist_img2, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)

In [None]:
# Correlation between the two normalised histograms.
metric_val = cv2.compareHist(
    hist_img1,
    hist_img2,
    cv2.HISTCMP_CORREL,
)
print("Similarity Score: ", round(metric_val, 2))

## Structural Similarity Index (SSIM)

In [None]:
# cv2.imread returns None (it does not raise) on a missing or unreadable
# file, so fail here with a clear message.
image1 = cv2.imread('datasets/popup_Br_308.jpg')
if image1 is None:
    raise FileNotFoundError("Could not read image: datasets/popup_Br_308.jpg")

In [None]:
# Show the two images side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))

# OpenCV loads images in BGR channel order while matplotlib expects RGB, so
# convert before display; otherwise red and blue are swapped. No cmap is
# passed because matplotlib ignores it for 3-channel data.
ax1.imshow(cv2.cvtColor(image1, cv2.COLOR_BGR2RGB))
ax1.set_title('popup_Br_308')
ax2.imshow(cv2.cvtColor(image2, cv2.COLOR_BGR2RGB))
ax2.set_title('medium_Br_308');

In [None]:
# cv2.imread returns None (it does not raise) on a missing or unreadable
# file, so fail here with a clear message.
image2 = cv2.imread('datasets/medium_Br_308.jpg')
if image2 is None:
    raise FileNotFoundError("Could not read image: datasets/medium_Br_308.jpg")

In [None]:
# Bring image2 to image1's size. cv2.resize wants (width, height), which is
# the first two entries of the numpy shape in reverse order.
image2 = cv2.resize(
    image2,
    image1.shape[1::-1],
    interpolation=cv2.INTER_AREA,
)

In [None]:
# Sanity check: after the resize above, both shapes are expected to match
print(image1.shape, image2.shape)

In [None]:
# image1 and image2 are 3-channel colour arrays. Without channel_axis,
# structural_similarity treats the last axis (length 3) as a third spatial
# dimension, which is why only win_size=3 appeared to work and why the score
# was not a per-channel image SSIM. Declaring the channel axis lets the
# default window be used and averages the SSIM over the channels.
ssim(image1, image2, channel_axis=-1)

In [None]:
# cv2.imread returns None (it does not raise) on a missing or unreadable
# file, so fail here with a clear message.
image3 = cv2.imread('datasets/kpss_mohov.jpg')
if image3 is None:
    raise FileNotFoundError("Could not read image: datasets/kpss_mohov.jpg")

In [None]:
# Bring image3 to image1's size. cv2.resize wants (width, height), which is
# the first two entries of the numpy shape in reverse order.
image3 = cv2.resize(
    image3,
    image1.shape[1::-1],
    interpolation=cv2.INTER_AREA,
)

In [None]:
# Sanity check: after the resize above, both shapes are expected to match
print(image1.shape, image3.shape)

In [None]:
# Single-channel (grayscale) versions of the three BGR images for SSIM.
image1_gray, image2_gray, image3_gray = (
    cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    for bgr in (image1, image2, image3)
)

In [None]:
# With full=True the call returns a tuple whose first item is the mean SSIM
# score; only that scalar is reported here.
ssim_score = metrics.structural_similarity(
    image1_gray,
    image2_gray,
    full=True,
)
print("SSIM Score: ", round(ssim_score[0], 2))

In [None]:
# With full=True the call returns a tuple whose first item is the mean SSIM
# score; only that scalar is reported here.
ssim_score = metrics.structural_similarity(
    image1_gray,
    image3_gray,
    full=True,
)
print("SSIM Score: ", round(ssim_score[0], 2))

## Dense Vector Representations

https://stackoverflow.com/questions/11541154/checking-images-for-similarity-with-opencv

In [1]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the OpenAI CLIP model (the 'clip-ViT-B-32' checkpoint) through sentence-transformers.
# `model` is used below to turn each image into an embedding vector.
print('Loading CLIP Model...')
model = SentenceTransformer('clip-ViT-B-32')

Loading CLIP Model...


In [3]:
# Next we compute the embeddings.
# Sort the paths so that the indices reported by the similarity search map to
# the same files on every platform (glob itself returns files in arbitrary,
# OS-dependent order).
image_names = sorted(glob.glob('./datasets/*.jpg'))
print("Images:", len(image_names))

# Image.open is lazy and keeps the file handle open. Load each image inside a
# context manager and keep an in-memory copy so no file handle is leaked.
images = []
for filepath in image_names:
    with Image.open(filepath) as img:
        images.append(img.copy())

encoded_image = model.encode(
    images,
    batch_size=128,
    convert_to_tensor=True,
    show_progress_bar=True,
)

Images: 5


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.56it/s]


In [4]:
# Maximum number of pairs printed by the reporting cells below.
NUM_SIMILAR_IMAGES = 10

# Compare every image embedding against all the others. The result is a list
# of the pairs with the highest cosine similarity scores.
processed_images = util.paraphrase_mining_embeddings(encoded_image)

In [5]:
# =================
# DUPLICATES
# =================
print("Finding duplicate images...")
# Each result is a triplet (score, image_id1, image_id2), sorted by
# decreasing score. An exact duplicate scores 1.00, but lossy .jpg
# compression can pull that down to about 0.9999, hence the 0.999 cut-off.
duplicates = [pair for pair in processed_images if pair[0] >= 0.999]

Finding duplicate images...


In [6]:
# Report at most NUM_SIMILAR_IMAGES duplicate pairs: score as a percentage,
# followed by the two file paths on separate lines.
for score, image_id1, image_id2 in duplicates[:NUM_SIMILAR_IMAGES]:
    print(f"\nScore: {score * 100:.3f}%")
    print(image_names[image_id1], image_names[image_id2], sep="\n")

In [8]:
# =================
# NEAR DUPLICATES
# =================
print("Finding near duplicate images...")
# Keep every pair whose cosine similarity is strictly below `threshold`.
# This only drops the (almost) identical pairs at the top of the list; it does
# NOT impose a lower bound, so weakly similar pairs are kept as well. Because
# the mined pairs are sorted by decreasing score, slicing the first N entries
# of the result gives the N most similar non-identical pairs.
threshold = 0.99
near_duplicates = [pair for pair in processed_images if pair[0] < threshold]

Finding near duplicate images...


In [9]:
# Display the pairs kept by the filter above as [score, image_id1, image_id2]
near_duplicates

[[0.9213765859603882, 1, 4],
 [0.8219942450523376, 0, 3],
 [0.7337963581085205, 1, 2],
 [0.6786683797836304, 0, 4],
 [0.6598712801933289, 2, 4],
 [0.6223414540290833, 2, 3],
 [0.615389883518219, 0, 1],
 [0.6018456816673279, 1, 3],
 [0.577682375907898, 3, 4],
 [0.5667539238929749, 0, 2]]

In [10]:
# Report at most NUM_SIMILAR_IMAGES near-duplicate pairs: score as a
# percentage, followed by the two file paths on separate lines.
for score, image_id1, image_id2 in near_duplicates[:NUM_SIMILAR_IMAGES]:
    print(f"\nScore: {score * 100:.3f}%")
    print(image_names[image_id1], image_names[image_id2], sep="\n")


Score: 92.138%
./datasets\medium_Br_308.jpg
./datasets\popup_Br_308.jpg

Score: 82.199%
./datasets\kpss_mohov.jpg
./datasets\medium_kpss_mohov.jpg

Score: 73.380%
./datasets\medium_Br_308.jpg
./datasets\medium_earth_palms.jpg

Score: 67.867%
./datasets\kpss_mohov.jpg
./datasets\popup_Br_308.jpg

Score: 65.987%
./datasets\medium_earth_palms.jpg
./datasets\popup_Br_308.jpg

Score: 62.234%
./datasets\medium_earth_palms.jpg
./datasets\medium_kpss_mohov.jpg

Score: 61.539%
./datasets\kpss_mohov.jpg
./datasets\medium_Br_308.jpg

Score: 60.185%
./datasets\medium_Br_308.jpg
./datasets\medium_kpss_mohov.jpg

Score: 57.768%
./datasets\medium_kpss_mohov.jpg
./datasets\popup_Br_308.jpg

Score: 56.675%
./datasets\kpss_mohov.jpg
./datasets\medium_earth_palms.jpg
