In [None]:
import os
from skimage import metrics
import cv2
import numpy as np

from skimage.metrics import structural_similarity as ssim
import matplotlib.pyplot as plt

## Mean Squared Error

In [None]:
def mse(imageA, imageB):
	# the 'Mean Squared Error' between the two images is the
	# sum of the squared difference between the two images;
	# NOTE: the two images must have the same dimension
	err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
	err /= float(imageA.shape[0] * imageA.shape[1])
	
	# return the MSE, the lower the error, the more "similar"
	# the two images are
	return err

In [None]:
image1 = cv2.imread('datasets/popup_Br_308.jpg')
image2 = cv2.imread('datasets/medium_Br_308.jpg')

In [None]:
image2 = cv2.resize(image2, (image1.shape[1], image1.shape[0]), interpolation = cv2.INTER_AREA)

In [None]:
mse(image1, image2)

# Histogram similarity

In [None]:
image1 = cv2.imread('datasets/popup_Br_308.jpg')

In [None]:
hist_img1 = cv2.calcHist([image1], [0, 1, 2], None, [256, 256, 256], [0, 256, 0, 256, 0, 256])
hist_img1[255, 255, 255] = 0 #ignore all white pixels

In [None]:
cv2.normalize(hist_img1, hist_img1, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)

In [None]:
image2 = cv2.imread('datasets/medium_Br_308.jpg')

In [None]:
hist_img2 = cv2.calcHist([image2], [0, 1, 2], None, [256, 256, 256], [0, 256, 0, 256, 0, 256])
hist_img2[255, 255, 255] = 0  #ignore all white pixels
cv2.normalize(hist_img2, hist_img2, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)

In [None]:
metric_val = cv2.compareHist(hist_img1, hist_img2, cv2.HISTCMP_CORREL)
print(f"Similarity Score: ", round(metric_val, 2))

## Structural Similarity Index (SSIM)

In [None]:
image1 = cv2.imread('datasets/popup_Br_308.jpg')

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,10))

ax1.imshow(image1, cmap = plt.cm.gray)
ax2.imshow(image2, cmap = plt.cm.gray)

In [None]:
image2 = cv2.imread('datasets/medium_Br_308.jpg')

In [None]:
image2 = cv2.resize(image2, (image1.shape[1], image1.shape[0]), interpolation = cv2.INTER_AREA)

In [None]:
print(image1.shape, image2.shape)

In [None]:
ssim(image1, image2, win_size=3)

In [None]:
image3 = cv2.imread('datasets/kpss_mohov.jpg')

In [None]:
image3 = cv2.resize(image3, (image1.shape[1], image1.shape[0]), interpolation = cv2.INTER_AREA)

In [None]:
print(image1.shape, image3.shape)

In [None]:
image1_gray = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
image2_gray = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
image3_gray = cv2.cvtColor(image3, cv2.COLOR_BGR2GRAY)

In [None]:
ssim_score = metrics.structural_similarity(image1_gray, image2_gray, full=True)
print(f"SSIM Score: ", round(ssim_score[0], 2))

In [None]:
ssim_score = metrics.structural_similarity(image1_gray, image3_gray, full=True)
print(f"SSIM Score: ", round(ssim_score[0], 2))

## Dense Vector Representations

https://stackoverflow.com/questions/11541154/checking-images-for-similarity-with-opencv

In [1]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the OpenAI CLIP Model
print('Loading CLIP Model...')
model = SentenceTransformer('clip-ViT-B-32')

Loading CLIP Model...


In [3]:
image_names = list(glob.glob('C:/Users/jales/Downloads/pics/*'))
image_names

['C:/Users/jales/Downloads/pics\\%281%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2810%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2813%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2817%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2818%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2819%29.jpg',
 'C:/Users/jales/Downloads/pics\\%282%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2820%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2821%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2823%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2824%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2826%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2827%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2828%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2829%29.jpg',
 'C:/Users/jales/Downloads/pics\\%283%29.jpg',
 'C:/Users/jales/Downloads/pics\\%2831%29.jpg',
 'C:/Users/jales/Downloads/pics\\%285%29.jpg',
 'C:/Users/jales/Downloads/pics\\%287%29.jpg',
 'C:/Users/jales/Downloads/pics\\%288%29.jpg',
 'C:/Users/jales/Downloads/pics\\%289%29.jpg',

In [4]:
img = Image.open('C:/Users/jales/Downloads/pics\\%281%29.jpg')

In [5]:
img._size[0] * img._size[1] > 173426400

False

In [6]:
# Next we compute the embeddings
# To encode an image, you can use the following code:
# from PIL import Image
# encoded_image = model.encode(Image.open(filepath))
image_names = list(glob.glob('C:/Users/jales/Downloads/pics/*.jpg'))
print("Images:", len(image_names))
encoded_image = model.encode([Image.open(filepath) for filepath in image_names], batch_size=128, convert_to_tensor=True, show_progress_bar=True)

Images: 984


Batches: 100%|██████████| 8/8 [02:38<00:00, 19.87s/it]


In [7]:
# Now we run the clustering algorithm. This function compares images aganist 
# all other images and returns a list with the pairs that have the highest 
# cosine similarity score
processed_images = util.paraphrase_mining_embeddings(encoded_image)
NUM_SIMILAR_IMAGES = 10

In [8]:
# =================
# NEAR DUPLICATES
# =================
print('Finding near duplicate images...')
# Use a threshold parameter to identify two images as similar. By setting the threshold lower, 
# you will get larger clusters which have less similar images in it. Threshold 0 - 1.00
# A threshold of 1.00 means the two images are exactly the same. Since we are finding near 
# duplicate images, we can set it at 0.99 or any number 0 < X < 1.00.
threshold = 0.99
near_duplicates = [image for image in processed_images if image[0] < threshold]

Finding near duplicate images...


In [9]:
near_duplicates

[[0.9891569018363953, 44, 45],
 [0.9864649772644043, 37, 976],
 [0.9848965406417847, 45, 804],
 [0.9792672395706177, 3, 895],
 [0.9783816933631897, 52, 763],
 [0.967072606086731, 182, 644],
 [0.9669891595840454, 249, 777],
 [0.9668173789978027, 41, 780],
 [0.9661064743995667, 55, 735],
 [0.9605892896652222, 268, 949],
 [0.9598821401596069, 772, 941],
 [0.9555590152740479, 47, 816],
 [0.955280065536499, 578, 731],
 [0.9522665739059448, 668, 741],
 [0.9497537612915039, 579, 761],
 [0.94953453540802, 535, 681],
 [0.9477376937866211, 57, 64],
 [0.941092848777771, 389, 956],
 [0.940240204334259, 56, 241],
 [0.936234712600708, 54, 358],
 [0.9303354024887085, 224, 735],
 [0.9275925159454346, 48, 833],
 [0.9250786304473877, 525, 939],
 [0.9232856631278992, 459, 939],
 [0.9226405620574951, 64, 143],
 [0.9210860729217529, 325, 557],
 [0.9203133583068848, 64, 224],
 [0.9199763536453247, 143, 224],
 [0.9197996258735657, 64, 735],
 [0.9195613265037537, 143, 735],
 [0.918609619140625, 161, 722],
 [0

In [10]:
for score, image_id1, image_id2 in near_duplicates[0:NUM_SIMILAR_IMAGES]:
    print("\nScore: {:.3f}%".format(score * 100))
    print(image_names[image_id1])
    print(image_names[image_id2])


Score: 98.916%
C:/Users/jales/Downloads/pics\462.jpg
C:/Users/jales/Downloads/pics\462_1.jpg

Score: 98.646%
C:/Users/jales/Downloads/pics\4.jpg
C:/Users/jales/Downloads/pics\worksovpost_00036.jpg

Score: 98.490%
C:/Users/jales/Downloads/pics\462_1.jpg
C:/Users/jales/Downloads/pics\sovpolpost_00062.jpg

Score: 97.927%
C:/Users/jales/Downloads/pics\%2817%29.jpg
C:/Users/jales/Downloads/pics\tradesovpost_00017.jpg

Score: 97.838%
C:/Users/jales/Downloads/pics\65_12x18.jpg
C:/Users/jales/Downloads/pics\sovpolpost_00011.jpg

Score: 96.707%
C:/Users/jales/Downloads/pics\ksssrpost_0001.jpg
C:/Users/jales/Downloads/pics\popup_St_027.jpg

Score: 96.699%
C:/Users/jales/Downloads/pics\lenin.jpg
C:/Users/jales/Downloads/pics\sovpolpost_00031.jpg

Score: 96.682%
C:/Users/jales/Downloads/pics\437.jpg
C:/Users/jales/Downloads/pics\sovpolpost_00034.jpg

Score: 96.611%
C:/Users/jales/Downloads/pics\69_24_36.jpg
C:/Users/jales/Downloads/pics\sovmilpost18_50_0008.jpg

Score: 96.059%
C:/Users/jales/Down