<a href="https://colab.research.google.com/github/GrindelfP/3-crypting-algorithms/blob/main/jupyter/cosine_distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cosine distance for identifying similar (not identical) images

* [Habr link](https://habr.com/ru/articles/664298/) to the original code and article.

In [None]:
# -*- coding: utf-8 -*-
import sys
import os
import time
import numpy as np
import keras
from keras.preprocessing import image as image_utils
import json
from typing import Final

# Directory that is scanned for the input images.
PHOTOS_DIR: Final[str] = "/content/drive/MyDrive/photos-for-distances"
# JSON file that receives the pairwise similarity records.
OUTPUT_FILE: Final[str] = "/content/drive/MyDrive/outputs/cosine_distance/image_similarities.json"
# JSON file that receives the paths of images that could not be processed.
ERROR_FILE: Final[str] = "/content/drive/MyDrive/outputs/cosine_distance/error_processing.json"

# Class that collects the paths of image files in a directory tree
class DATA():
    """Accumulates paths of image files discovered under a directory tree."""

    def __init__(self):
        # Paths found so far; every parseIMG call appends to this list.
        self.files = []

    def parseIMG(self, dir_name):
        """Append all .png/.jpg/.jpeg files below ``dir_name`` to ``self.files``.

        The directory is walked recursively and the extension check is
        case-insensitive. Nothing is returned; results are read from
        ``self.files``.
        """
        image_suffixes = ('.png', '.jpg', '.jpeg')
        for root, _subdirs, filenames in os.walk(f"{dir_name}/"):
            self.files.extend(
                os.path.join(root, filename)
                for filename in filenames
                if filename.lower().endswith(image_suffixes)
            )

# Function to convert an image to a deep feature vector
def deep_vector(img_path, model, preprocess):
    """Run one image through ``model`` and return its prediction.

    The image at ``img_path`` is loaded at 224x224, converted to an array,
    given a leading batch axis, passed through ``preprocess`` and finally
    through ``model.predict``.

    Returns whatever ``model.predict`` returns, or ``None`` if any step
    raises; in that case the error is printed rather than propagated so a
    single bad file does not abort a whole run.
    """
    try:
        pixels = image_utils.img_to_array(
            image_utils.load_img(img_path, target_size=(224, 224))
        )
        # The model expects a batch, so wrap the single image in one.
        batch = preprocess(np.expand_dims(pixels, axis=0))
        return model.predict(batch)
    except Exception as e:
        print(f"Error processing image {img_path}: {e}")
        return None

# Function to calculate cosine similarity between two vectors
def similarity(vector1, vector2):
    """Return the cosine similarity between the rows of two 2-D arrays.

    Args:
        vector1: array of shape (m, d), or ``None``.
        vector2: array of shape (n, d), or ``None``.

    Returns:
        ``None`` if either argument is ``None``; otherwise an (m, n) array
        whose entry [i, j] is the cosine similarity of ``vector1[i]`` and
        ``vector2[j]``. Pairs involving a zero-norm row are reported as 0.0
        instead of dividing by zero.
    """
    if vector1 is None or vector2 is None:
        return None
    norm_v1 = np.linalg.norm(vector1, axis=1, keepdims=True)    # (m, 1)
    norm_v2 = np.linalg.norm(vector2.T, axis=0, keepdims=True)  # (1, n)
    denominator = norm_v1 * norm_v2                             # (m, n)
    dot_products = np.dot(vector1, vector2.T)
    # Divide everywhere, then overwrite only the undefined entries with 0.0,
    # so one zero row no longer collapses the whole result to a 1x1 array.
    with np.errstate(divide='ignore', invalid='ignore'):
        cosine = dot_products / denominator
    return np.where(denominator == 0, 0.0, cosine)

if __name__ == '__main__':
    # Collect the paths of all images found under PHOTOS_DIR.
    data_parser = DATA()
    data_parser.parseIMG(PHOTOS_DIR)

    # Load the pre-trained VGG16 model without its classification head;
    # pooling='max' asks Keras for one pooled feature vector per image.
    model = keras.applications.vgg16.VGG16(include_top=False,
                                           weights='imagenet',
                                           input_tensor=None,
                                           input_shape=None,
                                           pooling='max')
    # Get the preprocessing function for VGG16
    preprocess = keras.applications.vgg16.preprocess_input

    image_vectors = {}
    error_list = []

    # Generate deep feature vectors for all images
    print("Generating image feature vectors...")
    for img_path in data_parser.files:
        vector = deep_vector(img_path, model, preprocess)
        if vector is not None:
            # Key by the path relative to PHOTOS_DIR rather than the bare
            # file name: the directory is walked recursively, so two files
            # with the same name in different subfolders would otherwise
            # overwrite each other. For a flat folder the key is unchanged.
            image_vectors[os.path.relpath(img_path, PHOTOS_DIR)] = vector
        else:
            error_list.append(img_path)
    print("Feature vector generation complete.")

    # Calculate and output cosine similarities for every ordered pair
    # (including each image with itself).
    print("\nCalculating and outputting cosine similarities:")
    similarities_data = []

    for name1, vector1 in image_vectors.items():
        for name2, vector2 in image_vectors.items():
            similarity_score = similarity(vector1, vector2)
            if similarity_score is not None:
                # similarity() returns a 2-D array; each stored vector is a
                # single-row batch, so the score sits at [0][0].
                similarity_value = float(similarity_score[0][0])
                print(f"{name1}, {name2}, {similarity_value:.6f}")
                similarities_data.append({"image1": name1, "image2": name2, "similarity": similarity_value})
        print()  # Add an empty line after each group of comparisons

    # Save the similarity data to a JSON file. Create the target folder
    # first so a missing directory does not discard the computed results.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(similarities_data, f, indent=4)
    print("\nSimilarities saved to image_similarities.json")

    # Save the list of errors to a JSON file (only when there were any).
    if error_list:
        os.makedirs(os.path.dirname(ERROR_FILE), exist_ok=True)
        with open(ERROR_FILE, 'w', encoding='utf-8') as f:
            json.dump(error_list, f, indent=4)
        print("Errors during processing saved to error_processing.json")

Generating image feature vectors...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 688ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 514ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 704ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 935ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 925ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 740ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 514ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489ms/step
Feature vector generation complete.

Calculating and outputting cosine similarities:
another.jpeg, another.jpeg, 1.000000
another.jpeg, another-page.jpeg, 0.681915
another.jpeg, shadow-rotated.jpeg, 0.652235
another.jpeg, shadow-pen.jpeg, 0.615061
another.jpeg, shadow-turn.jpeg, 0.619899
another.jpeg, shadow.jpeg, 0.631663
another.jpeg, side1.jpeg, 0.582283
another.jpeg, side2.