In [44]:
import os
import re
import shutil
import logging
import numpy as np
from pathlib import Path
from PIL import Image
from torch import no_grad
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.manifold import TSNE
from typing import Any, Literal, Pattern
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoImageProcessor, AutoModel

In [45]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class DirectoryUtil:
    """Utility class for handling directories."""

    @staticmethod
    def directory_exists(path: str) -> bool:
        """
        Checks if the specified directory exists.
        
        Args:
            path (str): The directory path.
        
        Returns:
            bool: True if the directory exists, False otherwise.
        """
        return os.path.isdir(s=path)

    @staticmethod
    def create_directory(path: str) -> None:
        """
        Creates a directory at the specified path.
        If the directory already exists, no exception is raised.
        
        Args:
            path (str): The directory path.
        """
        try:
            os.makedirs(name=path, exist_ok=True)
            logging.info(msg=f"Directory created: {path}")
        except Exception as e:
            logging.info(msg=f"Error creating directory {path}: {e}")

    @staticmethod
    def ensure_directory(path: str) -> str:
        """
        Ensures that the directory exists. If it does not, the directory is created.
        
        Args:
            path (str): The directory path.

        Return:
            str: Directory path.
        """
        if not DirectoryUtil.directory_exists(path):
            DirectoryUtil.create_directory(path)
            return path
        else:
            logging.info(msg=f"Directory already exists: {path}")
            return path

    @staticmethod
    def find_downloaded_dataset(datasets_path: str, project_id: str, version_number: int) -> str:
        # logging.info(f"Directories in '{datasets_path}': {os.listdir(datasets_path)}")

        expected_name = f"{project_id}-{version_number}"
        pattern: Pattern[str] = re.compile(re.escape(pattern=expected_name), re.IGNORECASE)

        for folder in os.listdir(path=datasets_path):
            folder_path: str = os.path.join(datasets_path, folder)

            # logging.info(f"Comparing: '{folder}' vs '{expected_name}'")

            if os.path.isdir(s=folder_path) and pattern.search(string=folder):
                return folder_path 

        logging.error(f"Dataset not found for: '{project_id}-{version_number}' in '{datasets_path}'.")
        raise FileNotFoundError(f"Dataset not found for: '{project_id}-{version_number}' in '{datasets_path}'.")

In [46]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Dino:
    def __init__(self, model_name: str)-> None:
        self.model_name = model_name
        self.processor = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path=self.model_name)
        self.model = AutoModel.from_pretrained(pretrained_model_name_or_path=self.model_name)


    def extract_features(self, image_path: str) -> list[Any]:
        image = Image.open(fp=image_path).convert(mode="RGB")
        inputs = self.processor(images=image, return_tensors="pt")
        
        with no_grad():
            outputs = self.model(**inputs)
            features = outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()
        
        return features

    def get_features(self, images_path: Path) -> list[Any]:
        features_list: list[Any] = []

        all_images_paths: list[Path] = [file for file in images_path.iterdir()]

        for image_path in all_images_paths:
            try:
                features = self.extract_features(image_path=str(image_path))
                features_list.append(features)
            except FileNotFoundError as e:
                logging.error(msg=e)

        flattened_features:list[Any] = [feature.flatten() for feature in features_list]
        return flattened_features

    def get_similarity(self, features)-> Any:
        return cosine_similarity(X=features)    

    def filter_similar_images(self, features, similarity_matrix, threshold=0.995):
        n :int = len(features)
        selected_indices: list[Any] = []
        
        for i in range(n):
            keep = True
            for j in selected_indices:
                print(similarity_matrix[i, j])
                if similarity_matrix[i, j] > threshold:
                    keep = False
                    break
            if keep:
                selected_indices.append(i)
        
        return selected_indices

    def save_filtered_images(self, where_to_save: Path, filtered_images_path: list[Path]) -> None:
        for image_path in filtered_images_path:
            shutil.copy(src=image_path, dst=where_to_save)
            logging.info(msg=f"Image: '{image_path.name}' has been copied to '{where_to_save}'.")

    def filter_images(self, images_path: Path) -> None:
        features = self.get_features(images_path=images_path)
        similarity_matrix = self.get_similarity(features=features)
        selected_indices = self.filter_similar_images(features=features, similarity_matrix=similarity_matrix)
        
        filtered_images_path: list[Path] = []

        all_images_paths: list[Path] = [file for file in images_path.iterdir()]
        for i, image_path in enumerate(all_images_paths):
            if i in selected_indices:
                filtered_images_path.append(image_path)

        save_images_in: Path = images_path.parent / "detects_filetred_prevalid_scenes"
        DirectoryUtil.ensure_directory(path=str(save_images_in))

        self.save_filtered_images(where_to_save=save_images_in, filtered_images_path=filtered_images_path)
        logging.info(msg="Images have been filtered successfully.")





In [47]:
IMAGES_PATH = '/teamspace/studios/this_studio/detects_prevalid_scenes'

In [48]:
images_path = DirectoryUtil.ensure_directory(IMAGES_PATH)

2025-03-21 02:38:35,537 - INFO - Directory already exists: /teamspace/studios/this_studio/detects_prevalid_scenes


In [49]:
dino: Dino = Dino(model_name="facebook/dinov2-base")

In [50]:
dino.filter_images(images_path=Path(images_path))

2025-03-21 02:38:46,672 - INFO - Directory created: /teamspace/studios/this_studio/detects_filetred_prevalid_scenes
2025-03-21 02:38:46,674 - INFO - Image: 'test_01-Scene-012-01.png' has been copied to '/teamspace/studios/this_studio/detects_filetred_prevalid_scenes'.
2025-03-21 02:38:46,675 - INFO - Image: 'test_01-Scene-020-03.png' has been copied to '/teamspace/studios/this_studio/detects_filetred_prevalid_scenes'.
2025-03-21 02:38:46,676 - INFO - Image: 'test_01-Scene-012-02.png' has been copied to '/teamspace/studios/this_studio/detects_filetred_prevalid_scenes'.
2025-03-21 02:38:46,678 - INFO - Image: 'test_01-Scene-015-01.png' has been copied to '/teamspace/studios/this_studio/detects_filetred_prevalid_scenes'.
2025-03-21 02:38:46,680 - INFO - Image: 'test_01-Scene-009-02.png' has been copied to '/teamspace/studios/this_studio/detects_filetred_prevalid_scenes'.
2025-03-21 02:38:46,681 - INFO - Image: 'test_01-Scene-014-02.png' has been copied to '/teamspace/studios/this_studio/d

0.92352074
0.96455115
0.935274
0.9202856
0.9437146
0.9210767
0.90986896
0.9316031
0.91581583
0.9890826
0.96931374
0.924571
0.9930269
0.91147614
0.91074395
0.8939716
0.91669744
0.89400816
0.95612127
0.9585017
0.88355374
0.97176933
0.9255921
0.98304224
0.91853094
0.9148174
0.9862696
0.8926777
0.9175744
0.94213235
0.922177
0.99236596
0.98919666
0.91404474
0.9614904
0.9220123
0.9688637
0.9214772
0.9916338
0.90944433
0.9092251
0.99928844
0.8891636
0.915964
0.900712
0.9784152
0.9800377
0.88883084
0.9665389
0.89952755
0.98177195
0.9070121
0.9179348
0.9046787
0.986802
0.9840976
0.89458674
0.96339315
0.9051039
0.9868123
0.98828053
0.9226383
0.99792075
0.9892491
0.9125401
0.9675555
0.9146158
0.90574586
0.9748352
0.88298535
0.97966886
0.9132065
0.88706315
0.90372646
0.962811
0.92986166
0.9969344
0.905846
0.93568647
0.91734695
0.9870451
0.9798436
0.9042631
0.96173835
0.91156894
0.98691386
0.98220295
0.98461133
0.90103394
0.9077194
0.92422044
0.92072856
0.98857677
0.9852573
0.9092225
0.9536253
0.91