In [None]:
!pip install torch torchvision roboflow scikit-learn

In [None]:
import os

from roboflow import Roboflow

# Take the Roboflow API key from the environment so the secret does not have to
# be pasted into (and saved with) the notebook. The literal placeholder is kept
# as the fallback, so the cell behaves as before when ROBOFLOW_API_KEY is unset.
rf = Roboflow(api_key=os.environ.get("ROBOFLOW_API_KEY", "API_KEY_HERE"))
project = rf.workspace("matyworkspace").project("traffic-signs-cl-damaged-healthy")
version = project.version(2)
# Download version 2 of the project, requested in the "folder" format.
dataset = version.download("folder")

In [None]:
import os
import shutil

import numpy as np
import torch
import torchvision.transforms as transforms
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.models import resnet50, ResNet50_Weights
from tqdm import tqdm

In [None]:
# Root folder of the image dataset on disk.
# NOTE(review): presumably the folder created by the Roboflow download of
# version 2 of the project - confirm against the download location.
parent_dir = './traffic-signs-cl-damaged-healthy-2/'

# One directory per dataset split, in the order train, valid, test.
data_dirs = list(map(
    lambda split_name: os.path.join(parent_dir, split_name),
    ('train', 'valid', 'test'),
))

In [None]:
# Image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)), # ResNet50 input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # https://docs.pytorch.org/vision/0.12/models.html#classification
])

# Accumulators filled by the embedding-extraction loop:
# image paths and, at the same index, the embedding of that image.
all_image_paths = []
all_image_embeddings = []

# Use the GPU when one is present, otherwise fall back to the CPU
# (an unconditional `.cuda()` raises on machines without CUDA).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use pretrained ResNet50 model.
# `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the weight
# set that flag selected, so the embeddings stay the same.
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
model.fc = torch.nn.Identity()  # Remove final classification layer
model.eval()  # inference mode: the model is only used for feature extraction
model.to(device)

def get_embedding(image_tensor):
    """Return the embedding of a single image as a flat NumPy array.

    Args:
        image_tensor: Tensor holding one image without a batch dimension;
            a batch dimension of size 1 is added before the forward pass.

    Returns:
        1-D ``numpy.ndarray`` containing the flattened output of the global
        ``model`` for this image.
    """
    # Follow whatever device the model currently lives on instead of assuming
    # CUDA, so the function also works on CPU-only machines. Modules without
    # parameters are treated as living on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        batch = image_tensor.unsqueeze(0).to(device)
        embedding = model(batch)
        return embedding.cpu().numpy().flatten()

In [None]:
# Embed every image of every split and remember which file each embedding
# belongs to (all_image_embeddings[k] corresponds to all_image_paths[k]).
for data_dir in data_dirs:
    folder_path = os.path.abspath(data_dir)
    dataset = ImageFolder(root=folder_path, transform=transform)
    loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    # The loader is unshuffled with batch size 1, so its batches line up
    # one-to-one with the entries of dataset.samples.
    for (batch, _label), (img_path, _class_idx) in zip(tqdm(loader), dataset.samples):
        all_image_embeddings.append(get_embedding(batch[0]))
        all_image_paths.append(img_path)

In [None]:
# Pairwise cosine similarity between all collected embeddings:
# turn the list of per-image vectors into one array, then compare every
# row against every other row.
embeddings_np = np.asarray(all_image_embeddings)
similarity_matrix = cosine_similarity(X=embeddings_np)

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# Similarity of every unordered image pair (strict upper triangle, i < j),
# extracted in one vectorised step instead of a nested Python loop.
pair_mask = np.triu(np.ones(similarity_matrix.shape, dtype=bool), k=1)
pair_similarities = similarity_matrix[pair_mask]

# Float rounding can push the similarity of identical embeddings marginally
# above 1; clamp so those pairs are not silently excluded.
pair_similarities = np.minimum(pair_similarities, 1.0)
similarities = pair_similarities[pair_similarities >= 0.92].tolist()

# 40 bins of width 0.002 covering [0.92, 1.0]. np.arange(0.92, 1.0, 0.002)
# stops at 0.998, which left the most similar pairs out of the histogram.
bins = np.linspace(0.92, 1.0, 41)
plt.hist(similarities, bins=bins, edgecolor='black')
plt.title('Similarity Score Distribution (0.92 - 1.00)')
plt.xlabel('Cosine Similarity')
plt.ylabel('Number of Image Pairs')
plt.grid(True)
plt.show()

In [None]:
def show_image_pair(path1, path2):
    """Display two images in one figure, each titled with its file name.

    Args:
        path1: Path of the image drawn in the left-hand panel.
        path2: Path of the image drawn in the right-hand panel.
    """
    # Load both files (converted to RGB) before any figure is created.
    left_img, right_img = (Image.open(p).convert("RGB") for p in (path1, path2))

    # Three panels in a row; the middle one stays empty and acts as a spacer.
    fig, axs = plt.subplots(1, 3, figsize=(8, 4))
    for panel, picture, source in ((axs[0], left_img, path1), (axs[2], right_img, path2)):
        panel.imshow(picture)
        panel.set_title(os.path.basename(source))
    for panel in axs:
        panel.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Find duplicates or near-duplicates
threshold = 0.95
visited = set()
total = 0  # not used in this cell; kept in case another cell reads it

to_remove = set()

# All index pairs (i, j) with i < j whose similarity exceeds the threshold,
# found in one vectorised step and listed in the same row-major order as a
# nested i/j loop would visit them.
candidate_pairs = np.argwhere(np.triu(similarity_matrix > threshold, k=1))

for i, j in candidate_pairs:
    path_i, path_j = all_image_paths[i], all_image_paths[j]
    if path_i == path_j:
        # Same file listed twice: not a duplicate of anything, never delete it.
        continue
    pair = tuple(sorted([path_i, path_j]))
    if pair in visited:
        continue
    visited.add(pair)
    # If one image of the pair is already scheduled for removal, the pair is
    # resolved. Flagging its partner as well (as the previous logic did) could
    # end up deleting every copy of an image that occurs three or more times.
    if pair[0] in to_remove or pair[1] in to_remove:
        continue
    # Only the lexicographically larger path is ever removed, so the smallest
    # path of each group of mutually similar images always survives.
    to_remove.add(pair[1])

print(f"Found {len(visited)} duplicate pairs! To remove: {len(to_remove)}")

In [None]:
# Delete the files flagged as near-duplicates from disk.
# Sorted so the deletion order is deterministic (set iteration order is not).
to_remove_list = sorted(to_remove)
for file in to_remove_list:
    try:
        os.remove(file)
    except FileNotFoundError:
        # Already gone (e.g. this cell was executed before) - nothing left to do.
        continue

In [None]:
# Output directory
output_dir = '/kaggle/working/cleaned_dataset'
class_names = ['ok', 'not_ok']

# Create class folders
for cls in class_names:
    os.makedirs(os.path.join(output_dir, cls), exist_ok=True)

# Copy and rename files
counter = 0
skipped = 0
for path in sorted(all_image_paths):
    # all_image_paths still lists the files deleted as near-duplicates.
    # Skip them explicitly instead of letting every one of them surface as a
    # "Failed to copy" error, which would bury genuine copy failures.
    if not os.path.isfile(path):
        skipped += 1
        continue

    # Any path without an '/ok/' component is filed under 'not_ok'.
    class_folder = 'ok' if '/ok/' in path else 'not_ok'
    ext = os.path.splitext(path)[-1].lower()
    # Files get a zero-padded running number; the counter is shared by both classes.
    new_name = f"{counter:06d}{ext}"
    dest_path = os.path.join(output_dir, class_folder, new_name)

    try:
        shutil.copy2(path, dest_path)
    except OSError as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"Failed to copy {path}: {e}")
    else:
        counter += 1

print(f"Reorganized {counter} images into {output_dir}")
if skipped:
    print(f"Skipped {skipped} images that no longer exist on disk")

In [None]:
from torchvision.models import resnet50, ResNet50_Weights
# Image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Reset the accumulators for the second pass over the cleaned dataset.
all_image_paths = []
all_image_embeddings = []

# Use the GPU when one is present, otherwise fall back to the CPU
# (an unconditional `.cuda()` raises on machines without CUDA).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use pretrained ResNet50 model.
# `weights` must be a member of the enum, not the enum class itself (passing
# `ResNet50_Weights` is rejected by torchvision). IMAGENET1K_V1 is the weight
# set `pretrained=True` selects, keeping both passes on the same embeddings.
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
model.fc = torch.nn.Identity()  # Remove final classification layer
model.eval()  # inference mode: the model is only used for feature extraction
model.to(device)

def get_embedding(image_tensor):
    """Return the embedding of a single image as a flat NumPy array.

    Args:
        image_tensor: Tensor holding one image without a batch dimension;
            a batch dimension of size 1 is added before the forward pass.

    Returns:
        1-D ``numpy.ndarray`` containing the flattened output of the global
        ``model`` for this image.
    """
    # Follow whatever device the model currently lives on instead of assuming
    # CUDA, so the function also works on CPU-only machines. Modules without
    # parameters are treated as living on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        batch = image_tensor.unsqueeze(0).to(device)
        embedding = model(batch)
        return embedding.cpu().numpy().flatten()

In [None]:
# Embed every image of the cleaned dataset and remember which file each
# embedding belongs to (all_image_embeddings[k] <-> all_image_paths[k]).
folder_path = os.path.abspath('/kaggle/working/cleaned_dataset')
dataset = ImageFolder(root=folder_path, transform=transform)
loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

# The loader is unshuffled with batch size 1, so its batches line up
# one-to-one with the entries of dataset.samples.
for (batch, _label), (img_path, _class_idx) in zip(tqdm(loader), dataset.samples):
    all_image_embeddings.append(get_embedding(batch[0]))
    all_image_paths.append(img_path)

In [None]:
# Pairwise cosine similarity between all collected embeddings:
# turn the list of per-image vectors into one array, then compare every
# row against every other row.
print("Computing cosine similarity matrix...")
embeddings_np = np.asarray(all_image_embeddings)
similarity_matrix = cosine_similarity(X=embeddings_np)

In [None]:
# Requires similarity_matrix from the cosine-similarity cell.
# Similarity of every unordered image pair (strict upper triangle, i < j),
# extracted in one vectorised step instead of a nested Python loop.
pair_mask = np.triu(np.ones(similarity_matrix.shape, dtype=bool), k=1)
pair_similarities = similarity_matrix[pair_mask]

# Float rounding can push the similarity of identical embeddings marginally
# above 1; clamp so those pairs are not silently excluded.
pair_similarities = np.minimum(pair_similarities, 1.0)
similarities = pair_similarities[pair_similarities >= 0.92].tolist()

# Create histogram: 40 bins of width 0.002 covering [0.92, 1.0].
# np.arange(0.92, 1.0, 0.002) stops at 0.998, which left the most similar
# pairs out of the plot.
bins = np.linspace(0.92, 1.0, 41)
plt.hist(similarities, bins=bins, edgecolor='black')
plt.title('Similarity Score Distribution (0.92 - 1.00)')
plt.xlabel('Cosine Similarity')
plt.ylabel('Number of Image Pairs')
plt.grid(True)
plt.show()

In [None]:
# Find duplicates or near-duplicates
threshold = 0.948  # adjust as needed for sensitivity
band_width = 0.004  # only pairs in (threshold, threshold + band_width] are considered
visited = set()
total = 0  # not used in this cell; kept in case another cell reads it

to_remove = set()

print("Finding duplicates...")
# Index pairs (i, j) with i < j whose similarity falls inside the band, found
# in one vectorised step and listed in the same row-major order as a nested
# i/j loop would visit them.
in_band = (similarity_matrix > threshold) & (similarity_matrix <= threshold + band_width)
candidate_pairs = np.argwhere(np.triu(in_band, k=1))

for i, j in candidate_pairs:
    path_i, path_j = all_image_paths[i], all_image_paths[j]
    if path_i == path_j:
        # Same file listed twice: not a duplicate of anything, never flag it.
        continue
    pair = tuple(sorted([path_i, path_j]))
    if pair in visited:
        continue
    visited.add(pair)
    # If one image of the pair is already flagged, the pair is resolved.
    # Flagging its partner as well (as the previous logic did) could end up
    # flagging every copy of an image that occurs three or more times.
    if pair[0] in to_remove or pair[1] in to_remove:
        continue
    # Only the lexicographically larger path is ever flagged, so the smallest
    # path of each group of mutually similar images always survives.
    to_remove.add(pair[1])

print(f"Found {len(visited)} duplicate pairs! To remove: {len(to_remove)}")