# Removing duplicates with FiftyOne

To install FiftyOne, use pip:

Run `pip install fiftyone`, or run `pip install fiftyone[desktop]` to install the desktop app as well. Use the first option.

Use a separate conda environment for FiftyOne.

In [1]:
import fiftyone as fo
import fiftyone.brain as fob
from fiftyone import ViewField as F

import os
import shutil


In [2]:
# Name given to the FiftyOne dataset.
# NOTE: the next code block may raise an error about the dataset name; if it
# does, change the name to any other value and run again.
name = "datasetname"
# Path to the folder containing the images of the dataset
dataset_dir = "/home/intern/Desktop/fiftyone_test/test_folder"

# Path to the new folder that receives exported images (intended to hold the
# images without duplicates)
new_images_dir = "/home/intern/Desktop/fiftyone_test/de_duplicate_dataset"
# Path to the folder where all label files, including those of duplicate
# images, are stored
labels_dir = "/home/intern/Desktop/fiftyone_test/labels"

<b> Dataset path: </b>

This is for the image dataset:
Foldername -> image1.ext, image2.ext

This is for the annotation dataset, which contains the bounding box information:
Foldername -> image1.txt, image2.txt

<b> Note: The image and annotation dataset should be separate folders. </b>

In [3]:
# Create the dataset.
# A dataset with this name may still be registered from an earlier run of this
# cell, in which case creating it again fails. Drop the stale one first so the
# cell can be re-run without renaming.
# WARNING: this deletes any existing FiftyOne dataset called `name` from the
# FiftyOne database (the image files on disk are not touched).
if fo.dataset_exists(name):
    fo.delete_dataset(name)

dataset = fo.Dataset.from_dir(
    dataset_dir=dataset_dir,
    dataset_type=fo.types.ImageDirectory,  # Or ImageClassificationDirectoryTree for classification dataset
    name=name,
)

# Print the dataset summary (name, media type, sample count, fields)
print(dataset)

 100% |███████████████████| 19/19 [23.4ms elapsed, 0s remaining, 813.4 samples/s]     
Name:        datasetname
Media type:  image
Num samples: 19
Persistent:  False
Tags:        []
Sample fields:
    id:       fiftyone.core.fields.ObjectIdField
    filepath: fiftyone.core.fields.StringField
    tags:     fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)


In [4]:
# Print the first 18 samples of the dataset
print(dataset.head(18))

[<Sample: {
    'id': '645ccbf79ebacd7d1cc38681',
    'media_type': 'image',
    'filepath': '/home/intern/Desktop/fiftyone_test/test_folder/0ad5583f-Screenshot_from_2022-09-28_14-18-26_cleanup1.jpeg',
    'tags': [],
    'metadata': None,
}>, <Sample: {
    'id': '645ccbf79ebacd7d1cc38682',
    'media_type': 'image',
    'filepath': '/home/intern/Desktop/fiftyone_test/test_folder/25628053-Screenshot_from_2022-09-28_14-42-47_cleanup.jpeg',
    'tags': [],
    'metadata': None,
}>, <Sample: {
    'id': '645ccbf79ebacd7d1cc38683',
    'media_type': 'image',
    'filepath': '/home/intern/Desktop/fiftyone_test/test_folder/5815f7ee-Screenshot_from_2022-09-28_12-00-33_cleanup.png',
    'tags': [],
    'metadata': None,
}>, <Sample: {
    'id': '645ccbf79ebacd7d1cc38684',
    'media_type': 'image',
    'filepath': '/home/intern/Desktop/fiftyone_test/test_folder/59b1978f-Screenshot_from_2022-09-28_14-43-58_cleanup.png',
    'tags': [],
    'metadata': None,
}>, <Sample: {
    'id': '645ccbf79e

In [5]:
# Compute the uniqueness of the dataset's image samples; the summary printed
# afterwards lists the resulting `uniqueness` float field
fob.compute_uniqueness(dataset)

print(dataset)

Computing embeddings...
 100% |███████████████████| 19/19 [449.1ms elapsed, 0s remaining, 42.3 samples/s]     
Computing uniqueness...
Uniqueness computation complete
Name:        datasetname
Media type:  image
Num samples: 19
Persistent:  False
Tags:        []
Sample fields:
    id:         fiftyone.core.fields.ObjectIdField
    filepath:   fiftyone.core.fields.StringField
    tags:       fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:   fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    uniqueness: fiftyone.core.fields.FloatField


In [6]:
# View the first sample of the dataset, including its `uniqueness` value
print(dataset.first())

<Sample: {
    'id': '645ccbf79ebacd7d1cc38681',
    'media_type': 'image',
    'filepath': '/home/intern/Desktop/fiftyone_test/test_folder/0ad5583f-Screenshot_from_2022-09-28_14-18-26_cleanup1.jpeg',
    'tags': [],
    'metadata': None,
    'uniqueness': 0.9206241333312318,
}>


In [7]:
# View the dataset in the FiftyOne App; the returned session handle is kept
# in `session` so later cells can interact with the App
session = fo.launch_app(dataset)

Connected to FiftyOne on port 5151 at localhost.
If you are not connecting to a remote session, you may need to start a new session and specify a port


In [22]:
# Get a view of all samples whose uniqueness is exactly 0; these are treated
# as duplicates. (The variable name is kept as-is because later cells use it.)
small_images_view = dataset.match(F("uniqueness") == 0)

# Mark them as duplicates. tag_samples() adds the tag only where it is
# missing, so re-running this cell does not stack repeated "duplicate" tags on
# the same sample, and the whole view is updated in one batch operation.
small_images_view.tag_samples("duplicate")

In [23]:
# Print a summary of the view holding the samples with uniqueness == 0
print(small_images_view)

Dataset:     datasetname
Media type:  image
Num samples: 5
Sample fields:
    id:         fiftyone.core.fields.ObjectIdField
    filepath:   fiftyone.core.fields.StringField
    tags:       fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:   fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    uniqueness: fiftyone.core.fields.FloatField
View stages:
    1. Match(filter={'$expr': {'$eq': [...]}})


In [27]:
# Export the duplicate images to the new folder. The files also remain in the
# original dataset folder after this step; a later cell deletes them there.
small_images_view.export(new_images_dir, fo.types.ImageDirectory)

Directory '/home/intern/Desktop/fiftyone_test/de_duplicate_dataset' already exists; export will be merged with existing files
 100% |█████████████████████| 5/5 [32.9ms elapsed, 0s remaining, 152.2 samples/s] 


Duplicates must be selected based on their uniqueness: samples whose uniqueness is 0 are duplicates, and they need to be selected in the FiftyOne application.

In [None]:
# # Get currently selected images from App
# dup_ids = session.selected
# print(dup_ids)

# # Get view containing selected samples
# dups_view = dataset.select(dup_ids)

# # Mark as duplicates
# for sample in dups_view:
#     sample.tags.append("duplicate")
#     sample.save()

['645b83faa730768b9742a83c', '645b83faa730768b9742a83d', '645b83faa730768b9742a840', '645b83faa730768b9742a83f', '645b83faa730768b9742a83e']


You can visualize the duplicates separately in the fiftyone application.

In [None]:
# # Select samples with `duplicate` tag
# dups_tag_view = dataset.match_tags("duplicate")

# # Open view in App
# session.view = dups_tag_view

Use the code block below to export to a new folder with the duplicates removed.

In [None]:
# # Get samples that do not have the `duplicate` tag
# no_dups_view = dataset.match(~F("tags").contains("duplicate"))

# # Export dataset to disk as a directory of images
# no_dups_view.export(
#     new_images_dir,
#     fo.types.ImageDirectory
# )

Directory '/home/intern/Desktop/fiftyone_test/de_duplicate_dataset' already exists; export will be merged with existing files
 100% |███████████████████| 14/14 [15.2ms elapsed, 0s remaining, 923.1 samples/s] 


Use the code block below to check which files in the new folder also exist in the original dataset folder, and then delete those files from the original folder.

In [28]:
# Delete from the original dataset folder every file that also exists, by file
# name, in the new folder.
# The original folder is listed once up front and kept as a set, instead of
# being listed again for every file of the new folder.
original_filenames = set(os.listdir(dataset_dir))

for filename in os.listdir(new_images_dir):
    if filename in original_filenames:
        os.remove(os.path.join(dataset_dir, filename))
        print("Removed: ", filename)

Removed:  uuid (4th copy).png
Removed:  uuid (another copy).png
Removed:  uuid (copy).png
Removed:  uuid.png
Removed:  uuid (3rd copy).png


Use the code below to check that the labels and images match. A label file is removed if no image in the new images folder has the same file name (ignoring the extension). This helps filter out the annotations of duplicate images from the label dataset.

In [None]:
# Collect the base names (file name without extension) of all images in the
# new folder. A set keeps the membership test in the loop below cheap no
# matter how many images there are.
image_filenames = {os.path.splitext(filename)[0] for filename in os.listdir(new_images_dir)}

# NOTE(review): if new_images_dir is empty, every file in labels_dir is
# deleted by the loop below.
# Iterate over label filenames
for label_filename in os.listdir(labels_dir):
    # Base name of the label file, i.e. the image name it would belong to
    image_filename = os.path.splitext(label_filename)[0]
    # A label without a same-named image in the new folder is orphaned
    if image_filename not in image_filenames:
        # Delete the label file
        label_path = os.path.join(labels_dir, label_filename)
        os.remove(label_path)
