## Data Exploration & Wrangling


In [None]:
# Install FiftyOne (dataset tooling) and imagehash (image hashing) into the kernel's environment.
%pip install fiftyone imagehash
# Swap the `fiftyone-db` package for the `fiftyone-db-ubuntu2204` build.
# NOTE(review): presumably required because the default database package does not run on this host — confirm.
%pip uninstall fiftyone-db -y
%pip install fiftyone-db-ubuntu2204 --force-reinstall

In [1]:
import fiftyone as fo
from fiftyone import ViewField as F
from PIL import Image
import imagehash
from tqdm import tqdm

### Load Dataset


In [19]:
# Dataset name used as the FiftyOne cache key, and the on-disk YOLOv5 directory to import from.
name = "til23plush"
dataset_dir = "/home/jh/code/til/data/til23plush"
splits = "train", "val", "test"

# Uncomment to recache dataset
# fo.delete_dataset(name)
if name in fo.list_datasets():
    # A persistent dataset with this name already exists: reuse it instead of re-importing.
    ds = fo.load_dataset(name)
else:
    ds = fo.Dataset(name=name, persistent=True)
    for split in splits:
        # Each split is imported separately and tagged with its split name so that
        # later cells can select samples by tag.
        ds.add_dir(
            dataset_dir=dataset_dir,
            dataset_type=fo.types.YOLOv5Dataset,
            include_all_data=True,
            split=split,
            tags=split,
        )

    # Add image hashes for dupe detection later.
    # Earlier hash choices produced multiple false positives, so the hash used here
    # behaves closer to an exact-match (cryptographic-style) digest.
    # NOTE: the field is named "phash" but stores imagehash's difference hash (dhash).
    for sample in tqdm(ds):
        # Open the image in a context manager so the file handle is released
        # deterministically instead of waiting for garbage collection.
        with Image.open(sample.filepath) as img:
            sample["phash"] = str(imagehash.dhash(img))
        sample.save()


print(ds)

 100% |███████████████| 5664/5664 [5.8s elapsed, 0s remaining, 968.7 samples/s]      
 100% |█████████████████| 800/800 [850.2ms elapsed, 0s remaining, 943.5 samples/s]      
 100% |███████████████| 1600/1600 [168.2ms elapsed, 0s remaining, 9.5K samples/s]     


100%|██████████| 8064/8064 [02:33<00:00, 52.47it/s]

Name:        til23plush
Media type:  image
Num samples: 8064
Persistent:  True
Tags:        []
Sample fields:
    id:           fiftyone.core.fields.ObjectIdField
    filepath:     fiftyone.core.fields.StringField
    tags:         fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:     fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    ground_truth: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
    phash:        fiftyone.core.fields.StringField





### Relabel


In [20]:
# Collapse every detection's class label to the single class "plushie".
# The result is bound to `view`; `ds` itself is not reassigned.
relabel_detection = F().set_field("label", "plushie")
view = ds.set_field("ground_truth.detections", F("detections").map(relabel_detection))

### Export


In [4]:
# See: https://docs.voxel51.com/api/fiftyone.utils.yolo.html#fiftyone.utils.yolo.YOLOv5DatasetExporter
# Only the train and val splits are exported.
splits = "train", "val"
config = dict(
    export_dir="/home/jh/code/til/data/til23plushonly",
    dataset_type=fo.types.YOLOv5Dataset,
    label_field="ground_truth",
    export_media="symlink",
    include_path=False,
    # All splits must be written with the same class list; compute it once from the
    # relabeled view rather than letting each split derive its own.
    classes=view.distinct("ground_truth.detections.label"),
)

for split in splits:
    # Samples were tagged with their split name at import time.
    split_view = view.match_tags(split)
    split_view.export(split=split, **config)

 100% |███████████████| 5664/5664 [3.8s elapsed, 0s remaining, 1.6K samples/s]        
 100% |█████████████████| 800/800 [546.6ms elapsed, 0s remaining, 1.5K samples/s]      


### Dupe Detection

In [25]:
# Collect every `phash` value that is shared by more than one sample.
counts = [digest for digest, n in view.count_values("phash").items() if n > 1]

# Keep samples whose hash is one of the repeated values, then restrict the
# result using the "train" / "val" tags.
shares_hash = F().is_in(counts)
in_train_or_val = F().contains(["train", "val"])
dupes = view.filter_field("phash", shares_hash).filter_field("tags", in_train_or_val)

### Preview


In [None]:
# Open the FiftyOne App on the `dupes` view to inspect the flagged samples by eye.
fo.launch_app(dataset=dupes)