<a href="https://colab.research.google.com/github/andandandand/jaguars/blob/main/notebooks/development/deduplicate_test_set_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Choosing a test set for the dataset and deduplicating it to avoid data leakage


In this notebook, we use an embedding-based approach to choose a test set for our dataset.

For each category (each individual jaguar), we do the following:

1. Get the pre-computed DINOv2 embeddings.
2. Use the embeddings to cluster visually similar images.
3. Apply a selection protocol that picks test data points based on a split ratio:  
a. choose the test images from the least populated clusters in the dataset  
b. eliminate the remaining images of those clusters so that near-duplicates of test images do not end up in the train set  
(train = all data - test - eliminated)


In [None]:
import pickle
import os
from pathlib import Path
!pip install fiftyone
!pip install kneed

import fiftyone as fo
from google.colab import drive
import pandas as pd

In [34]:
import numpy as np

# Classes for

## 1. Clustering


In [143]:
# Ways to group the embeddings by similarity:
# 1. cosine similarity between each pair of images
# 2. k-means clustering, choosing k with the knee algorithm (starting point based on cosine similarity)
# 3. k-medoids clustering, choosing k with the knee algorithm (search range also based on cosine similarity)


In [None]:
# clustering_module.py
from typing import List, Dict, Optional, Tuple
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap


try:
    import umap
except ImportError:
    umap = None


class ImageClusterer:
    """Group image embeddings into clusters of similar samples.

    Attributes:
        embeddings: Array of embeddings; row ``i`` belongs to ``ids[i]``.
        ids: Sample identifiers, in the same order as ``embeddings``.
    """

    def __init__(self, embeddings: np.ndarray, ids: List[str]):
        self.embeddings = embeddings
        self.ids = ids

    def cosine_similarity_clustering(self, threshold: float) -> Dict[int, List[str]]:
        """Greedily cluster samples by cosine similarity to a seed sample.

        Samples are visited in order. The first sample that is not yet part of
        a cluster becomes the seed of a new cluster, and every later unassigned
        sample whose similarity *to that seed* is ``>= threshold`` joins it.
        Members are only compared with the seed, not with each other.

        Args:
            threshold: Minimum cosine similarity to the seed for membership.

        Returns:
            Mapping ``cluster_id -> list of sample ids``; cluster ids are
            consecutive integers starting at 0, in order of creation.
            An empty mapping is returned when there are no samples.
        """
        # Nothing to cluster. This also keeps an empty array away from
        # scikit-learn, whose input validation rejects arrays with no samples.
        if len(self.ids) == 0:
            return {}

        sim_matrix = cosine_similarity(self.embeddings)
        n = sim_matrix.shape[0]
        visited = set()
        clusters = []

        for i in range(n):
            if i in visited:
                continue
            # `i` seeds a new cluster.
            cluster = [i]
            visited.add(i)
            for j in range(i + 1, n):
                if j not in visited and sim_matrix[i, j] >= threshold:
                    cluster.append(j)
                    visited.add(j)
            clusters.append(cluster)

        # Translate row indices back to sample ids.
        return {
            cluster_id: [self.ids[idx] for idx in members]
            for cluster_id, members in enumerate(clusters)
        }

    def kmeans_clustering(self, n_clusters: int) -> Dict[int, List[str]]:
        """Cluster the embeddings with KMeans (``random_state=42``).

        Args:
            n_clusters: Number of clusters to fit.

        Returns:
            Mapping ``cluster_id -> list of sample ids``.
        """
        model = KMeans(n_clusters=n_clusters, random_state=42)
        labels = model.fit_predict(self.embeddings)
        cluster_dict: Dict[int, List[str]] = {}
        for idx, label in enumerate(labels):
            # Cast to a built-in int so the keys match the declared
            # Dict[int, ...] contract instead of being NumPy integer scalars.
            cluster_dict.setdefault(int(label), []).append(self.ids[idx])
        return cluster_dict

    def kmedoids_clustering(self, *args, **kwargs):
        """Placeholder for k-medoids clustering.

        Raises:
            NotImplementedError: Always; the method is not implemented yet.
        """
        raise NotImplementedError("KMedoids clustering not yet implemented.")


class ClusterVisualizer:
    """Scatter-plot a 2-D projection of embeddings, coloured by cluster ID.

    Attributes:
        embeddings: Array of embeddings; row ``i`` belongs to ``ids[i]``.
        ids: Sample identifiers, in the same order as ``embeddings``.
        cluster_dict: Mapping ``cluster_id -> list of sample ids``.
        id_to_index: Reverse lookup from sample id to its position in ``ids``.
    """

    def __init__(self, embeddings: np.ndarray, ids: List[str], cluster_dict: Dict[int, List[str]]):
        self.embeddings = embeddings
        self.ids = ids
        self.cluster_dict = cluster_dict
        self.id_to_index = {sample_id: position for position, sample_id in enumerate(ids)}

    def _get_cluster_labels(self) -> List[int]:
        """Return one cluster ID per entry of ``ids``; -1 marks unclustered samples."""
        labels = [-1 for _ in range(len(self.ids))]
        for cluster_id, members in self.cluster_dict.items():
            for sample_id in members:
                labels[self.id_to_index[sample_id]] = cluster_id
        return labels

    def plot(self, method: str = "pca") -> None:
        """Project the embeddings to 2-D and show them coloured by cluster.

        Args:
            method: Projection to use: "pca", "tsne" or "umap" (case-insensitive).

        Raises:
            ImportError: If "umap" is requested but the package is missing.
            ValueError: If ``method`` is not one of the supported names.
        """
        labels = self._get_cluster_labels()

        method_key = method.lower()
        if method_key == "pca":
            reducer = PCA(n_components=2)
        elif method_key == "tsne":
            reducer = TSNE(n_components=2, random_state=42)
        elif method_key == "umap":
            if umap is None:
                raise ImportError("UMAP is not installed. Please install with `pip install umap-learn`.")
            reducer = umap.UMAP(n_components=2, random_state=42)
        else:
            raise ValueError(f"Unknown method '{method}'. Choose from ['pca', 'tsne', 'umap'].")

        points_2d = reducer.fit_transform(self.embeddings)

        plt.figure(figsize=(10, 6))

        # Chain three 20-colour qualitative palettes into one 60-colour map.
        colours = []
        for palette_name in ("tab20", "tab20b", "tab20c"):
            colours.extend(sns.color_palette(palette_name, 20))
        cluster_cmap = ListedColormap(colours)

        scatter = plt.scatter(
            points_2d[:, 0],
            points_2d[:, 1],
            c=labels,
            cmap=cluster_cmap,
            s=30,
        )
        plt.title(f"{method.upper()} of Image Embeddings")
        plt.colorbar(scatter, label="Cluster ID")
        plt.grid(True)
        plt.show()


## finding optimal K value

In [144]:
from sklearn.cluster import KMeans
from kneed import KneeLocator
import matplotlib.pyplot as plt
import numpy as np

def find_optimal_k_kmeans(embeddings: np.ndarray, k_range: range, plot: bool = True) -> int:
    """Pick a number of clusters for KMeans with the elbow (knee) method.

    A KMeans model (``random_state=42``) is fitted for every ``k`` in
    ``k_range`` and its inertia recorded; ``KneeLocator`` then looks for the
    knee of the convex, decreasing inertia curve.

    Args:
        embeddings: Data to cluster, one sample per row.
        k_range: Candidate values of ``k``.
        plot: When True, draw the inertia curve and mark the knee if found.

    Returns:
        The knee as a built-in ``int``, or ``k_range.start`` when no knee
        was detected.
    """
    k_values = list(k_range)
    inertias = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(embeddings)
        inertias.append(kmeans.inertia_)

    kneedle = KneeLocator(k_values, inertias, curve="convex", direction="decreasing")
    knee = kneedle.knee

    if plot:
        plt.figure(figsize=(8, 4))
        plt.plot(k_values, inertias, 'bo-')
        if knee is not None:
            plt.axvline(x=knee, color='r', linestyle='--', label=f"Knee at k={knee}")
            # The knee line is the only labelled artist, so a legend is only
            # meaningful (and warning-free) when that line was drawn.
            plt.legend()
        plt.xlabel("k (number of clusters)")
        plt.ylabel("Inertia (within-cluster sum of squares)")
        plt.title("Elbow Method for Optimal k (KMeans)")
        plt.grid(True)
        plt.show()

    # Cast so callers always get a plain int, matching the annotation.
    return int(knee) if knee is not None else k_range.start


## Assign to dataset

In [154]:
def assign_clusters_to_fiftyone(dataset: fo.Dataset, cluster_dict: Dict[int, List[str]], field_name: str = "cluster") -> None:
    """Store each sample's cluster ID in ``field_name`` of ``dataset``.

    Args:
        dataset: Collection whose samples receive the cluster IDs.
        cluster_dict: Mapping ``cluster_id -> list of sample ids``.
        field_name: Name of the sample field to write.
    """
    # Invert the mapping: one entry per sample, pointing at its cluster.
    cluster_by_sample = {}
    for cluster_id, members in cluster_dict.items():
        for sample_id in members:
            cluster_by_sample[sample_id] = cluster_id

    # One bulk write, matching the dict keys against the samples' "id" field.
    dataset.set_values(field_name, cluster_by_sample, key_field="id")

## Train test split without data leakage

In [146]:
import random
from typing import Dict, List, Union

class ClusterDataSelector:
    """
    Split clustered sample IDs into "train", "test", "eliminated" and "unknown".

    The class works on the output of a clustering step, where each cluster is
    a list of sample IDs. The smallest clusters provide the test samples; the
    other members of those clusters are eliminated so that near-duplicates of
    a test sample are not used for training. All remaining clusters are train.

    Selection Rules:
    - If the number of clusters is less than `cluster_threshold`, all samples are assigned to "unknown".
    - Otherwise:
        1. Clusters are sorted by size (ascending; ties keep their input order).
        2. The first ``int(n_clusters * test_fraction)`` clusters are selected.
        3. For each selected cluster, one random sample is chosen for "test"; the others are "eliminated".
        4. Remaining clusters are labeled as "train".

    Attributes:
        cluster_dict (Dict[int, List[str]]):
            Dictionary mapping cluster IDs to lists of sample IDs.
        cluster_threshold (int):
            Minimum number of clusters required to proceed with selection (default is 5).
        test_fraction (float):
            Fraction of clusters to be used for test + eliminated (default is 0.2).
            Must lie in [0, 1].
        seed (int):
            Random seed for reproducibility (default is 42).

    Methods:
        select() -> Dict[str, List[str]]:
            Applies the selection rule and returns a dictionary with keys:
            "train", "test", "eliminated", and "unknown", each mapping to a list of sample IDs.

    Raises:
        ValueError: If `test_fraction` is outside [0, 1].

    Example:
        cluster_dict = {
            0: ['img1', 'img2'],
            1: ['img3'],
            2: ['img4', 'img5', 'img6'],
            ...
        }

        selector = ClusterDataSelector(cluster_dict, cluster_threshold=5, test_fraction=0.3)
        split_dict = selector.select()

        # Output:
        # {
        #     'train': [...],
        #     'test': [...],
        #     'eliminated': [...],
        #     'unknown': [...]
        # }
    """
    def __init__(
        self,
        cluster_dict: Dict[int, List[str]],
        cluster_threshold: int = 5,
        test_fraction: float = 0.2,
        seed: int = 42
    ):
        # A negative fraction would turn the slice in select() into
        # "everything except the last k clusters", silently producing a wrong split.
        if not 0 <= test_fraction <= 1:
            raise ValueError(f"test_fraction must be between 0 and 1, got {test_fraction}")

        self.cluster_dict = cluster_dict
        self.cluster_threshold = cluster_threshold
        self.test_fraction = test_fraction
        self.seed = seed
        # Private generator: yields the same picks as seeding the module-level
        # generator with `seed`, but leaves the process-wide `random` state alone.
        self._rng = random.Random(seed)

    def select(self) -> Dict[str, List[str]]:
        """Apply the selection rules and return the four ID lists.

        Returns:
            Dict with keys "train", "test", "eliminated" and "unknown".
        """
        total_clusters = len(self.cluster_dict)

        if total_clusters < self.cluster_threshold:
            # Too few clusters: everything is unknown.
            all_ids = [id_ for ids in self.cluster_dict.values() for id_ in ids]
            return {"train": [], "test": [], "eliminated": [], "unknown": all_ids}

        # Sort clusters by size (least to most populated).
        sorted_clusters = sorted(self.cluster_dict.items(), key=lambda item: len(item[1]))
        num_test_clusters = int(total_clusters * self.test_fraction)

        test_clusters = sorted_clusters[:num_test_clusters]
        train_clusters = sorted_clusters[num_test_clusters:]

        train_ids: List[str] = []
        test_ids: List[str] = []
        eliminated_ids: List[str] = []

        # Test clusters: one sample for test, the rest eliminated.
        for _, ids in test_clusters:
            if not ids:
                continue
            test_id = self._rng.choice(ids)
            test_ids.append(test_id)
            eliminated_ids.extend([id_ for id_ in ids if id_ != test_id])

        # Train clusters: every sample goes to train.
        for _, ids in train_clusters:
            train_ids.extend(ids)

        return {
            "train": train_ids,
            "test": test_ids,
            "eliminated": eliminated_ids,
            "unknown": []
        }


In [147]:
def assign_split_flags(dataset: fo.Dataset, split_dict: Dict[str, List[str]], field_name: str = "split"):
    """Write the split name of every sample into ``field_name``.

    Every sample of ``dataset`` is first set to "unspecified"; samples listed
    in ``split_dict`` are then overwritten with the name of their split.

    Args:
        dataset: Collection whose samples receive the split flag.
        split_dict: Mapping ``split name -> list of sample ids``.
        field_name: Name of the sample field to write.
    """
    # Baseline value for all samples of the collection.
    sample_count = len(dataset)
    dataset.set_values(field_name, ["unspecified" for _ in range(sample_count)])

    # One keyed bulk write per split, matched on the samples' "id" field.
    for split_name in split_dict:
        flags_by_id = dict.fromkeys(split_dict[split_name], split_name)
        dataset.set_values(field_name, flags_by_id, key_field="id")


# Start the pipeline

In [155]:
# Configure Google Drive access and the per-user project roots.


# Mount calls used by the two collaborators (kept for reference):
#   shahab: drive.mount('/content/drive')
#   davide: drive.mount('/gdrive')

# Maps a user name to the root folder of that user's copy of the project.
project_path = {"shahab" : Path("/content/drive/MyDrive/DataScience/Jaguars_Project") ,
             "davide" : Path("/gdrive/MyDrive/DSR/Jaguars_Project/")}

# NOTE(review): this mount point matches the "shahab" root only; the "davide"
# root lives under /gdrive, so the mount call presumably has to change when
# `user` is switched. Confirm before running as another user.
drive.mount("/content/drive/")


# Selects which entry of `project_path` the following cells use.
user = "shahab"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [156]:
# Folders of the active user: the cropped body images and the stored dataset.
image_dir = project_path[user].joinpath("images", "cropped_body")
input_dir = project_path[user].joinpath("datasets", "dataset_filtered")

# Load the stored FiftyOne dataset; `rel_dir` points at the image folder.
dataset = fo.Dataset.from_dir(
    dataset_type=fo.types.FiftyOneDataset,
    dataset_dir=str(input_dir),
    rel_dir=image_dir,
)

Importing samples...


INFO:fiftyone.utils.data.importers:Importing samples...


 100% |███████████████| 3598/3598 [933.9ms elapsed, 0s remaining, 3.9K samples/s]      


INFO:eta.core.utils: 100% |███████████████| 3598/3598 [933.9ms elapsed, 0s remaining, 3.9K samples/s]      


In [157]:
# Remove badly segmented images whose sample IDs were stored in a pickle file.
bad_segmentations_path = project_path[user] / "datasets/wrong_segmentations_ids/bad_body_segmentations.pkl"
# The context manager closes the file handle as soon as the IDs are loaded.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(bad_segmentations_path, "rb") as bad_segmentations_file:
    bad_segmentations_ids = pickle.load(bad_segmentations_file)
dataset.delete_samples(bad_segmentations_ids)

# Additional badly segmented images, listed by image file name.
bad_segmentations = [
"A_Solar_17.jpg", "A_Saseka_117.jpg", "A_Saseka_79.jpg", "A_Saseka_80.jpg", "A_Saseka_60.jpg", "A_Saseka_61.jpg", "A_Saseka_62.jpg", "A_Saseka_63.jpg",
"A_Saseka_64.jpg", "A_Saseka_15.jpg", "A_Saseka_81.jpg", "A_Saseka_84.jpg", "A_Saseka_85.jpg", "A_Saseka_86.jpg", "A_Saseka_87.jpg",
"A_Saseka_88.jpg", "A_Saseka_89.jpg", "A_Saseka_90.jpg", "A_Patricia_10.jpg", "A_Patricia_11.jpg", "A_Patricia_12.jpg", "A_Overa_1.jpg", "A_Overa_2.jpg",
"A_Overa_5.jpg", "A_Overa_6.jpg", "A_Ousado_115.jpg", "A_Ousado_114.jpg", "A_Medrosa_155.jpg", "A_Medrosa_156.jpg", "A_Medrosa_124.jpg", "A_Medrosa_125.jpg", "A_Medrosa_107.jpg", "A_Medrosa_85.jpg", "A_Medrosa_86.jpg",
"A_Medrosa_75.jpg", "A_Medrosa_76.jpg", "A_Medrosa_57.jpg", "A_Medrosa_58.jpg", "A_Marcela_287.jpg", "A_Marcela_288.jpg", "A_Marcela_289.jpg", "A_Marcela_290.jpg", "A_Marcela_291.jpg",
"A_Marcela_284.jpg", "A_Marcela_285.jpg", "A_Marcela_188.jpg", "A_Marcela_189.jpg", "A_Marcela_190.jpg", "A_Marcela_191.jpg", "A_Marcela_192.jpg", "A_Marcela_193.jpg", "A_Marcela_194.jpg",
"A_Marcela_195.jpg", "A_Marcela_196.jpg", "A_Marcela_197.jpg", "A_Marcela_198.jpg", "A_Marcela_199.jpg", "A_Marcela_200.jpg", "A_Marcela_201.jpg",
"A_Marcela_202.jpg", "A_Marcela_203.jpg", "A_Marcela_204.jpg", "A_Marcela_205.jpg", "A_Marcela_206.jpg", "A_Marcela_207.jpg", "A_Marcela_208.jpg",
"A_Marcela_209.jpg", "A_Marcela_210.jpg", "A_Marcela_165.jpg", "A_Marcela_80.jpg", "A_Marcela_28.jpg", "A_Lua_148.jpg", "A_Lua_149.jpg", "A_Kyyavera_8.jpg", "A_Kyyavera_9.jpg", "A_Kyyavera_10.jpg", "A_Kyyavera_11.jpg",
"A_Kwang_201.jpg", "A_Kwang_202.jpg", "A_Kwang_26.jpg", "A_Kwang_27.jpg", "A_Kwang_28.jpg", "A_Kwang_29.jpg", "A_Kwang_30.jpg", "A_Kwang_31.jpg", "A_Kwang_32.jpg", "A_Kwang_33.jpg",
"A_Katniss_18.jpg", "A_Katniss_17.jpg", "A_Katniss_1.jpg", "A_Katniss_2.jpg", "A_Kamaikua_200.jpg", "A_Kamaikua_201.jpg", "A_Kamaikua_202.jpg", "A_Kamaikua_203.jpg", "A_Kamaikua_1.jpg", "A_Kamaikua_2.jpg",
"A_Jaju_201.jpg", "A_Jaju_153.jpg", "A_Jaju_90.jpg", "A_Jaju_76.jpg", "A_Ipepo_40.jpg", "A_Ipepo_41.jpg", "A_Ipepo_44.jpg", "A_Ipepo_45.jpg",
"A_Ipepo_46.jpg", "A_Ipepo_47.jpg", "A_Ipepo_48.jpg", "A_Ipepo_49.jpg", "A_Ipepo_50.jpg", "A_Ipepo_51.jpg", "A_Ipepo_52.jpg", "A_Ipepo_53.jpg", "A_Ipepo_54.jpg",
"A_Ipepo_55.jpg", "A_Ipepo_56.jpg", "A_Ipepo_57.jpg", "A_Ipepo_58.jpg", "A_Ipepo_59.jpg", "A_Ipepo_60.jpg", "A_Ipepo_61.jpg", "A_Ipepo_62.jpg", "A_Ipepo_63.jpg",
"A_Ipepo_64.jpg", "A_Ipepo_65.jpg", "A_Ipepo_66.jpg", "A_Ipepo_67.jpg", "A_Ipepo_68.jpg", "A_Ipepo_69.jpg", "A_Ipepo_70.jpg", "A_Ipepo_71.jpg", "A_Ipepo_72.jpg",
"A_Ipepo_73.jpg", "A_Ipepo_74.jpg", "A_Ipepo_75.jpg", "A_Ipepo_76.jpg", "A_Ipepo_77.jpg", "A_Ipepo_78.jpg", "A_Ipepo_79.jpg", "A_Ipepo_80.jpg",
"A_Ipepo_81.jpg", "A_Ipepo_82.jpg", "A_Inka_46.jpg", "A_Bororo_21.jpg", "A_Bororo_22.jpg", "A_Bororo_23.jpg", "A_Bororo_24.jpg", "A_Bororo_25.jpg", "A_Bororo_26.jpg", "A_Bororo_27.jpg",
"A_Bororo_17.jpg", "A_Bororo_18.jpg", "A_Bororo_5.jpg", "A_Bororo_1.jpg", "A_Bernard_35.jpg", "A_Bagua_123.jpg", "A_Bagua_124.jpg", "A_Bagua_125.jpg", "A_Bagua_120.jpg", "A_Bagua_100.jpg", "A_Bagua_101.jpg",
"A_Apeiara_23.jpg", "A_Apeiara_26.jpg", "A_Abril_12.jpg", "P_Hero_9.png"
]

# Delete every sample whose `metadata.image_name` is in the list above.
# NOTE(review): the recorded import log and export log of this notebook both
# report 3598 samples, so it is worth confirming that this filter (and the
# ID-based deletion before it) actually removes samples, e.g. that
# `metadata.image_name` is populated on this dataset.
dataset.delete_samples(
    dataset.match(
        fo.ViewField("metadata.image_name").is_in(bad_segmentations)
    )
)

In [18]:
# Display the dataset summary (name, sample count, field schema) as the cell output.
dataset

Name:        2025.04.08.12.58.04
Media type:  image
Num samples: 3598
Persistent:  False
Tags:        []
Sample fields:
    id:                  fiftyone.core.fields.ObjectIdField
    filepath:            fiftyone.core.fields.StringField
    tags:                fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:            fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    created_at:          fiftyone.core.fields.DateTimeField
    last_modified_at:    fiftyone.core.fields.DateTimeField
    ground_truth:        fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Classification)
    prediction:          fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
    segmentations_body:  fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
    bboxes_head:         fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
    segmentations_head:  fiftyone.core

In [158]:
# Collect the label of every sample and list the distinct individuals.
all_labels = dataset.values("ground_truth.label")
# Sorted so the per-individual loop runs in the same order on every kernel
# (iteration order of a set of strings can differ between Python processes).
# Samples without a label (None) are left out of the list of individuals.
labels = sorted({label_name for label_name in all_labels if label_name is not None})
# Number of images per individual, shown as the cell output.
pd.Series(all_labels).value_counts()

Unnamed: 0,count
Medrosa,326
Marcela,318
Ousado,311
Kwang,242
Lua,181
Jaju,171
Kamaikua,165
Ti,158
Saseka,125
Benita,113


In [159]:
from fiftyone import ViewField as F

# Sample field that holds the stored embedding of each image.
embedding_name = "dinov2_embedding_v2"

# Tunable parameters of the split.
COSINE_THRESHOLD = 0.9   # minimum cosine similarity to join a cluster
CLUSTER_THRESHOLD = 5    # individuals with fewer clusters are flagged "unknown"
TEST_FRACTION = 0.3      # fraction of (smallest) clusters used for test + eliminated

for label in labels :

    print(f"working on {label}")

    # Restrict the dataset to the images of this individual ...
    filtered_view = dataset.filter_labels("ground_truth", F("label") == label)

    # ... and keep only the samples that have an embedding stored.
    view = filtered_view.match(F(embedding_name) != None)

    # Embeddings and sample IDs are read from the same view.
    embeddings = np.array(view.values(embedding_name))
    sample_ids = view.values("id")

    print (f"number of images for {label} : {len(sample_ids)}")

    # Without embeddings there is nothing to cluster or split for this individual.
    if not sample_ids:
        print(f"no embeddings found for {label}, skipping")
        continue

    # Group near-duplicate images by cosine similarity and store the cluster IDs.
    clusterer = ImageClusterer(embeddings, sample_ids)
    cosine_clusters = clusterer.cosine_similarity_clustering(threshold=COSINE_THRESHOLD)
    assign_clusters_to_fiftyone(view, cosine_clusters , field_name="cluster_cosine_similarity")

    print( f" number of total clusters {len(cosine_clusters)} for {label}")

    # Choose test / eliminated / train samples from the clusters.
    selector = ClusterDataSelector(
        cosine_clusters,
        cluster_threshold=CLUSTER_THRESHOLD,
        test_fraction=TEST_FRACTION,
    )
    selection_result = selector.select()

    print( f"number of train images {len(selection_result['train'])}")
    print( f"number of test images {len(selection_result['test'])}")
    print( f"number of eliminated images {len(selection_result['eliminated'])}")
    print( f"number of unknown images {len(selection_result['unknown'])}")

    # Store the split flag on the samples of this individual.
    assign_split_flags(view, selection_result, field_name="testtrainsplit_cosine_similarity")








working on Kwang
number of images for Kwang : 242
 number of total clusters 35 for Kwang
number of train images 227
number of test images 10
number of eliminated images 5
working on Lua
number of images for Lua : 181
 number of total clusters 12 for Lua
number of train images 177
number of test images 3
number of eliminated images 1
working on Kyyavera
number of images for Kyyavera : 82
 number of total clusters 3 for Kyyavera
number of train images 0
number of test images 0
number of eliminated images 0
working on Ague
number of images for Ague : 6
 number of total clusters 2 for Ague
number of train images 0
number of test images 0
number of eliminated images 0
working on Tomas
number of images for Tomas : 97
 number of total clusters 7 for Tomas
number of train images 94
number of test images 2
number of eliminated images 1
working on Pixana
number of images for Pixana : 81
 number of total clusters 9 for Pixana
number of train images 78
number of test images 2
number of eliminated 

In [None]:
# Open the FiftyOne App on the dataset to inspect the cluster and split fields written above.
fo.launch_app(dataset)

In [161]:
# Export the dataset, including the new cluster and split fields, as version 16.
base_dir = project_path[user].joinpath("images", "cropped_body")
storage_dir = project_path[user].joinpath("datasets", "version_16")

dataset.export(
    dataset_type=fo.types.FiftyOneDataset,
    # Directory to save the dataset
    export_dir=str(storage_dir),
    rel_dir=base_dir,
    export_media=False,
)

Exporting samples...


INFO:fiftyone.utils.data.exporters:Exporting samples...


 100% |██████████████████| 3598/3598 [8.7s elapsed, 0s remaining, 710.5 docs/s]      


INFO:eta.core.utils: 100% |██████████████████| 3598/3598 [8.7s elapsed, 0s remaining, 710.5 docs/s]      
