<a href="https://colab.research.google.com/github/DavidePanza/ml-jaguar-identification/blob/main/notebooks/02_dataset_creation/dino2_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
import torch
from pathlib import Path

!pip install fiftyone -q
import fiftyone as fo

from google.colab import drive
drive.mount('/gdrive')

# Make the project's `src` directory importable so that the dino2_utils
# import that follows can be resolved. The project root is taken to be two
# directory levels above the current working directory.
notebook_dir = str(Path.cwd().parent.parent)
src_path = str(Path(notebook_dir) / 'src')
sys.path.append(src_path)
from dino2_utils import DINOv2ArcFace, pad_to_square, setup_transform, get_embedding

# Overview
This pipeline is designed to extract embeddings from a set of images stored in a FiftyOne dataset using the DINOv2 model. The embeddings are computed for each image, then saved back to the dataset. These embeddings can be used for further analysis such as clustering, similarity-based retrieval, or training other models for downstream tasks.

This process involves:

1. Loading the FiftyOne dataset containing images.

2. Filtering the dataset to obtain a subset for processing.

3. Setting up the DINOv2 model and preparing it for embedding extraction.

4. Transforming images using a custom preprocessing pipeline.

5. Extracting embeddings from the images and saving them to the dataset.

6. Exporting the dataset with embeddings for future use.

# Load the Dataset
The embeddings will be extracted from the images in the uploaded FiftyOne (FO) dataset.


In [None]:
# Placeholder locations: replace with the real image folder and the folder
# holding the FiftyOne dataset before running this cell.
image_dir = Path('path/to/your/images')
input_dir = Path('path/to/your/fo_dataset')

# Import the dataset in FiftyOneDataset format; the image folder is passed
# as `rel_dir`.
import_options = dict(
    dataset_dir=str(input_dir),
    dataset_type=fo.types.FiftyOneDataset,
    rel_dir=image_dir,
)
dataset = fo.Dataset.from_dir(**import_options)

# Keep only the samples assigned to the train or test split
# (the train/test set of known jaguars).
split_filter = {"testtrainsplit_cosine_similarity": {"$in": ["train", "test"]}}
filtered_dataset = dataset.match(split_filter)

# Initialise the Model
Set up the compute device (CUDA if available, otherwise CPU) and load the DINOv2ArcFace model.
If a pretrained model is available and `use_pretrained_model` is set to True, load its weights from the given path.

In [None]:
# Choose the compute device: CUDA when available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the DINOv2ArcFace model in "embeddings" mode, then move it to the device
model = DINOv2ArcFace(usage="embeddings")
model = model.to(device)

# Optionally restore weights from a checkpoint (switched off by default)
use_pretrained_model = False
if use_pretrained_model:
    model_path = "path/to/your/model.pth"
    # The checkpoint is read with map_location="cpu" before being loaded
    # into the model.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))


# Extract Embeddings
Extract embeddings from the filtered dataset using the pretrained DINOv2 model.  
* Each image is transformed, passed through the model, and the resulting embedding is saved back to the dataset.  
* The saved embedding is the CLS token from the last hidden state, projected to a lower-dimensional space (512 dimensions) and normalised.


In [None]:
# initialise the preprocessing transform and put the model in eval mode
transform = setup_transform(use_padding=True)
model.eval()

# Count the samples once up front instead of calling len() on the view
# again at every progress report inside the loop.
num_samples = len(filtered_dataset)

# get embeddings
# iter_samples(autosave=True) lets FiftyOne save the sample edits in batches,
# which replaces the explicit per-sample sample.save() call.
for idx, sample in enumerate(filtered_dataset.iter_samples(autosave=True)):
    if idx % 100 == 0:
        print(f"Processing sample {idx}/{num_samples}")
    # Store the embedding on the sample in the "dinov2_embedding" field
    sample["dinov2_embedding"] = get_embedding(sample.filepath, model, transform)

# Store Dataset

In [None]:
# Export the filtered view, including the new embedding field, in
# FiftyOneDataset format. Replace the placeholder with the target folder.
storage_dir = Path('path/to/your/fo_dataset')
storage_dir.mkdir(parents=True, exist_ok=True)

export_options = dict(
    # Folder that receives the exported dataset
    export_dir=str(storage_dir),
    dataset_type=fo.types.FiftyOneDataset,
    # Media export is switched off; the image folder is passed as `rel_dir`
    export_media=False,
    rel_dir=image_dir,
)
filtered_dataset.export(**export_options)