# Embedding workflow using DINOv2

This notebook focuses on the **Feature Extraction** pipeline. 

We utilize the fine-tuned model **ViTD2PC24All** ([DINOv2](https://dinov2.metademolab.com/)) to extract high-dimensional embeddings from the single-label train images and multi-label test images.

We'll **visualize**, **tile**, and **process** these embeddings to support patch-wise multi-label inference using PyTorch and Faiss.

![diagram](../images/pytorch-webinar-diagram.png)

In [1]:
# !uv pip list | grep pydantic
# !uv pip install meerkat-ml
# !which pip

# !uv pip install pyspark -v

In [2]:
%load_ext autoreload
%autoreload 2

## Load the Parquet dataset from disk and visualize the images

In [3]:
import pandas as pd

# Keep DataFrame previews compact: two decimals, at most ten rows.
pd.options.display.precision = 2
pd.options.display.max_rows = 10

# Every dataset location is derived from one base directory so the paths
# cannot drift apart when the project is relocated.
# NOTE(review): the base is an absolute, machine-specific path — consider
# making it configurable before sharing the notebook.
base_data_dir = "/teamspace/studios/this_studio/plantclef-vision/data"
root_dir = f"{base_data_dir}/plantclef2025"
dataset_dir = (
    f"{root_dir}/competition-metadata"
    "/PlantCLEF2025_test_images/PlantCLEF2025_test_images"
)
hf_dataset_dir = f"{base_data_dir}/parquet/plantclef2025/full_test/HF_dataset"

In [4]:
# from plantclef.pytorch.data import HFPlantDataset
from torchvision import transforms
from typing import Callable
import torch



In [5]:
# def transform_dict(transforms: Callable, key: str) -> Callable:
#     """Apply transformation to a specific key in the dataset."""

#     def transform_fn(row):
#         row[key] = [transforms(image) for image in row[key]]
#         return row

#     return transform_fn


# def create_transform(image_size: int, key: Optional[str] = None) -> Callable:
#     """Create image transformation pipeline that maintains aspect ratio."""
#     transform_list = [
#         # transforms.ToPILImage(),
#         transforms.Resize(
#             image_size, max_size=image_size + 2
#         ),  # Maintains aspect ratio
#         transforms.CenterCrop(image_size),
#         transforms.ToTensor(),
#     ]
#     transform_list = transforms.Compose(transform_list)
#     if key is not None:
#         return transform_dict(transform_list, key)
#     return transform_list

## Running torch_pipeline with HFPlantDataset

In [43]:
from plantclef.embed.workflow import torch_pipeline
import os


class Config:
    """Settings for the tiled test-set embedding run.

    The class attributes are the defaults. Keyword arguments given to the
    constructor override them on the instance, and the output locations are
    then derived from the (possibly overridden) ``root_dir`` and
    ``grid_size``. A derived path that is passed explicitly is kept as given.

    Raises:
        TypeError: If a keyword argument does not name an existing setting.
    """

    use_grid: bool = True
    grid_size: int = 3
    image_size: int = 546
    batch_size: int = 4
    cpu_count: int = os.cpu_count() or 1  # os.cpu_count() can return None
    top_k: int = 5

    root_dir: str = "/teamspace/studios/this_studio/plantclef-vision/data/plantclef2025"
    dataset_dir: str = "/teamspace/studios/this_studio/plantclef-vision/data/plantclef2025/competition-metadata/PlantCLEF2025_test_images/PlantCLEF2025_test_images"
    hf_dataset_dir: str = "/teamspace/studios/this_studio/plantclef-vision/data/parquet/plantclef2025/full_test/HF_dataset"

    # Derived in __init__; they are None only on the class itself.
    embeddings_dir: str = None
    test_embeddings_dir: str = None
    folder_name: str = None
    test_embeddings_path: str = None

    def __init__(self, **overrides):
        # Previously every argument was forwarded to object.__init__, which
        # rejects all of them, so no setting could be changed per instance.
        cls = type(self)
        unknown = sorted(
            key
            for key in overrides
            if key.startswith("_")
            or not hasattr(cls, key)
            or callable(getattr(cls, key))
        )
        if unknown:
            raise TypeError(f"Unknown Config setting(s): {', '.join(unknown)}")
        for key, value in overrides.items():
            setattr(self, key, value)

        # Fill in whatever was not supplied explicitly, in dependency order.
        if self.embeddings_dir is None:
            self.embeddings_dir = f"{self.root_dir}/embeddings"
        if self.test_embeddings_dir is None:
            self.test_embeddings_dir = f"{self.embeddings_dir}/full_test"
        if self.folder_name is None:
            self.folder_name = (
                f"test_grid_{self.grid_size}x{self.grid_size}_embeddings"
            )
        if self.test_embeddings_path is None:
            self.test_embeddings_path = (
                f"{self.test_embeddings_dir}/{self.folder_name}"
            )

In [None]:
# cfg = Config()


# def make_predictions_and_save(
#     cfg: Config,
# ):
#     """Make predictions and save them to disk."""
#     # Create the directory if it doesn't exist
#     os.makedirs(cfg.test_embeddings_dir, exist_ok=True)


#     ds = HFPlantDataset(
#         path=cfg.hf_dataset_dir,
#         transform=None,  # model.transform,
#         col_name="image",
#         use_grid=cfg.use_grid,
#         grid_size=cfg.grid_size,
#     )

#     ds.transform = ds.get_transforms(cfg.image_size)


#     embeddings, logits = torch_pipeline(
#         ds,
#         batch_size=cfg.batch_size,
#         use_grid=cfg.use_grid,
#         grid_size=cfg.grid_size,
#         cpu_count=cfg.cpu_count,
#         top_k=cfg.top_k
#     )

#     pred_df = create_predictions_df(
#         ds,
#         embeddings,
#         logits
#     )

#     pred_ds = HFDataset.from_pandas(pred_df)
#     pred_ds.save_to_disk(test_embeddings_path)

In [7]:
from plantclef.pytorch.model import DINOv2LightningModel

# top_k is handed straight to the model constructor.
# presumably the number of top predictions kept per input — confirm in DINOv2LightningModel
top_k = 5
model = DINOv2LightningModel(top_k=top_k)
# Bare last expression: displays the model's preprocessing transform.
model.transform

Compose(
    Resize(size=518, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(518, 518))
    MaybeToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)

## Explore embeddings

In [None]:
# for batch in dataloader:

#     break

# print(type(batch))

# print(batch.shape)

<class 'torch.Tensor'>
torch.Size([4, 9, 3, 182, 182])


### Get embeddings and logits from model.predict_step

In [39]:
# grid_size = cfg.grid_size

# embeddings, logits = model.predict_step(
#             batch, batch_idx=0
#         )

# embeddings, logits = model.predict_grid_step(
#             batch, batch_idx=0
#         )

# print(embeddings.shape)
# print(len(logits))

torch.Size([4, 9, 768])
4


In [40]:
# print(embeddings.shape)
# embeddings = embeddings.view(-1, grid_size**2, 768)
# print(embeddings.shape)
# embeddings = embeddings.view(-1, grid_size**2, 768)

# logits = [
#             logits[i : i + grid_size**2] for i in range(0, len(logits), grid_size**2)
#         ]

torch.Size([4, 9, 768])
torch.Size([4, 9, 768])


In [None]:
# print(embeddings.shape)
# print(len(logits))

torch.Size([4, 9, 768])
4


In [36]:
# print(embeddings.shape)
# print(len(logits))
# print([l.keys() for l in logits])

torch.Size([36, 768])
36


In [38]:
# logits[0]

{'1395063': 0.32624733448028564,
 '1395117': 0.06265220791101456,
 '1664563': 0.02622845023870468,
 '1394850': 0.02485588751733303,
 '1360224': 0.024763811379671097}

In [37]:
# print(f"batch_size -- len(logits): {len(logits)}")
# print(f"grid_size**2 -- len(logits[0]): {len(logits[0])}")
# # logits_img0_tile0 = logits[0][0]
# print(f"top_k -- k = len(list(logits[0][0].keys())): {len(list(logits[0][0].keys()))}")

batch_size -- len(logits): 36
grid_size**2 -- len(logits[0]): 5


KeyError: 0

### Get image names from HFDataset -> Create a pandas DataFrame to match image names to logits + embeddings

In [None]:
# embeddings, logits = model.predict_grid_step(
#             batch, batch_idx=0
#         )

In [12]:
# def create_predictions_df(
#     ds: HFPlantDataset, embeddings: torch.Tensor, logits: list
# ) -> pd.DataFrame:
#     """
#     Accepts an HFPlantDataset and a set of embeddings and logits.

#     To be called after the model has been run on the full dataset in ds.

#     Returns a DataFrame with the following columns:
#         - image_name
#         - tile
#         - embeddings
#         - logits
#     The DataFrame is exploded to have one row per tile.

#     """

#     pred_df = pd.DataFrame({"image_name": ds.dataset["file_path"]})
#     pred_df["image_name"] = pred_df["image_name"].str.rsplit("/", n=1, expand=True)[1]

#     pred_df = pred_df.convert_dtypes()

#     pred_df = pred_df.assign(embeddings=embeddings.cpu().tolist(), logits=logits)
#     explode_df = pred_df.explode(["embeddings", "logits"], ignore_index=True)
#     explode_df = explode_df.assign(tile=explode_df.groupby("image_name").cumcount())

#     return explode_df


# pred_ds = HFDataset.from_pandas(explode_df)
# pred_ds.save_to_disk(test_embeddings_path)

In [21]:
# loaded_ds = Dataset.load_from_disk(test_embeddings_path)
# loaded_ds.features["logits"]

In [43]:
import json
import shutil
import numpy as np


def write_embeddings_to_parquet(
    df: pd.DataFrame,
    folder_name: str,
    num_partitions: int = 20,
):
    """Write an embeddings DataFrame as a partitioned Parquet dataset.

    The dataset goes to ``<parent of cwd>/data/embeddings/<folder_name>``.
    An existing directory at that location is removed first so that stale
    partition files are not mixed with the new ones.

    The input frame is left untouched: the JSON-encoded ``logits`` and the
    ``partition`` column are added to a copy, so re-running the cell does not
    JSON-encode the logits a second time.

    Args:
        df: Frame with a ``logits`` column whose values are JSON-serializable.
        folder_name: Name of the dataset directory to create.
        num_partitions: Upper bound on the number of partitions. Rows are
            split into contiguous runs of ``ceil(len(df) / num_partitions)``
            rows, so fewer partitions may actually be produced.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    # Imported here so the function does not rely on a later cell having
    # brought Path into the notebook namespace.
    from pathlib import Path

    if num_partitions < 1:
        raise ValueError(f"num_partitions must be >= 1, got {num_partitions}")

    # path to data
    root = Path().resolve().parents[0]
    data_path = f"{root}/data/embeddings"
    output_path = f"{data_path}/{folder_name}"

    # remove existing data if it exists to avoid duplication
    if Path(output_path).exists():
        shutil.rmtree(output_path, ignore_errors=True)

    # Integer ceil-division keeps the run length an int (np.ceil yields a
    # float, which is not a valid repeat count); max() covers an empty frame.
    df_size = len(df)
    rows_per_partition = max(1, -(-df_size // num_partitions))

    out_df = df.assign(
        # convert logits to json strings
        logits=df["logits"].apply(json.dumps),
        # partition numbers run from 0 up to at most num_partitions - 1
        partition=np.arange(df_size) // rows_per_partition,
    )

    # write to parquet using the new partition column
    out_df.to_parquet(output_path, partition_cols=["partition"], index=False)

    print(
        f"Embedding dataset written to: {output_path} with {num_partitions} partitions."
    )


# write data
# folder_name = f"test_grid_{GRID_SIZE}x{GRID_SIZE}_embeddings"
# write_embeddings_to_parquet(test_explode_df, folder_name, num_partitions=10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   file_path   4 non-null      string
 1   embeddings  4 non-null      object
 2   logits      4 non-null      object
dtypes: object(2), string(1)
memory usage: 224.0+ bytes


In [None]:
# ds.plot_image_tiles(idx=50)

# Misc below

In [52]:
# ds.dataset = ds.dataset.take(100)
# # extract embeddings
# embeddings, logits = torch_pipeline(
#     dataset=ds,  # .dataset.take(5),
#     batch_size=2,
#     use_grid=True,
#     cpu_count=1,
# )
# embeddings.shape
# grid_size = 3

# embeddings = embeddings.view(-1, grid_size**2, 768)
# embeddings.shape
# import matplotlib.pyplot as plt

# img = ds._get_image_tensor(0)

# plt.imshow(img.permute(1, 2, 0))

In [None]:
def center_crop(image: torch.Tensor) -> torch.Tensor:
    """Crop the largest centered square out of an image tensor.

    The side length is the smaller of the last two dimensions, which
    torchvision treats as height and width (``[..., H, W]``). Reading the
    trailing dimensions keeps channel and batch sizes out of the calculation,
    so ``(C, H, W)`` inputs behave as before and ``(N, C, H, W)`` batches are
    no longer cropped to the channel count.

    Args:
        image: Tensor whose last two dimensions are height and width.

    Returns:
        The center-cropped tensor with equal height and width.

    Raises:
        ValueError: If ``image`` has fewer than two dimensions.
    """
    if image.ndim < 2:
        raise ValueError(
            f"center_crop expects at least 2 dimensions, got shape {tuple(image.shape)}"
        )
    side = min(image.shape[-2:])
    return transforms.CenterCrop(side)(image)

In [32]:
import torch

## Save huggingface test set to disk

In [20]:
# image_list = collect_image_filepaths(dataset_dir)

# ds = Dataset.from_dict({"image": image_list})
# ds = ds.cast_column("image", Image())

# ds.save_to_disk(hf_dataset_dir)

Collecting file paths in /teamspace/studios/this_studio/plantclef-vision/data/plantclef2025/competition-metadata/PlantCLEF2025_test_images/PlantCLEF2025_test_images: 100%|██████████| 2105/2105 [00:00<00:00, 837905.47it/s]
Walking through dir /teamspace/studios/this_studio/plantclef-vision/data/plantclef2025/competition-metadata/PlantCLEF2025_test_images/PlantCLEF2025_test_images: 1it [00:00, 20.22it/s]


Saving the dataset (0/18 shards):   0%|          | 0/2105 [00:00<?, ? examples/s]

In [33]:
# ds_loaded = Dataset.load_from_disk(hf_dataset_dir)

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

In [53]:
def create_transform(image_size: int) -> Callable:
    """Build the resize -> center-crop -> to-tensor preprocessing pipeline.

    Args:
        image_size: Target size passed to both the resize and the center crop.

    Returns:
        A ``transforms.Compose`` applying the three steps in order.
    """
    # Resize gets an int size plus max_size=image_size + 2, so the aspect
    # ratio is preserved.
    # NOTE(review): for clearly non-square inputs max_size presumably caps the
    # longer edge and leaves the shorter edge below image_size, in which case
    # the crop that follows would pad — confirm this is intended.
    resize = transforms.Resize(image_size, max_size=image_size + 2)
    crop = transforms.CenterCrop(image_size)
    to_tensor = transforms.ToTensor()
    return transforms.Compose([resize, crop, to_tensor])

In [44]:
# dataset = ds_loaded

# Misc below

In [None]:
import torch
from plantclef.config import get_device
import pandas as pd
from pathlib import Path

# Report the PyTorch version and the device chosen by the project's
# get_device() helper.
print(f"PyTorch Version: {torch.__version__}")
device = get_device()
print(f"Using device: {device}")


# Parent of the current working directory.
# presumably the project root when the notebook is run from its own folder — confirm
root = Path().resolve().parents[0]
print(root)
# Shell escape: print the current date/time.
! date

In [1]:
from pathlib import Path
import os


# Directory for the Parquet version of the full test set; created if missing.
test_parquet_output_dir = "/teamspace/studios/this_studio/plantclef-vision/data/parquet/plantclef2025/full_test"
os.makedirs(test_parquet_output_dir, exist_ok=True)

# NOTE(review): `root` is also assigned in the preceding cell, as the parent
# of the working directory (a Path). Here it is rebound to the raw-data
# directory as a plain string, so later uses of `root` depend on which cell
# ran last.
root = "/teamspace/studios/this_studio/plantclef-vision/data/plantclef2025"
# Folder holding the test images, built by string concatenation onto root.
test_image_dir = (
    root + "/competition-metadata/PlantCLEF2025_test_images/PlantCLEF2025_test_images"
)

### Extracting embeddings from single-label training images

We extract embeddings from a small subset of training images to validate our pipeline.  
We don't perform tiling on the train images (we use the full image) and extract 768-dimensional ViT embeddings.

In [None]:
# Placeholder: an empty frame stands in for the training subset.
# TODO: load the real subset before running — a later cell selects the
# columns "image_name", "data", "species" and "species_id" from it.
limit_train_df = pd.DataFrame({})

# extract embeddings
# use_grid=False: the training images are not tiled (see the text above).
embeddings, logits = torch_pipeline(
    limit_train_df,
    batch_size=2,
    use_grid=False,
    cpu_count=1,
)

In [None]:
# Display the shape of the embeddings returned by torch_pipeline above.
embeddings.shape

In [None]:
# First embedding: index the first image, then its first entry along the
# second axis, and show a slice of that vector.
embeddings[0][0][:100]  # showing first 100 values out of 768

In [None]:
# Pair each training row's metadata with its embedding (one list per row).
cols = ["image_name", "data", "species", "species_id"]
embeddings_df = limit_train_df[cols].assign(embeddings=embeddings.tolist())
# Preview the first two rows.
embeddings_df.head(2)

In [None]:
from plantclef.plotting import plot_images_from_binary

# Plot the training images stored in the "data" column, labelled by species.
# embeddings_df is the frame built in the previous cell; it must not be
# rebound to an empty DataFrame here, or this plot and the embedding plot
# that follows would have no data to draw.
plot_images_from_binary(
    embeddings_df,
    data_col="data",
    label_col="species",
    grid_size=(1, 2),
    crop_square=True,
    figsize=(8, 4),
)

In [None]:
from plantclef.plotting import plot_embeddings

# Plot the "embeddings" column of embeddings_df, labelled by "species",
# in a 1x2 grid.
plot_embeddings(
    embeddings_df,
    data_col="embeddings",
    label_col="species",
    grid_size=(1, 2),
    figsize=(8, 4),
)

### Embedding test images with tiling (3x3)


Since the test images are high-resolution and contain multiple plant species, we split them into a 3x3 grid of tiles.
- We **extract embeddings** and **top-*K* logits** from each tile using the ViT model.  
- This **patch-wise representation** is critical for enabling multi-label classification.

In [None]:
# set params
USE_GRID = True
GRID_SIZE = 3  # 3x3 grid of tiles
CPU_COUNT = 1  # custom cpu_count
TOP_K = 5  # top-K logits for each tile


# Placeholders: both frames start out empty.
# TODO: load the real test-set DataFrame into test_df before running — the
# filter below reads test_df["image_name"], which an empty frame lacks.
test_df = pd.DataFrame({})
test_image_df = pd.DataFrame({})

# select images from test set
image_names = ["CBN-Pyr-03-20230706.jpg", "CBN-can-E6-20230706.jpg"]
test_image_df = test_df[test_df["image_name"].isin(image_names)]

# get embeddings and logits
embeddings, logits = torch_pipeline(
    test_image_df,
    batch_size=2,
    use_grid=USE_GRID,
    grid_size=GRID_SIZE,
    cpu_count=CPU_COUNT,
    top_k=TOP_K,
)

In [None]:
# Shape of the embeddings for the two selected test images; the trailing
# comment records the expected value (images, tiles, features).
embeddings.shape  # (2, 9, 768)

In [14]:
# create embeddings dataframe
def explode_embeddings_logits(
    df: pd.DataFrame,
    embeddings: torch.Tensor,
    logits: list,
    cols: list = ["image_name", "data"],
) -> pd.DataFrame:
    """Expand per-image embeddings and logits into one row per tile.

    Args:
        df: Source frame; must contain ``image_name`` and every name in ``cols``.
        embeddings: Tensor whose first dimension matches the rows of ``df``;
            each row's nested list is exploded along its first level.
        logits: One list per row of ``df``, the same length as that row's
            embeddings list.
        cols: Columns of ``df`` to carry over to the result.

    Returns:
        A new frame with ``cols``, ``embeddings``, ``logits`` and ``tile``,
        where ``tile`` counts rows within each ``image_name`` from 0.
    """
    # Attach the per-image lists to a copy of the selected columns.
    per_image = df[cols].assign(
        embeddings=embeddings.cpu().tolist(),
        logits=logits,
    )
    # Explode both list columns in lockstep and renumber the rows.
    per_tile = per_image.explode(["embeddings", "logits"], ignore_index=True)
    # Tile number = running count within each image.
    return per_tile.assign(tile=per_tile.groupby("image_name").cumcount())

In [None]:
# One row per tile for the selected test images.
explode_df = explode_embeddings_logits(test_image_df, embeddings, logits)
# Preview nine rows — with a 3x3 grid that is every tile of the first image.
explode_df.head(9)

In [None]:
from plantclef.plotting import plot_image_tiles

# show image tiles
# Image bytes are read from the "data" column; grid_size=3 matches the 3x3
# tiling used when the embeddings were extracted.
plot_image_tiles(
    explode_df,
    data_col="data",
    grid_size=3,
)

In [None]:
from plantclef.plotting import plot_embed_tiles

# Plot the per-tile embeddings from the "embeddings" column, using the same
# 3x3 grid as the tiling.
plot_embed_tiles(
    explode_df,
    data_col="embeddings",
    grid_size=3,
    figsize=(15, 8),
)

In [None]:
# plot grid embeddings
# Each panel is labelled with its tile number (label_col="tile").
plot_embeddings(
    explode_df,
    data_col="embeddings",
    label_col="tile",
    grid_size=(3, 3),
    figsize=(8, 8),
)

### Analyzing classifier logits per tile

For each tile, we look at the **top predicted species** and associated confidence scores (`logits`).  
This helps interpret how confident the model is in identifying species in each patch.

In [None]:
# Number of entries in the logits list returned by torch_pipeline.
print(f"Length logits: {len(logits)}")

In [None]:
# display logits of first tile
# (row 0 of the exploded frame, i.e. tile 0 of the first image)
explode_df["logits"].iloc[0]

In [None]:
# display logits for each tile of the first image
# A loop-local name is used on purpose: rebinding `logits` here would replace
# the list returned by torch_pipeline and break a re-run of the earlier cells.
# Slicing also copes with frames that hold fewer than GRID_SIZE**2 rows.
for tile_number, tile_logits in enumerate(
    explode_df["logits"].iloc[: GRID_SIZE**2], start=1
):
    rounded = {label: round(score, 3) for label, score in tile_logits.items()}
    print(f"Tile {tile_number}: {rounded}")

### Embedding the entire test set with tiling

We scale up our embedding pipeline to process the full test dataset using **3x3 tiling**.  
This prepares the data for the downstream tasks of efficient **nearest neighbor search** and **multi-label prediction** at the tile level.

In [None]:
import os

# Informational only: the pipeline call in the next cell uses CPU_COUNT = 1,
# not this value. os.cpu_count() can return None.
cpu_count = os.cpu_count()
print(f"CPU count: {cpu_count}")

In [None]:
# params
USE_GRID = True
GRID_SIZE = 3  # 3x3 grid of tiles
CPU_COUNT = 1  # custom cpu_count
TOP_K = 5  # top-K logits for each tile

# get embeddings and logits
# Runs on the whole of test_df rather than the two-image selection used above.
test_embeddings, test_logits = torch_pipeline(
    test_df,
    batch_size=10,  # 10 images per batch
    use_grid=USE_GRID,
    grid_size=GRID_SIZE,
    cpu_count=CPU_COUNT,
    top_k=TOP_K,
)

In [None]:
# Sanity check: embeddings shape and number of logits entries for the test set.
print(test_embeddings.shape)
print(len(test_logits))

In [25]:
# explode full embeddings and logits
# Uses the helper's default columns ("image_name", "data").
test_explode_df = explode_embeddings_logits(
    test_df,
    test_embeddings,
    test_logits,
)

In [None]:
# Shape of the exploded frame, then a nine-row preview.
print(test_explode_df.shape)
test_explode_df.head(9)

In [None]:
# Same tile plot as for the two-image sample, now on the full exploded
# test frame (default figure size).
plot_embed_tiles(
    test_explode_df,
    data_col="embeddings",
    grid_size=3,
)

### Saving test embeddings and logits to Parquet

We serialize the full test embeddings into partitioned Parquet files for later use in inference pipelines.  
The logits are stored as JSON strings for flexibility.

In [None]:
# def write_embeddings_to_parquet(
#     df: pd.DataFrame,
#     folder_name: str,
#     num_partitions: int = 20,
# ):
#     # path to data
#     root = Path().resolve().parents[0]
#     data_path = f"{root}/data/embeddings"
#     output_path = f"{data_path}/{folder_name}"

#     # remove existing data if it exists to avoid duplication
#     if Path(output_path).exists():
#         shutil.rmtree(output_path, ignore_errors=True)

#     # convert logits to json strings
#     df["logits"] = df["logits"].apply(json.dumps)

#     # assign partition numbers (0 to num_partitions-1)
#     df_size = len(df)
#     df["partition"] = np.repeat(
#         np.arange(num_partitions), np.ceil(df_size / num_partitions)
#     )[:df_size]

#     # write to parquet using the new partition column
#     df.to_parquet(output_path, partition_cols=["partition"], index=False)

#     print(
#         f"Embedding dataset written to: {output_path} with {num_partitions} partitions."
#     )


# # write data
# folder_name = f"test_grid_{GRID_SIZE}x{GRID_SIZE}_embeddings"
# write_embeddings_to_parquet(test_explode_df, folder_name, num_partitions=10)

## Embedding the full training set (no tiling)

We repeat the embedding process on the **full training dataset**, this time *without tiling*.  
This lets us use the embeddings directly, or as a **transfer learning** starting point, in a Faiss-based nearest neighbor retrieval system.

In [None]:
# params
USE_GRID = False
CPU_COUNT = 1  # custom cpu_count
TOP_K = 5  # top-K logits per image (tiling is off here)

# Placeholder: an empty frame stands in for the full training set.
# TODO: load the real training DataFrame before running — a later cell
# selects "image_name", "data", "species" and "species_id" from it.
train_df = pd.DataFrame({})

# get embeddings and logits
train_embeddings, train_logits = torch_pipeline(
    train_df,
    batch_size=20,  # 20 images per batch
    use_grid=USE_GRID,
    cpu_count=CPU_COUNT,
    top_k=TOP_K,
)

In [None]:
# Sanity check: embeddings shape and number of logits entries for the train set.
print(train_embeddings.shape)
print(len(train_logits))

In [31]:
# explode full embeddings and logits
# The species columns are carried over in addition to the helper's defaults.
train_explode_df = explode_embeddings_logits(
    train_df,
    train_embeddings,
    train_logits,
    cols=["image_name", "data", "species", "species_id"],
)

In [None]:
# Preview the first five rows of the exploded training frame.
train_explode_df.head(5)

In [None]:
from plantclef.plotting import plot_single_image_embeddings

# Plot two training images via the project's plot_single_image_embeddings helper.
plot_single_image_embeddings(
    train_explode_df,
    num_images=2,
    figsize=(8, 10),
)

### Saving the training embeddings to Parquet

Finally, we save the full training embeddings in partitioned Parquet format to support fast, distributed retrieval during inference.

In [None]:
# write data
# The folder name must match the path the read-back cell below loads from.
folder_name = "train_embeddings"
write_embeddings_to_parquet(train_explode_df, folder_name, num_partitions=20)

### Embeddings Ready for Downstream Use

We now have rich ViT embeddings for both train and test datasets, ready for use in:
- Multi-label classification
- Retrieval-based inference
- Nearest Neighbor Search

In [None]:
# path to data
# Resolve the location the same way write_embeddings_to_parquet does (parent
# of the working directory). The global `root` is not used here because an
# earlier cell rebinds it to the raw-data directory string, which would make
# this read point at a different folder than the one that was written.
project_root = Path().resolve().parents[0]
data_path = f"{project_root}/data/embeddings"
# output_path = f"{data_path}/test_grid_3x3_embeddings"
output_path = f"{data_path}/train_embeddings"

train_emb_df = pd.read_parquet(output_path)
print(train_emb_df.shape)
train_emb_df.head(5)

In [None]:
# Read back the tiled test embeddings; data_path comes from the previous cell.
# NOTE(review): the call that writes this dataset is commented out earlier in
# the notebook, so the folder must already exist from a previous run.
output_path = f"{data_path}/test_grid_3x3_embeddings"
test_grid_df = pd.read_parquet(output_path)
print(test_grid_df.shape)
test_grid_df.head(5)