In [None]:
# Mount Google Drive (Colab only); the model/data path constants defined
# further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install -q pytorch-lightning
!pip install -q pytorch-metric-learning

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.0/823.0 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Clone the project repo; the import cell appends its `plantclef/` directory
# to sys.path in order to import the local `config` and `serde` modules.
!git clone https://github.com/murilogustineli/pytorch-plantclef.git

Cloning into 'pytorch-plantclef'...
remote: Enumerating objects: 399, done.[K
remote: Counting objects: 100% (165/165), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 399 (delta 109), reused 80 (delta 68), pack-reused 234 (from 1)[K
Receiving objects: 100% (399/399), 103.53 MiB | 38.09 MiB/s, done.
Resolving deltas: 100% (217/217), done.


In [None]:
import sys
sys.path.append('/content/pytorch-plantclef/plantclef')
from config import get_device, get_class_mappings_file
from serde import deserialize_image
import timm
import torch
import pytorch_lightning as pl
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import time
import gc

from functools import partial
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
from pytorch_metric_learning.samplers import MPerClassSampler

# Every location lives under the same Google Drive dataset folder.
DATA_ROOT = "/content/drive/MyDrive/01_plantclef_datasets"
PARQUET_ROOT = DATA_ROOT + "/parquet"

# model .tar path (the run name appears twice in the checkpoint directory tree)
_MODEL_RUN = "vit_base_patch14_reg4_dinov2_lvd142m_pc24_onlyclassifier_then_all"
MODEL_PATH = f"{DATA_ROOT}/models/pretrained_models/{_MODEL_RUN}/{_MODEL_RUN}/model_best.pth.tar"
TRAIN_PATH = PARQUET_ROOT + "/family_02/"
TEST_PATH = PARQUET_ROOT + "/df_test.parquet"

# NOTE: FAMILY_PATH and TRAIN_PATH point at the same directory.
FAMILY_PATH = PARQUET_ROOT + "/family_02/"
OUTPUT_EMBEDDING_PATH = DATA_ROOT + "/embeddings/embedding_batches/"


ModuleNotFoundError: No module named 'config'

### Custom Dataset/Dataloader

In [None]:
def custom_collate_fn(batch, use_grid):
    """Stack the per-sample tensors of `batch` along a new leading dimension.

    `use_grid` does not alter the operation; it only tells the reader which
    layout to expect: grid samples stack to (B, grid_size**2, C, H, W) and
    single images to (B, C, H, W).
    """
    # torch.stack defaults to dim=0, so a single call serves both layouts.
    return torch.stack(batch)

def custom_collate_fn_partial(use_grid):
    """Bind `use_grid` onto `custom_collate_fn` and return the bound callable.

    A `functools.partial` of a module-level function is used (rather than a
    lambda or closure) to keep the collate function pickle-friendly.
    """
    bound_collate = partial(custom_collate_fn, use_grid=use_grid)
    return bound_collate

class PlantDataset(Dataset):
    """Dataset that decodes images stored as bytes in a DataFrame column.

    Each item is either one transformed image or, when `use_grid` is set, a
    stack of `grid_size**2` transformed tiles cut from that image.

    Args:
        df: DataFrame holding one serialized image per row.
        transform: callable applied to every PIL image/tile. When falsy,
            torchvision's `ToTensor` is applied instead.
        col_name: name of the column containing the image bytes.
        use_grid: split every image into tiles before transforming.
        grid_size: number of tiles per side (grid_size x grid_size tiles).
    """

    def __init__(
        self,
        df,
        transform,
        col_name: str = "image_binary_rep",
        use_grid: bool = False,
        grid_size: int = 4,
    ):
        self.df = df
        self.transform = transform
        self.col_name = col_name
        self.use_grid = use_grid
        self.grid_size = grid_size

    def __len__(self):
        """Number of images, i.e. rows of the wrapped DataFrame."""
        return len(self.df)

    def _split_into_grid(self, image):
        """Crop `image` into grid_size x grid_size equally sized tiles.

        Tiles are emitted column by column: left to right, and top to bottom
        within each column. Tile sizes use integer division, so pixels left
        over on the right/bottom edges are not covered by any tile.
        """
        width, height = image.size
        tile_w, tile_h = width // self.grid_size, height // self.grid_size
        tiles = []
        for col in range(self.grid_size):
            for row in range(self.grid_size):
                left = col * tile_w
                upper = row * tile_h
                tiles.append(image.crop((left, upper, left + tile_w, upper + tile_h)))
        return tiles

    def __getitem__(self, idx) -> torch.Tensor:
        """Return the tensor for row `idx`.

        Returns:
            A stack of tiles when `use_grid` is set (leading dimension
            grid_size**2), otherwise the single transformed image.
        """
        # Select the column first: avoids materializing a full-row Series
        # (all columns) for every item just to read one cell.
        img_bytes = self.df[self.col_name].iloc[idx]
        img = deserialize_image(img_bytes)  # bytes -> PIL image

        # Fall back to a plain tensor conversion when no transform was given.
        to_tensor = self.transform if self.transform else ToTensor()

        if self.use_grid:
            return torch.stack([to_tensor(tile) for tile in self._split_into_grid(img)])
        return to_tensor(img)

class PlantDataModule(pl.LightningDataModule):
    """LightningDataModule for handling dataset loading and preparation.

    Args:
        pandas_df: DataFrame with one serialized image per row.
        batch_size: images per batch.
        use_grid: split every image into tiles (see `PlantDataset`).
        grid_size: tiles per side when `use_grid` is set.
        num_workers: DataLoader worker processes (0 = load in main process).
        transform: optional preprocessing callable. When omitted, `setup`
            instantiates a `DINOv2LightningModel` and uses its transform.
    """

    def __init__(
        self,
        pandas_df,
        batch_size=32,
        use_grid=False,
        grid_size=4,
        num_workers=4,
        transform=None,
    ):
        super().__init__()
        self.pandas_df = pandas_df
        self.batch_size = batch_size
        self.use_grid = use_grid
        self.grid_size = grid_size
        self.num_workers = num_workers
        self.transform = transform

    def setup(self, stage=None):
        """Set up dataset and transformations."""
        transform = self.transform
        if transform is None:
            # No transform supplied: build a model only to obtain its transform.
            # This loads the checkpoint a second time if the caller already
            # holds a model; pass `transform=` to avoid that.
            self.model = DINOv2LightningModel()
            transform = self.model.transform

        self.dataset = PlantDataset(
            self.pandas_df,
            transform,
            use_grid=self.use_grid,
            grid_size=self.grid_size,
        )

    def predict_dataloader(self):
        """Returns DataLoader for inference."""
        return DataLoader(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            # DataLoader raises if persistent_workers=True with num_workers=0.
            persistent_workers=self.num_workers > 0,
            collate_fn=custom_collate_fn_partial(self.use_grid),
        )

### Definition of DINOv2 Model

In [None]:
class DINOv2LightningModel(pl.LightningModule):
    """timm ViT loaded from a local checkpoint, used for inference only.

    Produces, per input image/tile, the [CLS] embedding and a dict with the
    top-K class names and their softmax probabilities.

    Args:
        model_path: checkpoint file passed to `timm.create_model`.
        model_name: timm architecture name.
        top_k: number of highest-probability classes returned per image.
    """

    def __init__(
        self,
        model_path: str = MODEL_PATH,
        model_name: str = "vit_base_patch14_reg4_dinov2.lvd142m",
        top_k: int = 10,  # Return the first K predictions given an image
    ):
        super().__init__()
        self.model_device = get_device()
        self.num_classes = 7806  # total plant species
        self.top_k = top_k

        self.model = timm.create_model(
            model_name,
            pretrained=False,
            num_classes=self.num_classes,
            checkpoint_path=model_path,
        )

        # Preprocessing pipeline resolved from the model's own data config.
        self.data_config = timm.data.resolve_model_data_config(self.model)
        self.transform = timm.data.create_transform(
            **self.data_config, is_training=False
        )

        self.model.to(self.model_device)
        self.model.eval()
        # class mappings file for classification
        self.class_mappings_file = get_class_mappings_file()
        self.cid_to_spid = self._load_class_mappings()

    def _load_class_mappings(self):
        """Read the mappings file: line number (0-based) -> stripped line text."""
        with open(self.class_mappings_file, "r") as f:
            return {i: line.strip() for i, line in enumerate(f)}

    def forward(self, batch):
        """Return ([CLS] embeddings, classification logits) for `batch`.

        A 5-D batch (B, grid_size**2, C, H, W) is flattened to
        (B * grid_size**2, C, H, W) first, so both outputs then have one row
        per tile.
        """
        with torch.no_grad():
            batch = batch.to(self.model_device)  # move to device

            if batch.dim() == 5:  # (B, grid_size**2, C, H, W)
                B, G, C, H, W = batch.shape
                batch = batch.view(B * G, C, H, W)  # (B * grid_size**2, C, H, W)

            features = self.model.forward_features(batch)
            embeddings = features[:, 0, :]  # extract [CLS] token
            # Reuse the features for the classifier head instead of calling
            # self.model(batch), which would run the whole backbone again.
            logits = self.model.forward_head(features)

        return embeddings, logits

    def predict_step(self, batch, batch_idx):
        """Runs inference on batch and returns embeddings and top-K predictions.

        Returns:
            (embeddings, batch_logits) where `batch_logits` holds one dict per
            row mapping class name -> probability, ordered from most to least
            probable. Indices missing from the mappings file map to "Unknown".
        """
        embeddings, logits = self(batch)
        probabilities = torch.softmax(logits, dim=1)
        # topk raises when k exceeds the number of classes, so clamp it.
        k = min(self.top_k, probabilities.shape[1])
        top_probs, top_indices = torch.topk(probabilities, k=k, dim=1)

        # One host transfer per tensor instead of a `.item()` call (and a
        # device sync) for every single element.
        top_probs = top_probs.tolist()
        top_indices = top_indices.tolist()

        # map class indices to species names
        batch_logits = [
            {
                self.cid_to_spid.get(class_idx, "Unknown"): prob
                for class_idx, prob in zip(row_indices, row_probs)
            }
            for row_indices, row_probs in zip(top_indices, top_probs)
        ]

        return embeddings, batch_logits

### Prediction Pipeline

In [None]:
def torch_pipeline(
    pandas_df: pd.DataFrame,
    batch_size: int = 32,
    use_grid: bool = False,
    grid_size: int = 1,
    cpu_count: int = 1,
    top_k: int = 5,
):
    """Extract embeddings and top-K predictions with a plain PyTorch loop.

    Args:
        pandas_df: DataFrame with one serialized image per row.
        batch_size: images per batch.
        use_grid: split each image into grid_size x grid_size tiles.
        grid_size: tiles per side; only used when `use_grid` is True.
        cpu_count: number of DataLoader worker processes.
        top_k: number of top predictions kept per image/tile.

    Returns:
        (embeddings, all_logits): `embeddings` is a CPU tensor shaped
        (num_images, tiles_per_image, embedding_dim); `all_logits` is a list
        with one entry per image, each a list of `tiles_per_image` dicts
        (class name -> probability).

    Note:
        An empty `pandas_df` yields no batches, so the final `torch.cat`
        fails on an empty list.
    """
    # initialize model
    model = DINOv2LightningModel(top_k=top_k)

    # create Dataset
    dataset = PlantDataset(
        pandas_df,
        model.transform,
        use_grid=use_grid,
        grid_size=grid_size,
    )
    # create DataLoader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=cpu_count,
        collate_fn=custom_collate_fn_partial(use_grid),  # pickle-friendly collate_fn
    )

    # Rows produced per image. Embeddings and logits must be grouped with the
    # same value; without a grid every image yields exactly one row, whatever
    # `grid_size` was passed.
    tiles_per_image = grid_size**2 if use_grid else 1

    all_embeddings = []
    all_logits = []
    progress = tqdm(dataloader, desc="Extracting embeddings and logits", unit="batch")
    for batch_idx, batch in enumerate(progress):
        embeddings, logits = model.predict_step(batch, batch_idx=batch_idx)

        all_embeddings.append(embeddings.cpu())  # keep embeddings as CPU tensors

        # regroup the flat per-tile predictions into one list per image
        all_logits.extend(
            logits[i : i + tiles_per_image]
            for i in range(0, len(logits), tiles_per_image)
        )

        # to avoid GPU OutOfMemoryError
        del embeddings, batch
        torch.cuda.empty_cache()
        gc.collect()

    # (total_rows, D) -> (num_images, tiles_per_image, D); D is read from the
    # tensor instead of being hard-coded to 768.
    embeddings = torch.cat(all_embeddings, dim=0)
    embeddings = embeddings.view(-1, tiles_per_image, embeddings.shape[-1])

    return embeddings, all_logits

def pl_trainer_pipeline(
    pandas_df: pd.DataFrame,
    batch_size: int = 32,
    use_grid: bool = False,
    grid_size: int = 1,
    cpu_count: int = 1,
    top_k: int = 5,
):
    """Extract embeddings and top-k predictions via `pl.Trainer.predict`.

    Args:
        pandas_df: DataFrame with one serialized image per row.
        batch_size: images per batch.
        use_grid: split each image into grid_size x grid_size tiles.
        grid_size: tiles per side; only used when `use_grid` is True.
        cpu_count: number of DataLoader worker processes.
        top_k: number of top predictions kept per image/tile.

    Returns:
        (embeddings, all_logits): `embeddings` is shaped
        (num_images, tiles_per_image, embedding_dim); `all_logits` holds one
        list of `tiles_per_image` dicts (class name -> probability) per image.
    """
    # initialize DataModule
    # NOTE(review): PlantDataModule.setup builds its own DINOv2LightningModel
    # to obtain the transform, so the checkpoint is loaded twice in this
    # pipeline.
    data_module = PlantDataModule(
        pandas_df,
        batch_size=batch_size,
        use_grid=use_grid,
        grid_size=grid_size,
        num_workers=cpu_count,
    )

    # initialize Model
    model = DINOv2LightningModel(top_k=top_k)

    # define Trainer (inference mode)
    # NOTE(review): assumes get_device() returns a value pl.Trainer accepts as
    # `accelerator`; confirm against the config module.
    trainer = pl.Trainer(
        accelerator=get_device(),
        devices=1,
        enable_progress_bar=True,
    )

    # run Inference: one (embeddings, per-row dicts) tuple per batch
    predictions = trainer.predict(model, datamodule=data_module)

    # Rows produced per image. Embeddings and logits must be grouped with the
    # same value; without a grid every image yields exactly one row, whatever
    # `grid_size` was passed.
    tiles_per_image = grid_size**2 if use_grid else 1

    all_embeddings = []
    all_logits = []
    for embed_batch, logits_batch in predictions:
        all_embeddings.append(embed_batch)  # keep embeddings as tensors
        # regroup the flat per-tile predictions into one list per image
        all_logits.extend(
            logits_batch[i : i + tiles_per_image]
            for i in range(0, len(logits_batch), tiles_per_image)
        )

    # (total_rows, D) -> (num_images, tiles_per_image, D); D is read from the
    # tensor instead of being hard-coded to 768.
    embeddings = torch.cat(all_embeddings, dim=0)
    embeddings = embeddings.view(-1, tiles_per_image, embeddings.shape[-1])

    return embeddings, all_logits

### Concatenate family parquet files to extract embeddings

In [None]:
# import psutil

# def print_mem():
#     mem = psutil.virtual_memory()
#     print(f"Memória usada: {mem.used / 1024**3:.2f} GB / {mem.total / 1024**3:.2f} GB")

In [None]:
# processed_data = set()

# if os.path.exists(OUTPUT_EMBEDDING_PATH):
#   output_files = os.listdir(OUTPUT_EMBEDDING_PATH)

#   if output_files:
#     for file in os.listdir(OUTPUT_EMBEDDING_PATH):

#       if file.endswith(".feather"):
#         name = file.split('.')[0]
#         processed_data.add(name)

# df_list = []

# families = sorted(os.listdir(FAMILY_PATH))
# for family in families:

#   if not family.endswith(".parquet"):
#     continue

#   name = family.split('.')[0]
#   if name in processed_data:
#     print(f"{name} already processed. Skipping.")
#     continue

#   start_time = time.perf_counter()
#   file_path = os.path.join(FAMILY_PATH, family)

#   try:
#     df = pd.read_parquet(file_path)
#     print_mem()
#   except Exception as e:
#     print(f"Could not read {file_path}: {e}")
#     continue
#   end_time = time.perf_counter()

#   time_consumed = end_time - start_time
#   print(f"Succeed reading '{family}'.parquet. Time consumed: {time_consumed:.2f} s | df.shape = {df.shape}")

#   df_list.append(df)

# if df_list:
#     final_df = pd.concat(df_list, ignore_index=True)
#     print(f"Concat success. final_df.shape = {final_df.shape}")
# else:
#     print("No new DataFrames to concatenate.")

### Train/test prep for embeddings extraction 090425

In [None]:
# df = pd.read_feather(TRAIN_PATH)

In [None]:
# test_df = pd.read_parquet(TEST_PATH)

In [None]:
# print(df.shape, test_df.shape)

In [None]:
# micro test: subset
# amostra_df = df.sample(n=250, random_state=42)
# amostra_test_df = test_df.sample(n=20, random_state=42)
# print(amostra_df.shape, amostra_test_df.shape)
# df.shape

In [None]:
# params
# USE_GRID / GRID_SIZE are only referenced by the commented-out test
# extraction call in this notebook; the train extraction passes
# use_grid=False, grid_size=1 explicitly.
USE_GRID: bool = True
GRID_SIZE: int = 3  # tiles per side -> GRID_SIZE**2 tiles per image
TOP_K: int = 5  # top predictions kept per image/tile (passed to torch_pipeline)

In [None]:
# Train Extraction: embed feather batches 32..35 and write one parquet per batch.
import gc

for i in range(32, 36):
    df = pd.read_feather(f"{TRAIN_PATH}batch_{i}.feather")
    embeddings_train, logits_train = torch_pipeline(
        df,
        batch_size=400,
        use_grid=False,
        grid_size=1,
        top_k=TOP_K,
    )

    # keep only the metadata columns next to the model outputs
    cols = ["image_name", "species", "species_id"]
    embeddings_df = df[cols].assign(
        embeddings=embeddings_train.tolist(),
        logits=logits_train,
    )
    embeddings_df.to_parquet(f"{OUTPUT_EMBEDDING_PATH}embeddings_{i}.parquet", index=False)
    print(f"embeddings_{i}.parquet saved")

    # release the batch before the next one is loaded; note that `df` and
    # `embeddings_df` no longer exist once the loop has finished
    del df, embeddings_df
    gc.collect()

Extracting embeddings and logits: 100%|██████████| 50/50 [12:06<00:00, 14.52s/batch]


embeddings_32.parquet saved


Extracting embeddings and logits: 100%|██████████| 50/50 [12:06<00:00, 14.52s/batch]


embeddings_33.parquet saved


Extracting embeddings and logits: 100%|██████████| 50/50 [12:06<00:00, 14.53s/batch]


embeddings_34.parquet saved


Extracting embeddings and logits: 100%|██████████| 30/30 [07:11<00:00, 14.37s/batch]


embeddings_35.parquet saved


In [None]:
# Test Extraction
# Only a garbage-collection pass runs here; the actual test-set extraction
# call below is commented out.
gc.collect()

# embeddings_test, logits_test = torch_pipeline(test_df,
#                                     batch_size=50,
#                                     use_grid=USE_GRID,
#                                     grid_size=GRID_SIZE,
#                                     top_k=TOP_K
#                                     )

8839

In [None]:
# parquet save file for train
# NOTE(review): the train-extraction loop ends with `del df`, so on a
# top-to-bottom run `df` is undefined at this point; confirm which frame is
# meant to be paired with `embeddings_train` / `logits_train`.
cols = ["image_name", "species", "species_id"]
embeddings_df = df[cols].assign(
    embeddings=embeddings_train.tolist(),
    logits=logits_train,
)

In [None]:
# Display the assembled frame (metadata columns + embeddings + logits).
embeddings_df

Unnamed: 0,image_name,species,species_id,embeddings,logits
540683,7e89ae7752c641951fa16cf64cc8a0942ea8dda5.jpg,Betula pendula Roth,1356609,"[[-0.47420984506607056, 0.41945114731788635, -...","[{'1356609': 0.6057543754577637, '1362746': 0...."
553034,e1046198dd346ab52b1a5a358649dffed15a559b.jpg,Tradescantia virginiana L.,1396824,"[[0.39217692613601685, 0.4924355745315552, -0....","[{'1396824': 0.8411104083061218, '1434347': 0...."
194024,8245587506491d475e13696f68a761581acd2b74.jpg,Teucrium montanum L.,1359006,"[[-0.5446397066116333, -0.1186700388789177, -0...","[{'1359006': 0.3849484622478485, '1362745': 0...."
899986,fce822e7771addcc3497d0478c253cfd73f82f48.jpg,Milium effusum L.,1393828,"[[-0.15058580040931702, 0.3915726840496063, -0...","[{'1393828': 0.34340548515319824, '1363933': 0..."
1303067,15a3d48343047fcdca7fea3293c98ae909d72da2.jpg,Opuntia elatior Mill.,1362754,"[[0.3651646673679352, 0.5237950086593628, -0.1...","[{'1362754': 0.6536065936088562, '1399848': 0...."
...,...,...,...,...,...
818600,e5b2329d57d36b67ece14755d15b5ae5f6a2a08e.jpg,Hieracium glaucinum Jord.,1392931,"[[0.673701286315918, 0.45318055152893066, -0.4...","[{'1392931': 0.1625651866197586, '1392973': 0...."
1198880,2437f76f461d858adf41678b989f39bf9b5baf2d.jpg,Diplotaxis harra (Forssk.) Boiss.,1363876,"[[0.387522429227829, 1.1748783588409424, -0.06...","[{'1363876': 0.5754228234291077, '1358260': 0...."
880830,3c32f878f2719d0b5b668ad2f6bbe37e3da101e9.jpg,Cistus laurifolius L.,1357104,"[[0.7272875905036926, 0.621255099773407, 0.411...","[{'1357104': 0.7336958646774292, '1357096': 0...."
984728,e30aee33eafa956ffe2cb533092366a61077bb14.jpg,Xeranthemum annuum L.,1397241,"[[0.6178438663482666, -0.015458069741725922, 1...","[{'1397241': 0.6673569083213806, '1363110': 0...."


In [None]:
# parquet save file for test
def explode_embeddings_logits(df: pd.DataFrame,
                             embeddings: torch.Tensor,
                             logits: list,
                             cols: list = ['quadrat_id']) -> pd.DataFrame:
    """Flatten per-image tile outputs into one row per tile.

    Args:
        df: source frame, one row per image.
        embeddings: tensor whose first dimension matches `len(df)` and whose
            second dimension enumerates the tiles of each image.
        logits: one list per image holding one prediction dict per tile.
        cols: columns of `df` carried over to the result.

    Returns:
        Frame with `cols`, `embeddings`, `logits` and `tile` columns and a
        fresh 0..n-1 index. `tile` restarts at 0 for every source image.
    """
    # Positional index: after explode, every row still carries the position
    # of the image it came from.
    pred_df = df[list(cols)].reset_index(drop=True)
    pred_df['embeddings'] = embeddings.tolist()
    pred_df['logits'] = logits

    # Explode embeddings and logits together so tiles stay aligned
    explode_df = pred_df.explode(["embeddings", "logits"])

    # Number tiles within each source image. Grouping by source row (rather
    # than by the `quadrat_id` value) keeps the numbering correct when ids
    # repeat and does not require `quadrat_id` to be part of `cols`.
    explode_df["tile"] = explode_df.groupby(level=0).cumcount().to_numpy()
    return explode_df.reset_index(drop=True)

# NOTE(review): `test_df`, `embeddings_test` and `logits_test` are only
# assigned in commented-out cells of this notebook; they must be defined
# before this call can run.
explode_df = explode_embeddings_logits(test_df, embeddings_test, logits_test)


Unnamed: 0,quadrat_id,embeddings,logits,tile
0,CBN-PdlC-E3-20130723,"[1.7311939001083374, 1.790383219718933, 0.1066...","{'1722440': 0.19948247075080872, '1392323': 0....",0
1,CBN-PdlC-E3-20130723,"[1.2909488677978516, 2.2347311973571777, -0.95...","{'1394911': 0.04262404143810272, '1741903': 0....",1
2,CBN-PdlC-E3-20130723,"[1.7716916799545288, 1.0189008712768555, -2.09...","{'1396253': 0.7978145480155945, '1363741': 0.0...",2
3,CBN-PdlC-E3-20130723,"[1.507507085800171, 2.4123475551605225, -0.713...","{'1392323': 0.2659570574760437, '1722440': 0.2...",3
4,CBN-PdlC-E3-20130723,"[1.8004498481750488, 1.6963614225387573, -0.99...","{'1394597': 0.14452438056468964, '1394908': 0....",4
5,CBN-PdlC-E3-20130723,"[2.6754066944122314, 1.7712541818618774, -1.07...","{'1393824': 0.1171049103140831, '1397513': 0.0...",5
6,CBN-PdlC-E3-20130723,"[0.8207782506942749, 0.759128987789154, 0.0050...","{'1722440': 0.35720178484916687, '1392732': 0....",6
7,CBN-PdlC-E3-20130723,"[2.1328089237213135, 1.3362785577774048, 1.162...","{'1722440': 0.29147034883499146, '1361372': 0....",7
8,CBN-PdlC-E3-20130723,"[1.561506986618042, 1.4456361532211304, 0.1516...","{'1396253': 0.173810213804245, '1394908': 0.03...",8


In [None]:
# Write the current `embeddings_df` to batch_01.parquet in the embeddings
# output folder (index column omitted).
embeddings_df.to_parquet(f"{OUTPUT_EMBEDDING_PATH}batch_01.parquet", index=False)

In [None]:
# explode_df.to_parquet("explode_test_embeddings", index=False)

### Organ-balanced subsample embedding extraction

In [None]:
all_organs = {'leaf', 'fruit', 'habit', 'flower', 'bark', 'scan', 'branch'}

# species with all possible organs
# 1) collect the set of organs photographed for every species
organs_per_species = df.groupby('gbif_species_id')['organ'].agg(set)
# 2) flag the species whose organ set covers every entry of `all_organs`
has_every_organ = organs_per_species.apply(lambda found: all_organs.issubset(found))
# 3) keep the ids of the flagged species
samples = organs_per_species.loc[has_every_organ].index.tolist()

In [None]:
def create_organ_subset(df: pd.DataFrame,
                        species_list: list,
                        expected_organs: set,
                        random_state: int=42) -> pd.DataFrame:
    """Pick one random image per (species, organ) pair.

    Args:
        df: frame with at least the `gbif_species_id` and `organ` columns.
        species_list: species ids to sample, in output order.
        expected_organs: organs every listed species must provide.
        random_state: seed for the NumPy random generator.

    Returns:
        One row per (species, organ): species in `species_list` order, organs
        in sorted order within each species, with a fresh 0..n-1 index.

    Raises:
        ValueError: if a species has no image for one of the expected organs.
    """
    rng = np.random.default_rng(random_state)

    # A set of strings iterates in a hash-seed-dependent order, which would
    # pair the seeded draws with different organs from one interpreter
    # session to the next. Sorting makes `random_state` reproducible.
    organ_order = sorted(expected_organs)
    # The organ masks do not depend on the species: compute them once.
    organ_masks = {organ: (df['organ'] == organ).to_numpy() for organ in organ_order}

    selected_positions = []
    for species in species_list:
        species_mask = (df['gbif_species_id'] == species).to_numpy()

        for organ in organ_order:
            candidates = np.flatnonzero(species_mask & organ_masks[organ])
            if candidates.size == 0:
                raise ValueError(f"A espécie {species} não possui imagem do órgão '{organ}'.")

            # pick one random image of this organ
            selected_positions.append(int(rng.choice(candidates, size=1)[0]))

    # Positional selection: label-based `.loc` would return extra rows
    # whenever `df` has duplicate index labels.
    return df.iloc[selected_positions].reset_index(drop=True)

# one image per organ for every species that covers all organs
subset = create_organ_subset(df, samples, all_organs, random_state=42)

In [None]:
# work on a copy so `subset` itself stays untouched
dataframe = subset.copy()

# whole images (no grid); top_k falls back to torch_pipeline's default
embeddings, logits = torch_pipeline(
    dataframe,
    batch_size=7,
    use_grid=False,
    grid_size=1,
)

Extracting embeddings and logits: 100%|██████████| 34/34 [00:20<00:00,  1.66batch/s]


In [None]:
# metadata (including the raw image bytes) plus one embedding entry per image;
# the `logits` returned by torch_pipeline are not stored here
cols = ["image_name", "organ", "image_binary_rep", "species", "species_id"]
embeddings_df = dataframe[cols].assign(embeddings=embeddings.tolist())


Unnamed: 0,image_name,organ,image_binary_rep,species,species_id,embeddings
0,2af27e02438a509d2db95925112b3eeed165d8c7.jpg,habit,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Dioscorea communis (L.) Caddick & Wilkin,1361109,"[[0.9028947949409485, 0.4852701425552368, 0.22..."
1,ed33de2e02381df18b3b2c0e8433e76567d9006f.jpg,branch,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Dioscorea communis (L.) Caddick & Wilkin,1361109,"[[0.09231726080179214, 0.5111350417137146, -0...."
2,bf52ea545e48b3b6db06fb7783abcebaaa035748.jpg,fruit,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Dioscorea communis (L.) Caddick & Wilkin,1361109,"[[0.7255693078041077, 0.8429560661315918, -0.1..."
3,92fe5c87b4f531f6b5e18fc6956f9c549202a48a.jpg,flower,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Dioscorea communis (L.) Caddick & Wilkin,1361109,"[[0.4683719575405121, 1.1044397354125977, 0.59..."
4,439c0c3ffbdbdc945c17d9d8583c8a1cb1057972.jpg,leaf,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Dioscorea communis (L.) Caddick & Wilkin,1361109,"[[0.9619735479354858, 0.13999642431735992, 0.3..."
...,...,...,...,...,...,...
233,c73f3a1799640cb2d9a4498f0449fc96d143de12.jpg,fruit,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Pittosporum tobira (Thunb.) W.T.Aiton,1394624,"[[0.7069684863090515, 0.4889654815196991, 0.24..."
234,aba375ff0ce3962a03be13332aa1f3a5dd0afbcf.jpg,flower,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Pittosporum tobira (Thunb.) W.T.Aiton,1394624,"[[0.6098253130912781, 0.4148502051830292, 0.77..."
235,00e8c7e402f43f6b5936e090e8c1c70f7d09eb8b.jpg,leaf,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Pittosporum tobira (Thunb.) W.T.Aiton,1394624,"[[0.3717259466648102, 0.49665042757987976, 0.1..."
236,5a9bf2ba9c5e8604937e28b7242136183f6bc0fb.jpg,bark,b'\xff\xd8\xff\xdb\x00C\x00\x06\x04\x05\x06\x0...,Pittosporum tobira (Thunb.) W.T.Aiton,1394624,"[[0.5970432162284851, 0.5912252068519592, 0.23..."


In [None]:
# NOTE(review): `embeddings_df` still carries the raw `image_binary_rep` bytes
# and nested embedding lists, which CSV stores as text; confirm CSV (rather
# than parquet, used elsewhere in this notebook) is intended.
embeddings_df.to_csv("Embeddings_all_organs_040425.csv")