# Notebook overview
Generates image embeddings from resized high-resolution (GBIF) images using DINOv2 (timm), saves them as .pt files, and updates the corresponding CSV files.

- Loads a pretrained DINOv2 model and preprocessing pipeline
- Loads resized dataset CSVs and adds tracking columns for embeddings
- Copies available embeddings from a cache when present
- Creates missing embeddings in parallel (ThreadPoolExecutor) and saves .pt files
- Saves the updated CSVs

The notebook was exported as a Python script and executed in a console inside a tmux session.

## Module imports

In [12]:
import pandas as pd
import timm
import torch

import os
import shutil
from typing import Optional
from PIL import Image

from concurrent.futures import ThreadPoolExecutor, as_completed
import subprocess

In [None]:
# Number of worker threads handed to the ThreadPoolExecutor
MAX_WORKERS = 18

# Folder with CSV files (train, test, val)
CSV_SOURCE_FOLDER_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created'
# isdir instead of exists: a regular file at this path must be rejected as well
if not os.path.isdir(CSV_SOURCE_FOLDER_PATH):
    raise FileNotFoundError(f"Folder does not exist: {CSV_SOURCE_FOLDER_PATH}")

# Folder to save adapted CSV files (train, test, val)
CSV_DESTINATION_FOLDER_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created/embedding_resized'
if not os.path.isdir(CSV_DESTINATION_FOLDER_PATH):
    raise FileNotFoundError(f"Folder does not exist: {CSV_DESTINATION_FOLDER_PATH}")

# Folder with downloaded images
IMAGE_FOLDER_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/images/adapted/resized/high'
if not os.path.isdir(IMAGE_FOLDER_PATH):
    raise FileNotFoundError(f"Folder does not exist: {IMAGE_FOLDER_PATH}")

# Folder to save the embeddings of images.
# Not checked here: the sub-folders are created on demand when an embedding is written.
DESTINATION_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/embeddings/adapted/resized/high'

# Cache folder for available embeddings
# CACHE = r'/home/jleick/masterArbeitProjekt/data/ami_embedding/ami_gbif/gbif_fine_grained(dataset_split)'
# NOTE(review): looks like a placeholder so that cache lookups miss - confirm this is intended
CACHE = r'/home/'

In [14]:
### Select csv files in given folder Path

def select_files(csv_source_folder_path: str):
    """Return the names of the 'high' CSV files found directly in a folder.

    Args:
        csv_source_folder_path: Folder whose direct entries are inspected.

    Returns:
        A list of entry names (not full paths) that contain the substring
        'high' and end in '.csv' (case-insensitive), in the order reported
        by ``os.listdir``.

    Raises:
        FileNotFoundError / NotADirectoryError: Propagated from ``os.listdir``
            when the folder is missing or is not a directory.
    """
    return [
        filename
        for filename in os.listdir(csv_source_folder_path)
        ### adapt conditions for specific csv files in folder (e.g. add: and 'val' in filename)
        # The extension check keeps sub-folders and non-CSV files whose name
        # happens to contain 'high' from being handed to pd.read_csv later on.
        if 'high' in filename and filename.lower().endswith('.csv')
    ]

# Call the function: collect the names selected from the CSV source folder
selected_csv_files = select_files(CSV_SOURCE_FOLDER_PATH)

# Print all selected files (the loop variable holds one selected name per iteration)
for folder_name in selected_csv_files:
    print(folder_name)

# DUPLICATED CODE (THE SAME CODE EXISTS IN ANOTHER FILE TOO)

high_id_train.csv
high_id_test.csv
high_ood_test.csv
high_id_val.csv


In [15]:
### Load CSV data into Pandas DataFrame


def load_data(data_path: str, nrows: Optional[int] = None):
    """Read a comma-separated CSV file into a DataFrame and report duplicate identifiers.

    Args:
        data_path: Path of the CSV file to read.
        nrows: Optional maximum number of rows to read; ``None`` reads the whole file.

    Returns:
        The loaded DataFrame. The file must contain an ``identifier`` column.
    """
    frame = pd.read_csv(data_path, sep=",", header="infer", nrows=nrows)

    identifiers = frame["identifier"]
    if identifiers.is_unique:
        return frame

    # Duplicates are only reported, not treated as an error.
    # keep=False counts every row that takes part in a duplicate group.
    duplicate_rows = identifiers.duplicated(keep=False).sum()
    print(f"Duplicate: {duplicate_rows} are included in identifier")
    return frame


### Test function
# test_path = os.path.join(CSV_SOURCE_FOLDER_PATH, selected_csv_files[1])
# data_temp = load_data(test_path, 5)
# data_temp

### DUPLICATED CODE (THE SAME CODE EXISTS IN ANOTHER FILE TOO)

In [None]:
### Add columns to track created embeddings ('embedding_created') and creation errors ('embedding_created_fail')


def add_embedding_column_to_df(df: pd.DataFrame):
    """Make sure the two embedding tracking columns exist in ``df`` (modified in place).

    A missing ``embedding_created`` column is added and filled with ``False``;
    a missing ``embedding_created_fail`` column is added and filled with the
    string ``'NaN'``. Existing columns are left untouched.

    Args:
        df: DataFrame that receives the tracking columns.

    Returns:
        Tuple ``(column_name_embedding, column_name_embedding_fail)`` with the
        names of the two tracking columns.
    """
    column_name_embedding = "embedding_created"
    column_name_embedding_fail = 'embedding_created_fail'

    initial_values = (
        (column_name_embedding, False),
        (column_name_embedding_fail, 'NaN'),
    )
    for tracking_column, initial_value in initial_values:
        if tracking_column in df.columns:
            continue
        df[tracking_column] = initial_value
        # The shape is printed after the column was added
        print(f'>>> {df.shape} - Added column: {tracking_column}')

    return column_name_embedding, column_name_embedding_fail


### Test function
# column_name_embedding = add_embedding_column_to_df(data_temp)
# print( f'return: {column_name_embedding}' )
# data_temp

In [17]:
### Load the pretrained timm model "vit_small_patch14_dinov2.lvd142m" and switch it to eval mode
dinov2_vits14 = timm.create_model("vit_small_patch14_dinov2.lvd142m", pretrained=True)
dinov2_vits14.eval()

### Set device (NOT TESTED): CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the message says "Training", but this notebook only runs the model forward to create embeddings
print(f"Training on device {device}.")

# Move the model to the selected device; `device` is also read as a module-level name by image_to_tensor
dinov2_vits14.to(device)

### Load preprocessing pipeline built from the model's pretrained configuration
data_cfg = timm.data.resolve_data_config(dinov2_vits14.pretrained_cfg)
transform = timm.data.create_transform(**data_cfg)

# NOTE(review): not the last expression of the cell, so this value is not displayed
type(dinov2_vits14)

### transform settings
# dinov2_vits14.pretrained_cfg
data_cfg # open question: is normalisation useful here and, if so, with the default values?

Training on device cuda.


{'input_size': (3, 518, 518),
 'interpolation': 'bicubic',
 'mean': (0.485, 0.456, 0.406),
 'std': (0.229, 0.224, 0.225),
 'crop_pct': 1.0,
 'crop_mode': 'center'}

In [18]:
### Copy available embeddings from cache to destination


def change_file_extension_to_pt(file_path: str):
    """Return ``file_path`` with its extension replaced by ``.pt``.

    Only the last extension is replaced; a path without an extension simply
    gets ``.pt`` appended.
    """
    stem = os.path.splitext(file_path)[0]
    return f"{stem}.pt"


def copy_availabe_embeddings(
    df: pd.DataFrame, source_dir: str, destination_dir: str, column_name_embedding: str, column_name_embedding_fail: str
):
    """Copy embeddings that exist in a cache folder into the destination folder.

    For every row, the ``image_path`` value is turned into a ``.pt`` path. If
    that file exists below ``source_dir`` it is copied below
    ``destination_dir`` (unless it is already there) and the row is marked as
    created. Rows without a cached file are only reported. ``df`` is modified
    in place; nothing is returned.

    Args:
        df: DataFrame with an ``image_path`` column and the two tracking columns.
        source_dir: Root folder of the embedding cache.
        destination_dir: Root folder that receives the embeddings.
        column_name_embedding: Column set to ``True`` for available embeddings.
        column_name_embedding_fail: Column set to ``"no error"`` for available embeddings.
    """
    for index, image_pfad in df["image_path"].items():
        tensor_pfad = change_file_extension_to_pt(image_pfad)
        source_path = os.path.join(source_dir, tensor_pfad)

        # Nothing cached for this row: report it and move on
        if not os.path.exists(source_path):
            print(f"Tensor (at index: {index}) not found: {source_path}")
            continue

        try:
            destination_path = os.path.join(destination_dir, tensor_pfad)

            # Copy only when the destination does not hold the embedding yet
            already_in_destination = os.path.exists(destination_path)
            if not already_in_destination:
                os.makedirs(os.path.dirname(destination_path), exist_ok=True)
                shutil.copyfile(source_path, destination_path)

            # Either way the embedding is available now
            df.at[index, column_name_embedding] = True
            df.at[index, column_name_embedding_fail] = "no error"

            if already_in_destination:
                print(f"Tensor (at index: {index}) exist already - No copy required : {image_pfad}")
            else:
                print(f"Tensor (at index: {index}) found and copied: {destination_path}")
        except Exception as e:
            # Best effort: a failed copy is reported and the row stays unmarked
            print(f"Error Image (at index: {index}) copying image {destination_path}: {e}")


### Test function
# destination_path_update = DESTINATION_PATH + '/' + selected_image_folders[0]
# copy_availabe_embeddings(data_temp, CACHE, destination_path_update, column_name_embedding)

In [19]:
### Create Tensor for images not exist yet
# Embeddings and tensor are used synonymously

def image_to_tensor(
    index: int,
    df: pd.DataFrame,
    model: object,
    transform: object,
    source_dir: str,
    destination_dir: str,
    column_name_embedding: str,
    column_name_embedding_fail: str
):
    """Create and save the embedding for one DataFrame row, unless it is already marked as created.

    The image referenced by the row's ``image_path`` is loaded from
    ``source_dir``, preprocessed with ``transform``, passed through ``model``
    and the output is written with ``torch.save`` to the matching ``.pt`` path
    below ``destination_dir``. The row's tracking columns are updated in place.
    Any exception is caught, stored as text in the fail column and reported.

    Note: the input is moved to the module-level ``device``; the saved tensor
    is the model output as returned, i.e. it is not moved to another device.

    Args:
        index: Index label of the row to process.
        df: DataFrame with ``image_path`` and the two tracking columns.
        model: Callable model producing the embedding for a batched image tensor.
        transform: Callable preprocessing that turns a PIL image into a tensor.
        source_dir: Root folder of the images.
        destination_dir: Root folder that receives the ``.pt`` files.
        column_name_embedding: Column set to ``True`` on success.
        column_name_embedding_fail: Column set to ``"no error"`` on success or
            to the exception text on failure.

    Returns:
        None.
    """
    if df.at[index, column_name_embedding]:
        print(f"Tensor (at index: {index}) already exists: {df.at[index, 'image_path']}")
        return

    # Bound before the try block so the error message below can always be formatted
    image_path = None
    try:
        # load image from source; the context manager releases the file handle
        image_path = df.at[index, "image_path"]
        source_path = os.path.join(source_dir, image_path)
        with Image.open(source_path) as image_file:
            image = image_file.convert("RGB")

        # transform image to tensor (embeddings)
        # Inference only: no autograd graph is needed. Grad mode is thread-local
        # in PyTorch, so it has to be disabled here, inside the worker call.
        with torch.no_grad():
            image_transformed = transform(image)
            image_transformed = image_transformed.unsqueeze(0).to(device)
            tensor = model(image_transformed)

        # save tensor to destination
        tensor_path = change_file_extension_to_pt(image_path)
        destination_path = os.path.join(destination_dir, tensor_path)
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        torch.save(tensor, destination_path)

        # save info into dataFrame
        df.at[index, column_name_embedding] = True
        df.at[index, column_name_embedding_fail] = "no error"
        print(f"Tensor (at index: {index}) created and saved: {destination_path}")
    except Exception as e:
        df.at[index, column_name_embedding_fail] = f'{e}'
        print(f"Error Image (at index: {index}) by creating tensor {image_path}: {e}")


### Test function
# destination_path_update = DESTINATION_PATH + '/' + selected_image_folders[0]
# source_dir_update = IMAGE_FOLDER_PATH + '/' + selected_image_folders[0]
# image_to_tensor(0, data_temp, dinov2_vits14, transform, source_dir_update, destination_path_update, "tensor_created")

In [20]:
### Execute image_to_tensor function with ThreadPoolExecutor

def image_to_tensor_with_executorpool(
    max_workers: int,
    image_to_tensor: callable,
    df: pd.DataFrame,
    model: object,
    transform: object,
    source_dir: str,
    destination_dir: str,
    column_name_embedding: str,
    column_name_embedding_fail: str
):
    """Run ``image_to_tensor`` for every row index of ``df`` on a thread pool.

    Tasks are submitted in batches: once more than 50 futures are pending, the
    function waits for all of them before submitting further work. Exceptions
    raised by a task are reported and do not stop the remaining tasks.

    Args:
        max_workers: Number of worker threads.
        image_to_tensor: Callable invoked as ``image_to_tensor(index, df, model,
            transform, source_dir, destination_dir, column_name_embedding,
            column_name_embedding_fail)``.
        df: DataFrame whose index labels are processed.
        model, transform, source_dir, destination_dir, column_name_embedding,
        column_name_embedding_fail: Passed through unchanged to ``image_to_tensor``.

    Returns:
        None.
    """
    max_pending = 50  # collect results once more than this many futures are pending

    def collect_results(pending):
        # Wait for every pending future and report (but do not re-raise) task errors
        for future in as_completed(pending):
            try:
                future.result()
            except Exception as e:
                print(f"Error while retreving result from future: {e}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for index in df.index:
            if len(futures) > max_pending:
                collect_results(futures)
                futures = []

            futures.append(
                executor.submit(
                    image_to_tensor,
                    index,
                    df,
                    model,
                    transform,
                    source_dir,
                    destination_dir,
                    column_name_embedding,
                    column_name_embedding_fail
                )
            )

        # The last (partial) batch must be collected too, otherwise errors
        # raised by its tasks would be dropped silently.
        collect_results(futures)


### Test function
# destination_path_update = DESTINATION_PATH + '/' + selected_image_folders[0]
# source_dir_update = IMAGE_FOLDER_PATH + '/' + selected_image_folders[0]
# image_to_tensor_with_executorpool(MAX_WORKERS, image_to_tensor, data_temp, dinov2_vits14, transform, source_dir_update, destination_path_update, column_name_embedding)

In [21]:
### RUN - call functions to process file
def process_file(
    csv_load_dir: str,
    csv_save_dir: str,
    model: object,
    transform: object,
    image_source_dir: str,
    embedding_destination_dir: str,
    CACHE: str,
    MAX_WORKERS: int,
):
    """Run the complete embedding pipeline for one CSV file.

    Steps: load the CSV, add the tracking columns, copy embeddings that are
    available in the cache, create the missing embeddings on a thread pool and
    write the updated DataFrame to ``csv_save_dir``.

    Args:
        csv_load_dir: Path of the CSV file to load (a file path, despite the name).
        csv_save_dir: Path of the CSV file to write (a file path, despite the name).
        model: Model passed on to the embedding creation.
        transform: Preprocessing passed on to the embedding creation.
        image_source_dir: Root folder of the images.
        embedding_destination_dir: Root folder that receives the embeddings.
        CACHE: Root folder of the embedding cache.
        MAX_WORKERS: Number of worker threads.

    Returns:
        None.
    """
    print(f"Processing CSV file from: {csv_load_dir}")
    print(f"Saving CSV file to: {csv_save_dir}")
    print(f"Source folder: {image_source_dir}")
    print(f"Destination folder: {embedding_destination_dir}")
    print(f"Cache folder: {CACHE}")
    print("--------------------------------------")

    # process file
    print(">>> Load data and add column to track existing tensors")
    df = load_data(csv_load_dir)
    column_name_embeddings, column_name_embedding_fail = add_embedding_column_to_df(df)

    print(">>> Starting copy availabe tensors from cache to destination")
    copy_availabe_embeddings(
        df, CACHE, embedding_destination_dir, column_name_embeddings, column_name_embedding_fail
    )

    print(f">>> Starting creating tensors with {MAX_WORKERS} workers")
    image_to_tensor_with_executorpool(
        MAX_WORKERS,
        image_to_tensor,
        df,
        model,
        transform,
        image_source_dir,
        embedding_destination_dir,
        column_name_embeddings,
        column_name_embedding_fail
    )

    print(f">>> Save updated Dataframe to: {csv_save_dir}")
    df.to_csv(csv_save_dir, na_rep="NULL", index=False)

    # print(">>> Copy files to cache if not available")
    # subprocess.run(f'rsync -av --ignore-existing {embedding_destination_dir + "/"} {CACHE}', shell=True, check=True)
    # !rsync -av --ignore-existing {DESTINATION} {CACHE}

    # The file name is derived from the argument instead of a module-level
    # loop variable, so the function also works when called on its own.
    print(f">>> FINISH Process: {os.path.basename(csv_load_dir)}\n\n")


### Call functions

# For every selected CSV file, build its load and save paths and run the pipeline;
# all files share the same image, embedding and cache folders
for csv_filename in selected_csv_files:

    csv_load_path = os.path.join(CSV_SOURCE_FOLDER_PATH, csv_filename)
    csv_save_path = os.path.join(CSV_DESTINATION_FOLDER_PATH, csv_filename)

    process_file(
        csv_load_path,
        csv_save_path,
        dinov2_vits14,
        transform,
        IMAGE_FOLDER_PATH,
        DESTINATION_PATH,
        CACHE,
        MAX_WORKERS,
    )

Processing CSV file from: /home/jleick/masterArbeitProjekt/final_release/data/datasets/created/high_id_train.csv
Saving CSV file to: /home/jleick/masterArbeitProjekt/final_release/data/datasets/created/embedding_created_copied/high_id_train.csv
Source folder: /home/jleick/masterArbeitProjekt/final_release/data/images/copied/high
Destination folder: /home/jleick/masterArbeitProjekt/final_release/data/embeddings/copied/high
Cache folder: /home/
--------------------------------------
>>> Load data and add column to track existing tensors
>>> (85803, 19) - Added column: embedding_created
>>> (85803, 20) - Added column: embedding_created_fail
>>> Starting copy availabe tensors from cache to destination
Tensor (at index: 0) not found: /home/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1024198246.pt
Tensor (at index: 1) not found: /home/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1024215304.pt
Tensor (at index: 2) not found: /home/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1052571037.pt
Tensor (at index: 3) not f

  df.at[index, column_name_embedding_fail] = "no error"
  df.at[index, column_name_embedding_fail] = "no error"


Tensor (at index: 13) created and saved: /home/jleick/masterArbeitProjekt/final_release/data/embeddings/copied/high/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1270055675.pt
Tensor (at index: 14) created and saved: /home/jleick/masterArbeitProjekt/final_release/data/embeddings/copied/high/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1272091135.pt
Tensor (at index: 2) created and saved: /home/jleick/masterArbeitProjekt/final_release/data/embeddings/copied/high/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1052571037.pt
Tensor (at index: 16) created and saved: /home/jleick/masterArbeitProjekt/final_release/data/embeddings/copied/high/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1272112929.pt
Tensor (at index: 17) created and saved: /home/jleick/masterArbeitProjekt/final_release/data/embeddings/copied/high/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1272117528.pt
Tensor (at index: 0) created and saved: /home/jleick/masterArbeitProjekt/final_release/data/embeddings/copied/high/50c9509d-22c7-4a22-a47d-8c48425ef4a7/1024198246.pt


KeyboardInterrupt: 