# Notebook overview
Loads AMI-T (trap) images from webdataset (.tar) shards, saves the decoded images to disk and writes their metadata to a CSV file.

- Locates and loads all webdataset shard files
- Decodes images and JSON annotations, saves images to a destination folder
- Builds a DataFrame of image metadata and saves it as a CSV

The notebook was exported as a Python script and executed in a console inside a Tmux session. The same notebook was used for both datasets (Binary and Fine-Grained); only the paths have to be adapted.

# Preparations

### Imports

In [19]:
import pandas as pd
import timm
import torch
import PIL

import os

import webdataset as wds

from concurrent.futures import ThreadPoolExecutor, as_completed

### Paths - wds_file, img_dir_destination_path, df_destination_path

In [20]:
# Source webdataset: path to the FIRST shard (e.g. .../binary-000000.tar).
# Fail right away when it is missing instead of later during shard discovery.
# wds_file = r'/home/jleick/masterArbeitProjekt/data/ami_dataset/ami_traps/webdataset/binary_classification/binary-000000.tar'
wds_file = r'/home/jleick/masterArbeitProjekt/data/ami_dataset/ami_traps/webdataset/fine-grained_classification/fgrained-000000.tar'
if not os.path.exists(wds_file):
    raise FileNotFoundError('Webdataset file does not exist')

# Destination folder in which the decoded images are stored.
# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test; it also raises
# immediately if the path exists but is a regular file.
# img_dir_destination_path = r'/home/jleick/masterArbeitProjekt/final_release/data/images/download/low/low_binary'
img_dir_destination_path = r'/home/jleick/masterArbeitProjekt/final_release/data/images/download/low/low_fine_grain'
os.makedirs(img_dir_destination_path, exist_ok=True)

# Destination file of the metadata DataFrame (written as CSV).
# df_destination_path = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/trap/traps_binary_img.csv'
df_destination_path = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/trap/traps_fine_grain_img.csv'

# Functions

### Function - webdatasets_paths_in_folder

In [None]:
# Collect the paths of all consecutively numbered webdataset shards in the folder of the given file, starting at shard number 000000 (e.g. binary-000000.tar, binary-000001.tar, ...)

def webdatasets_paths_in_folder(wds_file: str) -> tuple[list[str], str]:
    """Collect the consecutively numbered webdataset shards next to ``wds_file``.

    Shard files are expected to be named ``<base_name>-<NNNNNN><extension>``
    (six-digit, zero-padded shard number). Starting at shard 000000, the
    folder of ``wds_file`` is probed for each following shard number until the
    first missing file.

    Args:
        wds_file: Path to any shard of the dataset; only its folder, base name
            and extension are used.

    Returns:
        A tuple ``(datasets_paths, base_name)``: the paths of all shards found
        (in ascending shard order, built from the folder of ``wds_file``) and
        the part of the file name in front of the shard number.
    """
    folder, file_name = os.path.split(wds_file)
    _, extension = os.path.splitext(file_name)
    # Split on the LAST hyphen only, so base names that contain hyphens
    # themselves (e.g. "fine-grained-000000.tar") are kept intact.
    base_name = file_name.rsplit("-", 1)[0]

    datasets_paths = []
    # Six-digit shard numbers cover 000000..999999.
    for shard_number in range(1_000_000):
        current_file_name = f"{base_name}-{shard_number:06d}{extension}"
        current_file_path = os.path.join(folder, current_file_name)
        if not os.path.exists(current_file_path):
            # First gap in the numbering ends the search.
            break
        # The path already contains the folder; joining it with the folder a
        # second time would duplicate a relative folder prefix.
        datasets_paths.append(current_file_path)
        print(f"found webdataset file: {current_file_name}")

    # Single exit point: the tuple is returned even if every shard number exists.
    return datasets_paths, base_name

### Function - load_webdatasets

In [22]:
# load webdatasets from list with paths to .tar archives/webdatasets into list
def load_webdatasets(datasets_paths: list[str]) -> list[wds.WebDataset]:
    """Build one webdataset pipeline per shard path.

    Each pipeline is created with ``shardshuffle=False``, decodes samples with
    the ``"pil"`` decoder and reduces every sample to a ``("jpg", "json")``
    tuple. The file name of each shard is printed once its pipeline is set up.

    Args:
        datasets_paths: Paths to the .tar shards.

    Returns:
        The configured pipelines, in the same order as ``datasets_paths``.
    """
    pipelines = []

    for shard_path in datasets_paths:
        # Same chain as a single fluent expression, built up step by step.
        pipeline = wds.WebDataset(shard_path, shardshuffle=False)
        pipeline = pipeline.decode("pil")
        pipeline = pipeline.to_tuple("jpg", "json")
        pipelines.append(pipeline)

        shard_name = os.path.basename(shard_path)
        print(f"Webdataset loaded: {shard_name}")

    return pipelines


### call function
# webdataset_paths, base_name = webdatasets_paths_in_folder(wds_file)
# webdatasets_loaded = load_webdatasets(webdataset_paths)

### Function - process_webdatasets

In [None]:
def process_webdatasets(webdatasets_loaded, img_dir_destination_path, file_name):
    """Save every image of the given datasets and collect its metadata.

    All datasets are iterated in order. Each ``(image, annotation)`` sample is
    written to ``<img_dir_destination_path>/<file_name>-<index>.png``, where
    ``index`` is a running counter across all datasets starting at 0.

    Args:
        webdatasets_loaded: Iterable of datasets, each yielding
            ``(image, annotation)`` pairs. ``image`` must offer ``save(path)``
            and ``annotation`` must be a mapping (presumably PIL images and
            decoded JSON dicts as produced by ``load_webdatasets``).
        img_dir_destination_path: Existing folder the images are written to.
        file_name: Prefix used for the image file names.

    Returns:
        A ``pandas.DataFrame`` with one row per image holding ``index``,
        ``identifier`` (the image file name) and all annotation entries,
        sorted by ``index``.
    """
    img_description_list = []
    index = 0

    for dataset in webdatasets_loaded:
        for image, annotation in dataset:
            # Build the file name once; it is used for the file on disk, the
            # metadata row and the progress message.
            identifier = f"{file_name}-{index}.png"
            image.save(os.path.join(img_dir_destination_path, identifier))

            # Annotation entries are merged last, i.e. an annotation key named
            # "index" or "identifier" takes precedence over the generated one.
            img_description_list.append({"index": index, "identifier": identifier, **annotation})

            # Report BEFORE advancing the counter so that the message names
            # the image that was just written, not the next one.
            print(f"Image: {identifier} processed")
            index += 1

    img_description_list_sorted = sorted(img_description_list, key=lambda x: x['index'])
    df = pd.DataFrame(img_description_list_sorted)
    return df

# Run

In [None]:
# RUN
# Discovers the shards, builds the webdataset pipelines, exports all images
# and finally writes the metadata DataFrame as CSV.

print('>>> Select .tar files in given folder')
webdataset_paths, base_name = webdatasets_paths_in_folder(wds_file)

print('>>> Load webdatasets')
webdatasets_loaded = load_webdatasets(webdataset_paths)

# Create the CSV's folder BEFORE the image export: otherwise a missing folder
# would only surface in to_csv(), after all images have been processed.
df_destination_dir = os.path.dirname(df_destination_path)
if df_destination_dir:
    os.makedirs(df_destination_dir, exist_ok=True)

print('>>> Process webdatasets, save images and create df')
df = process_webdatasets(webdatasets_loaded, img_dir_destination_path, base_name)

print(f'>>> Save created dataFrame to: {df_destination_path}')
df.to_csv(df_destination_path, index=False, header=True)

>>> Select .tar files in given folder
found webdataset file: fgrained-000000.tar
found webdataset file: fgrained-000001.tar
found webdataset file: fgrained-000002.tar
found webdataset file: fgrained-000003.tar
found webdataset file: fgrained-000004.tar
found webdataset file: fgrained-000005.tar
>>> Load webdatasets
Webdataset loaded: fgrained-000000.tar
Webdataset loaded: fgrained-000001.tar
Webdataset loaded: fgrained-000002.tar
Webdataset loaded: fgrained-000003.tar
Webdataset loaded: fgrained-000004.tar
Webdataset loaded: fgrained-000005.tar
>>> Process webdatasets, save images and create df
>>> Save created dataFrame to:
