# Notebook overview
Copies images referenced in the created datasets from the fully downloaded AMI dataset to local destination folders and updates the corresponding CSV files.

- Handles training, validation and multiple test splits (FG, FG + DG, ND, ND + DG)
- Adds a boolean column to indicate copied images and saves updated CSVs

The notebook was exported as a Python script and run in a console using Tmux to execute it.

# Preperation

### Import

In [68]:
import pandas as pd
from pathlib import Path
import shutil

### Path - df_source_dir

In [69]:
DF_SOURCE_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created'
df_source_dir = Path(DF_SOURCE_PATH)

### Path - df_destination_dir

In [None]:
DF_DESTINATION_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/created/copied' # .../created/image_original
df_destination_dir = Path(DF_DESTINATION_PATH)

### Load df - high

In [None]:
high_id_train = pd.read_csv(df_source_dir / 'high_id_train.csv', index_col=0)
high_id_val = pd.read_csv(df_source_dir / 'high_id_val.csv', index_col=0)
high_id_test = pd.read_csv(df_source_dir / 'high_id_test.csv', index_col=0)

high_ood_test = pd.read_csv(df_source_dir / 'high_ood_test.csv', index_col=0)

### Load df - low

In [72]:
### Unexplained problems: the index was not unique, even though the rows were. Bug?
### I checked copy_images.ipynb, but there was some strange behaviour there too.
low_id_test = pd.read_csv(df_source_dir / 'low_id_test.csv', index_col=0)
low_id_test.reset_index(drop=True,inplace=True)

low_ood_test = pd.read_csv(df_source_dir / 'low_ood_test.csv', index_col=0)
low_ood_test.reset_index(drop=True,inplace=True)

### Path - img_high_source_dir

In [73]:
IMG_HIGH_SOURCE_DIR_PATH = r'/home/jleick/masterArbeitProjekt/data/ami_images_temp/ami_gbif/cached_images'
img_high_source_dir = Path(IMG_HIGH_SOURCE_DIR_PATH)

### Path - img_high_destination_dir

In [74]:
IMG_HIGH_DESTINATION_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/images/download/high'
img_high_destination_dir = Path(IMG_HIGH_DESTINATION_DIR_PATH)

### Path - img_low_source_dir

In [75]:
IMG_LOW_SOURCE_DIR_PATH = r'/home/jleick/masterArbeitProjekt/data/ami_images/ami_traps/fine-grained_classification'
img_low_source_dir = Path(IMG_LOW_SOURCE_DIR_PATH)

### Path - img_low_destination_dir

In [76]:
IMG_LOW_DESTINATION_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/images/download/low'
img_low_destination_dir = Path(IMG_LOW_DESTINATION_DIR_PATH)

# Function

### Function - add_boolean_col_to_df

In [77]:
def add_boolean_col_to_df(df: pd.DataFrame, col_name: str):
    df_return = df.copy()

    if col_name not in df.columns:
        df_return[col_name] = False
        print(f'>>> {df_return.shape} - Added column: {col_name}')
    
    return df_return

### Function - copy_availabe_images

In [78]:
def copy_availabe_images(df: pd.DataFrame, source_dir: Path, destination_dir: Path, col_name: str):
    df_return = df.copy()
    for index in df_return.index:
        image_pfad = df_return.at[index, 'image_path']
        print(image_pfad)
        print(type(image_pfad))
        source_path = source_dir / image_pfad

        if source_path.exists():
            try:
                destination_path = destination_dir / image_pfad
                if not destination_path.exists():
                    destination_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copyfile(source_path, destination_path)

                    df_return.at[index, col_name] = True
                    print(f'Image (at index: {index}) found and copied: {image_pfad}')
                else:
                    df_return.at[index, col_name] = True
                    print(f'Image (at index: {index}) exist already - no copy necessary : {image_pfad}')
            except Exception as e:
                print(f'Error Image (at index: {index}) copying image {image_pfad}: {e}')
        else:
            print(f"Image (at index: {index}) not found: {source_path}")        
    return df_return

# Image copy

### Variable - col_name

In [79]:
col_name = 'image_copied'

## high

### Apply - add_boolean_col_to_df

In [80]:
high_id_train = add_boolean_col_to_df( high_id_train, col_name)
high_id_val = add_boolean_col_to_df( high_id_val, col_name)
high_id_test = add_boolean_col_to_df( high_id_test, col_name)

high_ood_test = add_boolean_col_to_df( high_ood_test, col_name)

>>> (7213, 19) - Added column: image_copied


### Apply - copy_availabe_images

In [None]:
high_id_train = copy_availabe_images( high_id_train, img_high_source_dir, img_high_destination_dir, col_name )
high_id_val = copy_availabe_images( high_id_val, img_high_source_dir, img_high_destination_dir, col_name )
high_id_test = copy_availabe_images( high_id_test, img_high_source_dir, img_high_destination_dir, col_name )

high_ood_test = copy_availabe_images( high_ood_test, img_high_source_dir, img_high_destination_dir, col_name )

50c9509d-22c7-4a22-a47d-8c48425ef4a7/1265569795.jpg
<class 'str'>
Image (at index: 169) exist already - no copy necessary : 50c9509d-22c7-4a22-a47d-8c48425ef4a7/1265569795.jpg
50c9509d-22c7-4a22-a47d-8c48425ef4a7/1453464650.jpg
<class 'str'>
Image (at index: 435) exist already - no copy necessary : 50c9509d-22c7-4a22-a47d-8c48425ef4a7/1453464650.jpg
50c9509d-22c7-4a22-a47d-8c48425ef4a7/1500337474.jpg
<class 'str'>
Image (at index: 510) exist already - no copy necessary : 50c9509d-22c7-4a22-a47d-8c48425ef4a7/1500337474.jpg
50c9509d-22c7-4a22-a47d-8c48425ef4a7/1831167525.jpg
<class 'str'>
Image (at index: 880) exist already - no copy necessary : 50c9509d-22c7-4a22-a47d-8c48425ef4a7/1831167525.jpg
cca13f2c-0d2c-4c2f-93b9-4446c0cc1629/1846837462.jpg
<class 'str'>
Image (at index: 1178) exist already - no copy necessary : cca13f2c-0d2c-4c2f-93b9-4446c0cc1629/1846837462.jpg
50c9509d-22c7-4a22-a47d-8c48425ef4a7/1847478030.jpg
<class 'str'>
Image (at index: 1995) exist already - no copy necess

### Save df

In [None]:
high_id_train.to_csv( df_destination_dir/'high_id_train.csv')
high_id_val.to_csv( df_destination_dir/'high_id_val.csv')
high_id_test.to_csv( df_destination_dir/'high_id_test.csv')

high_ood_test.to_csv( df_destination_dir/'high_ood_test.csv')

## low

### Apply - add_boolean_col_to_df

In [83]:
low_id_test = add_boolean_col_to_df( low_id_test, col_name)

low_ood_test = add_boolean_col_to_df( low_ood_test, col_name)

>>> (2138, 8) - Added column: image_copied
>>> (698, 8) - Added column: image_copied


### rename file_name sufix to .png

In [49]:
low_id_test["identifier"] = low_id_test["identifier"].str.replace(".ts", ".png", regex=False)

low_ood_test["identifier"] = low_ood_test["identifier"].str.replace(".ts", ".png", regex=False)

### rename col_name to image_path

In [50]:
low_id_test = low_id_test.rename(columns={"identifier": "image_path"})

low_ood_test = low_ood_test.rename(columns={"identifier": "image_path"})

### Apply - copy_availabe_images

In [51]:
low_id_test = copy_availabe_images( low_id_test, img_low_source_dir, img_low_destination_dir, col_name )

low_ood_test = copy_availabe_images( low_ood_test, img_low_source_dir, img_low_destination_dir, col_name )

fgrained-1.png
<class 'str'>
Image (at index: 0) found and copied: fgrained-1.png
fgrained-5.png
<class 'str'>
Image (at index: 1) found and copied: fgrained-5.png
fgrained-12.png
<class 'str'>
Image (at index: 2) found and copied: fgrained-12.png
fgrained-20.png
<class 'str'>
Image (at index: 3) found and copied: fgrained-20.png
fgrained-33.png
<class 'str'>
Image (at index: 4) found and copied: fgrained-33.png
fgrained-43.png
<class 'str'>
Image (at index: 5) found and copied: fgrained-43.png
fgrained-3487.png
<class 'str'>
Image (at index: 6) found and copied: fgrained-3487.png
fgrained-3496.png
<class 'str'>
Image (at index: 7) found and copied: fgrained-3496.png
fgrained-3505.png
<class 'str'>
Image (at index: 8) found and copied: fgrained-3505.png
fgrained-3512.png
<class 'str'>
Image (at index: 9) found and copied: fgrained-3512.png
fgrained-3520.png
<class 'str'>
Image (at index: 10) found and copied: fgrained-3520.png
fgrained-3530.png
<class 'str'>
Image (at index: 11) found 

### Save df

In [52]:
low_id_test.to_csv( df_destination_dir/ 'low_id_test.csv' )

low_ood_test.to_csv( df_destination_dir/ 'low_ood_test.csv' )