# Notebook overview
Ensures images are present, downloads missing ones, checks image validity, and records results.

- Selects relevant CSVs and iterates over entries
- Reuses valid images already present in the cache directory, otherwise downloads the missing ones (thread pool, `MAX_WORKER` threads)
- Validates image files and updates tracking columns ('image_downloaded', 'image_download_fail_reason')
- Saves updated CSVs

The notebook was exported as a Python script and executed in a console inside a tmux session.

### imports

In [1]:
from pathlib import Path
import pandas as pd

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image, UnidentifiedImageError

### Variable - MAX_WORKER

In [2]:
# Number of worker threads passed to the ThreadPoolExecutor below;
# with 1 the downloads are processed one at a time.
MAX_WORKER = 1

### Path - df_source_dir

In [3]:
# Directory holding the metadata CSV files. The CSVs are read from here and the
# updated '<name>_check' files are written back into the same directory.
DF_SOURCE_DIR = r'/home/jleick/masterArbeitProjekt/data/ami_dataset/ami_gbif/fine-grained_classification/metadata'
df_source_dir = Path(DF_SOURCE_DIR)

### Path - img_source_dir

In [4]:
# Root directory for the image files. Each row's 'image_path' is resolved
# relative to this directory when checking for and saving images.
IMG_SOURCE_DIR = r'/home/jleick/masterArbeitProjekt/data/ami_images_temp/ami_gbif/cached_images'
img_source_dir = Path(IMG_SOURCE_DIR)

### Function - select_files and apply select_files

In [5]:
### Select csv files in given folder Path

def select_files(source_dir_path: Path):
    """Return the names of entries in *source_dir_path* selected by name.

    An entry is selected when its name contains both 'download' and '04'.
    Only the bare names are returned (not full paths), in the order produced
    by ``Path.iterdir()``. Entries are matched by name only, so a matching
    sub-directory would be returned as well.
    """
    # Adapt these fragments to select different CSV files in the folder.
    required_fragments = ('download', '04')

    return [
        entry.name
        for entry in source_dir_path.iterdir()
        if all(fragment in entry.name for fragment in required_fragments)
    ]

# Call the function to collect the names of the CSV files to process
selected_csv_files = select_files(df_source_dir)

# Print all selected files, one per line
# (note: despite its name, `folder_name` holds a file name here)
for folder_name in selected_csv_files:
    print(folder_name)

# DUPLICATED CODE (also exists in another file; adapted here)

04_ami-gbif_fine-grained_all_val_download.csv
04_ami-gbif_fine-grained_all_test_download.csv
04_ami-gbif_fine-grained_all_train_download.csv


### Function - is_valid_image

In [None]:
def is_valid_image(file_path: Path) -> bool:
    """Return True if Pillow can open and verify the file at *file_path*.

    ``Image.verify()`` checks the file structure without decoding the pixel
    data, so this is a cheap integrity check rather than a full decode.

    Args:
        file_path: Location of the image file to check.

    Returns:
        True when the file opens and verifies without error; False when it is
        missing, of an unknown format, or reported as broken by Pillow.
    """
    try:
        with Image.open(file_path) as img:
            img.verify()
    except (UnidentifiedImageError, OSError, SyntaxError, ValueError):
        # Pillow does not use a single exception type for broken files: besides
        # UnidentifiedImageError/OSError, some format plugins (e.g. PNG) signal
        # corrupt data from verify() with SyntaxError or ValueError. Any of
        # these means the file is not a usable image.
        return False
    return True

### Function - download_image

In [None]:
def download_image(session, index: int, df: pd.DataFrame, source_dir:Path, column_name: str, column_name_fail: str):
    """Ensure the image for one DataFrame row exists locally and record the outcome.

    The row's 'identifier' value is used as the download URL and its
    'image_path' value as the target path relative to *source_dir*. A file
    that already exists and passes ``is_valid_image`` is accepted as is;
    otherwise the image is downloaded (overwriting any existing file) and
    validated afterwards.

    Args:
        session: Object offering a requests-style
            ``get(url, stream=True, timeout=...)`` method.
        index: Label of the row in *df* to process.
        df: Table with 'identifier' and 'image_path' columns; it is updated
            in place.
        source_dir: Root directory under which image files are stored.
        column_name: Column that receives True on success, False on failure.
        column_name_fail: Column that receives "no error" on success or a
            failure reason otherwise.

    Returns:
        None. The result is written into *df* and a status line is printed.
    """
    url = df.at[index, 'identifier']
    image_path = df.at[index, 'image_path']

    source_path = source_dir / image_path
    source_path.parent.mkdir(parents=True, exist_ok=True)

    # An existing, valid file needs no download.
    if source_path.exists() and is_valid_image(source_path):
        df.at[index, column_name] = True
        df.at[index, column_name_fail] = "no error"
        print(f"{index}: Successful validaited {url}")
        return

    try:
        with session.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()

            # Stream the body to disk in chunks instead of holding it in memory.
            with open(source_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        # Validate the freshly downloaded file.
        if is_valid_image(source_path):
            df.at[index, column_name] = True
            df.at[index, column_name_fail] = "no error"
            print(f"{index}: Successful download {url}")
        else:
            # Remove the invalid image file so it is not picked up later.
            source_path.unlink(missing_ok=True)
            df.at[index, column_name] = False
            df.at[index, column_name_fail] = "invalid image file"
            print(f"{index}: Invalid image file after download {url}")

    except Exception as e:
        # A failed or interrupted transfer can leave an empty or partial file
        # behind. Remove it so a later run does not mistake it for a finished
        # download (verify() does not decode the pixel data).
        try:
            source_path.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup; the failure itself is recorded below
        df.at[index, column_name] = False
        df.at[index, column_name_fail] = str(e)
        print(f"{index}: Failed download {url}: {e}")

    return



### FOR TESTING
# df = pd.read_csv('/home/jleick/masterArbeitProjekt/final_release/data/datasets_created/with_image_resized/high_id_test', index_col=0)
# df_false = df[df['resized'] == False]

# csv_filename = 'df_false_test'

# with requests.Session() as session:
#     for index in df_false.index:
#         download_image(session, index, df_false, img_source_dir, 'image_downloaded', 'image_download_fail_reason')
#         df_false.to_csv(df_source_dir / f'{csv_filename}_check', na_rep="NULL", index=False)

### Function - download_images_with_executorpool

In [None]:
### Execute download_image function with ThreadPoolExecutor

def download_images_with_executorpool(max_workers: int, session: requests.Session, download_image: callable, df: pd.DataFrame, source_dir: Path, column_name_download: str, column_name_download_fail: str):
    """Run *download_image* once per row of *df* on a thread pool.

    One task is submitted for every label in ``df.index``; each task is called
    as ``download_image(session, label, df, source_dir, column_name_download,
    column_name_download_fail)``. The function returns after all tasks have
    finished. An exception raised by a task is printed and does not stop the
    remaining tasks.
    """
    # Arguments that are identical for every row.
    shared_args = (df, source_dir, column_name_download, column_name_download_fail)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [
            pool.submit(download_image, session, row_label, *shared_args)
            for row_label in df.index
        ]
        for finished in as_completed(pending):
            try:
                # result() re-raises whatever the task raised.
                finished.result()
            except Exception as exc:
                print(f'Error while retreving result from future: {exc}')

### Run

In [None]:
# One HTTP session is shared by all requests across all selected CSV files.
with requests.Session() as session:
    for csv_filename in selected_csv_files:
        csv_path = df_source_dir / csv_filename

        # The first CSV column is read as the index and then discarded in
        # favour of a fresh 0..n-1 index.
        df = pd.read_csv(csv_path, index_col=0).reset_index(drop=True)

        print(f'process: {csv_path} --- START')
        download_images_with_executorpool(
            MAX_WORKER,
            session,
            download_image,
            df,
            img_source_dir,
            'image_downloaded',
            'image_download_fail_reason',
        )

        # The updated table is written next to the input as '<csv_filename>_check';
        # missing values are stored as the text "NULL".
        df.to_csv(df_source_dir / f'{csv_filename}_check', na_rep="NULL", index=False)
        print(f'process: {csv_path} --- END\n\n\n')