# Notebook overview
Creates a summary of successfully extracted trap images, validates image files, and records the results.

- Loads extracted image metadata CSVs (binary and fine-grain)
- Verifies file existence and image validity (decoding) and adds an 'img_checked' boolean column
- Prints validation results and provides counts of valid/invalid images

# Preparation

### Imports

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

from PIL import Image, UnidentifiedImageError

### Path - result_dir_path

In [2]:
### Folder to save results
# The folder must already exist; the notebook stops here otherwise.
RESULT_DIR_PATH = '/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/trap/image_checked'
result_dir_path = Path(RESULT_DIR_PATH)
# is_dir() (instead of exists()) also rejects a regular file sitting at this path.
if not result_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {RESULT_DIR_PATH}")

In [3]:
# Folder holding the downloaded images of the binary dataset.
IMG_BINARY_SOURCE_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary'
img_binary_dir_path = Path(IMG_BINARY_SOURCE_DIR_PATH)
# is_dir() (instead of exists()) also rejects a regular file sitting at this path.
if not img_binary_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {IMG_BINARY_SOURCE_DIR_PATH}")


# Folder holding the downloaded images of the fine-grain dataset.
IMG_FINE_GRAIN_DIR_PATH = r'/home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain'
img_fine_grain_dir_path = Path(IMG_FINE_GRAIN_DIR_PATH)
# Same directory check as for the binary folder above.
if not img_fine_grain_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {IMG_FINE_GRAIN_DIR_PATH}")

### Path df - binary_path, fine_grain_path

In [4]:
# Locations of the two metadata CSV files (binary and fine-grain) that are
# loaded into DataFrames in the next cell.
binary_path = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/trap/traps_binary_img.csv'
fine_grain_path = r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/trap/traps_fine_grain_img.csv'

### Load df - binary_df, fine_grain_df

In [5]:
# Load both metadata tables. index_col=False tells pandas not to use the
# first CSV column as the index, so the rows get a default integer index.
binary_df = pd.read_csv(binary_path, index_col=False)
fine_grain_df = pd.read_csv(fine_grain_path, index_col=False)

# Functions

### Function - add_col_to_df

In [6]:
def add_col_to_df(df: pd.DataFrame, col_name: str):
    """Return a copy of ``df`` that is guaranteed to contain ``col_name``.

    The input frame is never modified. If the column is missing it is created
    and filled with ``False``, and a short notice with the new shape is
    printed. An already existing column is left as it is.

    Args:
        df: Source DataFrame.
        col_name: Name of the column to ensure.

    Returns:
        A copy of ``df`` that has the column ``col_name``.
    """
    extended = df.copy()

    # Nothing to add: hand back the plain copy.
    if col_name in extended.columns:
        return extended

    extended[col_name] = False
    print(f'>>> {extended.shape} - Added column: {col_name}')
    return extended

### Function - is_valid_image

In [7]:
# Checks whether a file is a valid image.
def is_valid_image(file_path: Path) -> bool:
    """Return True if Pillow can open and verify the file at ``file_path``.

    ``Image.verify()`` looks for a broken file without decoding the image
    data, so this is a structural check and not a full decode.

    Args:
        file_path: Path of the file to check.

    Returns:
        True if opening and verifying raised no error, False if the file is
        missing, unreadable, not recognised as an image, or reported as broken.
    """
    try:
        with Image.open(file_path) as img:
            img.verify()
    except (UnidentifiedImageError, OSError, SyntaxError):
        # SyntaxError is included because Pillow's PNG plugin reports broken
        # chunks / bad checksums that way during verify(); without it a single
        # corrupt PNG would abort the whole validation run.
        return False
    return True

### Function - check_images_in_df

In [16]:
def check_images_in_df(df: pd.DataFrame, source_dir: Path, column_name: str,
                       identifier_col: str = 'identifier') -> None:
    """Validate the image file of every row and store the result in ``df``.

    For each row the value of ``identifier_col`` is joined onto ``source_dir``.
    ``column_name`` is set to True only when that file exists and
    ``is_valid_image`` accepts it; in every other case it is set to False.
    One status line is printed per row.

    Args:
        df: DataFrame to check. It is modified in place.
        source_dir: Folder that contains the image files.
        column_name: Column that receives the boolean validation result.
        identifier_col: Column holding the file name relative to
            ``source_dir``. Defaults to ``'identifier'``.

    Returns:
        None. The results are written into ``df``.
    """
    for index in df.index:
        image_path = df.at[index, identifier_col]

        # A missing identifier (NaN/None, e.g. an empty CSV cell) cannot be
        # joined onto a path; mark the row as failed instead of crashing.
        if pd.isna(image_path):
            df.at[index, column_name] = False
            print(f"{index}: identifier is missing")
            continue

        source_path = source_dir / image_path

        if not source_path.exists():
            df.at[index, column_name] = False
            print(f"{index}: image file do not exist {source_path}")
        elif is_valid_image(source_path):
            df.at[index, column_name] = True
            print(f"{index}: Successful validaited {source_path}")
        else:
            df.at[index, column_name] = False
            print(f"{index}: validation faield {source_path}")



### FOR TESTING
# df = pd.read_csv('/home/jleick/masterArbeitProjekt/final_release/data/datasets_created/with_image_resized/high_id_test', index_col=0)
# df_false = df[df['resized'] == False]

# csv_filename = 'df_false_test'

# with requests.Session() as session:
#     for index in df_false.index:
#         download_image(session, index, df_false, img_source_dir, 'image_downloaded', 'image_download_fail_reason')
#         df_false.to_csv(df_source_dir / f'{csv_filename}_check', na_rep="NULL", index=False)

# check

### check - binary

In [17]:
# Name of the boolean column that receives the validation result.
col_name = 'img_checked'

In [18]:
# Make sure the result column exists (new columns start as False).
binary_df = add_col_to_df(binary_df, col_name)

In [19]:
# Validate every binary image; binary_df is updated in place and one status
# line is printed per row.
check_images_in_df(binary_df, img_binary_dir_path, col_name)

0: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-0.png
1: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-1.png
2: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-2.png
3: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-3.png
4: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-4.png
5: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-5.png
6: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-6.png
7: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/binary/binary-7.png
8: Successful validaited /home/jleick/masterArbeitProjekt/final_

In [20]:
# Number of rows per validation result (True / False) in the binary dataset.
binary_df[col_name].value_counts()

img_checked
True    51210
Name: count, dtype: int64

### check - fine_grain

In [21]:
# Result column for the fine-grain run (same name as in the binary run).
col_name = 'img_checked'

In [22]:
# Make sure the result column exists (new columns start as False).
fine_grain_df = add_col_to_df(fine_grain_df, col_name)

In [23]:
# Validate every fine-grain image; fine_grain_df is updated in place and one
# status line is printed per row.
check_images_in_df(fine_grain_df, img_fine_grain_dir_path, col_name)

0: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-0.png
1: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-1.png
2: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-2.png
3: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-3.png
4: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-4.png
5: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-5.png
6: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-6.png
7: Successful validaited /home/jleick/masterArbeitProjekt/final_release/data/images/download/low/fine_grain/fgrained-7.png
8: Successful va

In [24]:
# Number of rows per validation result (True / False) in the fine-grain dataset.
fine_grain_df[col_name].value_counts()

img_checked
True    14105
Name: count, dtype: int64