# Notebook overview
Summarizes validated image downloads for the binary GBIF dataset, verifies downloaded file integrity, and reports failure reasons.

- Loads download-check CSVs for train/val/test splits
- Checks 'image_downloaded' counts and aggregates 'image_download_fail_reason'
- Prints per-split counts and download problems

# Preparation

### Import

In [1]:
import os
from pathlib import Path

import pandas as pd

### Path - df_base_dir

In [2]:
# Directory that holds the per-split download-check CSVs.
# The default is the original author's local path; set the environment
# variable GBIF_BINARY_DOWNLOAD_CHECK_DIR to run the notebook on another
# machine without editing this cell.
DF_BASE_DIR = os.environ.get(
    'GBIF_BINARY_DOWNLOAD_CHECK_DIR',
    r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/binary/download_check',
)
df_base_dir = Path(DF_BASE_DIR)

### Load dfs - high_id_train, high_id_val, high_id_test

In [3]:
# Read the download-check table of every split (train, val, test) from
# df_base_dir; the order of the file names matches the order of the targets.
high_id_train, high_id_val, high_id_test = (
    pd.read_csv(df_base_dir.joinpath(csv_name))
    for csv_name in (
        'ami-gbif_binary_train_download_check.csv',
        'ami-gbif_binary_val_download_check.csv',
        'ami-gbif_binary_test_download_check.csv',
    )
)

# check

### check - 'image_downloaded'

In [4]:
# Print, for each split, how often each value of 'image_downloaded' occurs.
# The counts are computed outside the f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
for split_name, split_df in (
    ('train', high_id_train),
    ('val', high_id_val),
    ('test', high_id_test),
):
    downloaded_counts = split_df['image_downloaded'].value_counts()
    print(f'{split_name} id: {downloaded_counts} \n')
# Extra blank line after the last split (the original test line ended in '\n\n').
print()

train id: image_downloaded
True     484688
False      5312
Name: count, dtype: int64 

val id: image_downloaded
True     69197
False      803
Name: count, dtype: int64 

test id: image_downloaded
True     138426
False      1574
Name: count, dtype: int64 




### check - 'image_download_fail_reason'

In [5]:
def summarize_fail_reasons(split_df):
    """Count the download failure reasons of one split by shortened reason.

    Each 'image_download_fail_reason' string is shortened with a regex: if it
    starts with two non-empty, colon-terminated segments, only the text before
    the second colon is kept (in the output below this drops the trailing
    detail part of the message); otherwise the whole string is kept unchanged.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Frame with an 'image_download_fail_reason' column of strings.

    Returns
    -------
    pandas.Series
        Number of rows per shortened reason, as produced by `value_counts()`.
    """
    extracted = split_df['image_download_fail_reason'].str.extract(
        r'^([^:]+:[^:]+):|^(.*)$', expand=True
    )
    # Column 0 holds the shortened form, column 1 the whole-string fallback.
    # Back-filling across the columns moves the fallback into column 0
    # wherever the first alternative of the regex did not match.
    return extracted.bfill(axis=1).iloc[:, 0].value_counts()


# The summary is built by the helper instead of inline in the f-string:
# nesting the same quote character (and a raw-string regex) inside a
# replacement field is a SyntaxError before Python 3.12.
for split_name, split_df in (
    ('train', high_id_train),
    ('val', high_id_val),
    ('test', high_id_test),
):
    print(f'{split_name} id: {summarize_fail_reasons(split_df)} \n')
# Extra blank line after the last split (the original test line ended in '\n\n').
print()

train id: 0
no error                                                                                        484688
404 Client Error: Not Found for url                                                               5098
HTTPSConnectionPool(host='monarch.calacademy.org', port=443): Max retries exceeded with url        121
invalid image file                                                                                  77
HTTPConnectionPool(host='www.ngbif.org.ng', port=80): Max retries exceeded with url                  7
403 Client Error: Forbidden for url                                                                  4
HTTPConnectionPool(host='digitalgallery.nhm.org', port=8085): Max retries exceeded with url          4
HTTPSConnectionPool(host='files.plutof.ut.ee', port=443): Read timed out. (read timeout=300)         1
Name: count, dtype: int64 

val id: 0
no error                                                                                       69197
404 Client Error: Not Fou