# Notebook overview
Summarizes successful and failed image downloads and analyzes failure reasons for the binary GBIF dataset.

- Loads download CSVs for train/val/test splits
- Checks 'image_downloaded' counts and aggregates 'image_download_fail_reason'

# Preparation

### Import

In [1]:
import os
from pathlib import Path

import pandas as pd

### Path - df_base_dir

In [None]:
# The base path on which the other paths in this code are built.
# The location is machine-specific, so it can be overridden without editing the
# notebook by setting the GBIF_BINARY_DOWNLOAD_DIR environment variable.
# When the variable is unset or empty, the original path is used as the default.

DF_BASE_DIR = (
    os.environ.get('GBIF_BINARY_DOWNLOAD_DIR')
    or r'/home/jleick/masterArbeitProjekt/final_release/data/datasets/origin/binary/download'
)
df_base_dir = Path(DF_BASE_DIR)

### Load dfs - high_id_train, high_id_val, high_id_test

In [3]:
# Read the three per-split download reports (train, val, test) in that order.
# `map` is lazy, but the tuple unpacking consumes it immediately, so all three
# CSVs are loaded by the time this statement finishes.
high_id_train, high_id_val, high_id_test = map(
    pd.read_csv,
    (
        df_base_dir / 'ami-gbif_binary_train_download.csv',
        df_base_dir / 'ami-gbif_binary_val_download.csv',
        df_base_dir / 'ami-gbif_binary_test_download.csv',
    ),
)

# check

### check - 'image_downloaded'

In [4]:
# Show, per split, how many rows have 'image_downloaded' set to each value.
# The f-strings use double quotes on the outside so that the single-quoted
# column name inside the braces also parses on Python versions before 3.12
# (reusing the same quote type inside an f-string is a SyntaxError there).
# The printed text is unchanged.
print(f"train id: {high_id_train['image_downloaded'].value_counts()} \n")
print(f"val id: {high_id_val['image_downloaded'].value_counts()} \n")
print(f"test id: {high_id_test['image_downloaded'].value_counts()} \n\n")

train id: image_downloaded
True     484699
False      5301
Name: count, dtype: int64 

val id: image_downloaded
True     69194
False      806
Name: count, dtype: int64 

test id: image_downloaded
True     138424
False      1576
Name: count, dtype: int64 




### check - 'image_download_fail_reason'

In [5]:
# Pattern that reduces a failure message to a coarse category:
#   - group 1: the first two colon-separated segments, when a further ':' follows
#     them (this drops whatever comes after that colon);
#   - group 2: the whole message otherwise.
FAIL_REASON_PATTERN = r'^([^:]+:[^:]+):|^(.*)$'


def count_fail_reason_categories(fail_reasons: pd.Series) -> pd.Series:
    """Count how often each coarse failure category occurs.

    Parameters
    ----------
    fail_reasons : pd.Series
        String column with one failure message per row.

    Returns
    -------
    pd.Series
        Occurrences per category, as produced by ``value_counts()``
        (most frequent first).
    """
    # Exactly one of the two capture groups matches per row, so the other
    # column is NaN; back-filling along the row moves the group-2 value into
    # column 0 whenever group 1 did not match.
    captured = fail_reasons.str.extract(FAIL_REASON_PATTERN, expand=True)
    return captured.bfill(axis=1).iloc[:, 0].value_counts()


# Double-quoted f-strings keep the single-quoted column name inside the braces
# valid on Python versions before 3.12. The printed text is unchanged.
print(f"train id: {count_fail_reason_categories(high_id_train['image_download_fail_reason'])} \n")
print(f"val id: {count_fail_reason_categories(high_id_val['image_download_fail_reason'])} \n")
print(f"test id: {count_fail_reason_categories(high_id_test['image_download_fail_reason'])} \n\n")

train id: 0
no error                                                                                       484699
404 Client Error: Not Found for url                                                              5165
HTTPSConnectionPool(host='monarch.calacademy.org', port=443): Max retries exceeded with url       121
HTTPConnectionPool(host='www.ngbif.org.ng', port=80): Max retries exceeded with url                 7
403 Client Error: Forbidden for url                                                                 4
HTTPConnectionPool(host='digitalgallery.nhm.org', port=8085): Max retries exceeded with url         4
Name: count, dtype: int64 

val id: 0
no error                                                                                       69194
404 Client Error: Not Found for url                                                              792
HTTPSConnectionPool(host='monarch.calacademy.org', port=443): Max retries exceeded with url       13
HTTPConnectionPool(host='www.ngbif.