In [None]:
import pandas as pd
import os
import shutil

In [None]:
##########################################################################################
#                               Data Cleaning                                            #
##########################################################################################
# For every species this cell:
#   1. keeps only the metadata rows whose "species_name" equals the species exactly,
#   2. writes a trimmed CSV (species_name, observation_id) to the output folder,
#   3. copies each kept observation's "*_<observation_id>_0.jpeg" image next to it.

base_path = "results"
output_path = "filtered_species"
os.makedirs(output_path, exist_ok=True)

species_list = [
    "Anas_platyrhynchos", "Ardea_herodias", "Bombycilla_cedrorum", "Branta_canadensis",
    "Bubo_virginianus", "Buteo_jamaicensis", "Calidris_alba", "Cardinalis_cardinalis",
    "Cathartes_aura", "Coragyps_atratus", "Corvus_cornix", "Dryocopus_pileatus",
    "Falco_sparverius", "Icterus_galbula", "Parus_major", "Passer_domesticus",
    "Quiscalus_mexicanus", "Sialia_sialis", "Sturnus_vulgaris", "Turdus_merula",
    "Turdus_migratorius", "Zenaida_macroura"
]


def _index_images_by_observation_id(filenames, suffix="_0.jpeg"):
    """Group image file names by the observation id embedded in them.

    A name is indexed only if it looks like ``<prefix>_<observation_id><suffix>``;
    the id is the text between the last underscore before ``suffix`` and
    ``suffix`` itself. This matches ``name.endswith(f"_{obs_id}{suffix}")`` for
    any id that contains no underscore.

    Args:
        filenames: iterable of file names (no directory part).
        suffix: trailing marker of the images to index.

    Returns:
        dict mapping observation id (str) to the list of matching file names,
        in the order they were given.
    """
    index = {}
    for name in filenames:
        if not name.endswith(suffix):
            continue
        _, separator, obs_id = name[:-len(suffix)].rpartition("_")
        # Without an underscore before the id the name has no "<prefix>_" part.
        if separator:
            index.setdefault(obs_id, []).append(name)
    return index


for species in species_list:
    species_name = species.replace("_", " ")
    metadata_file = os.path.join(base_path, f"{species}_metadata.csv")
    image_folder = os.path.join(base_path, f"{species}_images")
    output_image_folder = os.path.join(output_path, f"{species}_filtered_images")
    output_csv_file = os.path.join(output_path, f"{species}_filtered_metadata.csv")
    os.makedirs(output_image_folder, exist_ok=True)

    # CSV recreate
    df = pd.read_csv(metadata_file, encoding="latin1")
    df_filtered = df[df["species_name"] == species_name]
    df_filtered[["species_name", "observation_id"]].to_csv(output_csv_file, index=False)

    # Image copy
    # De-duplicate ids (keeping first-seen order) so a repeated CSV row does not
    # copy the same file again and inflate the "copied" count.
    observation_ids = list(dict.fromkeys(df_filtered["observation_id"].astype(str)))
    # List and index the folder once instead of rescanning it for every id.
    images_by_obs_id = _index_images_by_observation_id(os.listdir(image_folder))
    copied = 0

    for obs_id in observation_ids:
        for file in images_by_obs_id.get(obs_id, ()):
            src = os.path.join(image_folder, file)
            dst = os.path.join(output_image_folder, file)
            try:
                shutil.copyfile(src, dst)
                copied += 1
            except OSError as e:
                # Best effort: report the failed file and keep going.
                print(f"Error copying {file}: {e}")

    print(f"Copied {copied} image(s) for {species}, CSV rows: {len(df_filtered)}")


In [None]:
##########################################################################################
#                               Data Verifying                                           #
##########################################################################################
# Cross-check each species' filtered CSV against its filtered image folder and
# report observation ids that appear on only one side.

for species in species_list:
    csv_path = os.path.join(output_path, f"{species}_filtered_metadata.csv")
    img_path = os.path.join(output_path, f"{species}_filtered_images")

    df = pd.read_csv(csv_path)
    csv_obs_ids = set(df["observation_id"].astype(str))

    # The observation id is taken as the second-to-last "_"-separated field of
    # each "*_0.jpeg" name. A name with that ending always splits into at least
    # two fields, so the lookup cannot raise IndexError.
    image_obs_ids = {
        filename.split("_")[-2]
        for filename in os.listdir(img_path)
        if filename.endswith("_0.jpeg")
    }

    missing = csv_obs_ids - image_obs_ids   # in the CSV, but no image on disk
    extra = image_obs_ids - csv_obs_ids     # image on disk, but no CSV row

    if missing:
        print(f"{species} ? CSV: {len(csv_obs_ids)}, Images: {len(image_obs_ids)}, Missing: {len(missing)} ? {list(missing)[:5]}{'...' if len(missing) > 5 else ''}")

    if extra:
        # Images without a CSV row, e.g. files left in the folder by an earlier run.
        print(f"{species}: {len(extra)} image id(s) without a CSV row: {sorted(extra)[:5]}{'...' if len(extra) > 5 else ''}")


In [None]:
##########################################################################################
#                               Handle inaccurate part                                   #
##########################################################################################


# Observation ids that have a CSV row but no image, copied by hand from the output
# of the verification cell above. The downloader scrapes live data, so the missing
# ids can differ between runs: refresh this dict whenever the data is re-downloaded.
# NOTE(review): the verification cell prints at most five ids per species, so a
# species with more missing images needs its full id list collected separately.
missing_data = {
    "Ardea_herodias": ['97898', '3284364'],
    "Passer_domesticus": ["2966935"],
    "Turdus_migratorius": ["3629725"]
}

for species, missing_ids in missing_data.items():
    csv_path = os.path.join(output_path, f"{species}_filtered_metadata.csv")
    df = pd.read_csv(csv_path)
    # Compare as strings because the ids above are written as strings.
    is_missing = df["observation_id"].astype(str).isin(missing_ids)
    df_cleaned = df[~is_missing]
    df_cleaned.to_csv(csv_path, index=False)
    # Report the rows actually dropped; this is 0 on a re-run or when the
    # hand-copied ids no longer match the CSV.
    removed = int(is_missing.sum())
    print(f"Cleaned: {species} — removed {removed} rows")

