In [1]:
import os
import requests
from tqdm import tqdm
from PIL import Image
from io import BytesIO
import pandas as pd

API von iNaturalist nutzen um Bilder für Kategorien zu scrapen, die weniger Bilder als der Median (263) haben
("aves" rausgelassen weil war zwar unterrepräsentiert aber aves = vögel und zu ungenau fürs scraping)

In [2]:
species_list = [
     "francolins", "owls", "columbidae", "corvus capensis",
    "equus asinus", "eupodotis rueppellii", "lepus capensis", "mellivora capensis",
    "numididae", "oreotragus oreotragus", "otocyon megalotis", "procavia capensis",
    "pronolagus randensis", "tragelaphus strepsiceros", "vulpes chama", "papio anubis", "raphicerus campestris"
]

taxon_ids = {}

for species in species_list:
    url = f"https://api.inaturalist.org/v1/taxa?q={species}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        if data["results"]:
            taxon_id = data["results"][0]["id"]
            taxon_ids[species] = taxon_id
        else:
            print(f"No taxon ID found for {species}")

print("Taxon IDs:", taxon_ids)
print("Anzahl IDs:", taxon_ids.__len__())

Taxon IDs: {'francolins': 343747, 'owls': 19350, 'columbidae': 2715, 'corvus capensis': 8006, 'equus asinus': 148030, 'eupodotis rueppellii': 1505960, 'lepus capensis': 57560, 'mellivora capensis': 41834, 'numididae': 1426, 'oreotragus oreotragus': 42383, 'otocyon megalotis': 42095, 'procavia capensis': 43086, 'pronolagus randensis': 43159, 'tragelaphus strepsiceros': 42339, 'vulpes chama': 42055, 'papio anubis': 74832, 'raphicerus campestris': 42375}
Anzahl IDs: 17


In [8]:
# Funktion zum Abrufen und Speichern der Bilder
def download_images():
    for species, taxon_id in taxon_ids.items():
        print(f"\n🔍 Scraping {species} (ID: {taxon_id})...")
        save_dir = os.path.join("inat_images", species.replace(" ", "_"))
        os.makedirs(save_dir, exist_ok=True)  
        
        url = "https://api.inaturalist.org/v1/observations"
        params = {
            "taxon_id": taxon_id,
            "per_page": 200,  # Maximal 200 Bilder pro Anfrage
            "order": "desc",
            "order_by": "created_at",
            "captive": "false"  # NUR WILDE TIERE
        }

        response = requests.get(url, params=params)
        if response.status_code == 200:
            data = response.json()
            images = [obs["photos"][0]["url"] for obs in data["results"] if "photos" in obs and obs["photos"]]

            if not images:
                print(f"⚠️ Keine Bilder für {species} gefunden.")
                continue

            print(f"📸 {len(images)} Bilder gefunden, starte Download...")

            for i, img_url in tqdm(enumerate(images), total=len(images), desc=f"📥 {species}"):
                try:
                    img_data = requests.get(img_url).content
                    img = Image.open(BytesIO(img_data))
                    img.save(os.path.join(save_dir, f"{species}_{i+1}.jpg"))
                except Exception as e:
                    print(f"❌ Fehler beim Speichern von {img_url}: {e}")

# Starte den Bild-Download
download_images()
print("\n✅ Alle Bilder wurden erfolgreich heruntergeladen!")


🔍 Scraping papio anubis (ID: 74832)...
📸 192 Bilder gefunden, starte Download...


📥 papio anubis:  62%|██████▏   | 119/192 [00:52<00:30,  2.36it/s]

❌ Fehler beim Speichern von https://inaturalist-open-data.s3.amazonaws.com/photos/470087459/square.png: cannot write mode RGBA as JPEG


📥 papio anubis: 100%|██████████| 192/192 [01:26<00:00,  2.23it/s]



🔍 Scraping raphicerus campestris (ID: 42375)...
📸 199 Bilder gefunden, starte Download...


📥 raphicerus campestris:  36%|███▌      | 71/199 [00:31<00:56,  2.25it/s]

❌ Fehler beim Speichern von https://inaturalist-open-data.s3.amazonaws.com/photos/469117068/square.png: cannot write mode RGBA as JPEG


📥 raphicerus campestris: 100%|██████████| 199/199 [01:29<00:00,  2.23it/s]


✅ Alle Bilder wurden erfolgreich heruntergeladen!





In [9]:
observations = []

for species, taxon_id in taxon_ids.items():
    url = "https://api.inaturalist.org/v1/observations"
    params = {
        "taxon_id": taxon_id,
        "per_page": 10,  # Number of observations per request
        "order": "desc",
        "order_by": "created_at"
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        for obs in data["results"]:
            observations.append({
                "species": species,
                "taxon_id": taxon_id,
                "observation_id": obs["id"],
                "user": obs["user"]["login"],
                "date": obs["observed_on"],
                "location": obs["location"],
                "image": obs["photos"][0]["url"] if obs["photos"] else None
            })

# Display results
df = pd.DataFrame(observations)

df.head()

Unnamed: 0,species,taxon_id,observation_id,user,date,location,image
0,aves,3,265056703,vladimir_teplouhov,2025-03-12,"56.9524564201,72.640564218",https://inaturalist-open-data.s3.amazonaws.com...
1,aves,3,265056702,feralbeetle,2025-03-12,"42.2900979733,-71.1880051252",
2,aves,3,265056696,ludgera,2025-03-12,"49.0860588455,12.8714757598",
3,aves,3,265056687,cornerautenbach,2021-02-12,"-27.0287292,32.7872342",https://inaturalist-open-data.s3.amazonaws.com...
4,aves,3,265056681,noahwoaha,2025-03-11,"-12.8301606526,-69.4791064784",https://inaturalist-open-data.s3.amazonaws.com...


In [10]:
df['image'][0]

'https://inaturalist-open-data.s3.amazonaws.com/photos/476031305/square.jpeg'

In [None]:
# Pfade
inat_path = "inat_images"
dataset_path = "balanced_dataset_split/train"

# Mapping von iNaturalist-Namen (links) zu den lateinischen Namen in deinem Datensatz (rechts)
class_mapping = {
    "columbidae": "columbidae",
    "owls": "cn-owls",  # Korrektur: iNaturalist-Name ist "cn-owls"
    "francolins": "cn-francolins",  # Korrektur: iNaturalist-Name ist "cn-francolins"
    "corvus capensis": "corvus_capensis",
    "equus asinus": "equus_asinus",
    "eupodotis rueppellii": "eupodotis_rueppellii",
    "lepus capensis": "lepus_capensis",
    "mellivora capensis": "mellivora_capensis",
    "numididae": "numididae",
    "oreotragus oreotragus": "oreotragus_oreotragus",
    "otocyon megalotis": "otocyon_megalotis",
    "procavia capensis": "procavia_capensis",
    "pronolagus randensis": "pronolagus_randensis",
    "tragelaphus strepsiceros": "tragelaphus_strepsiceros",
    "vulpes chama": "vulpes_chama",
}

In [3]:
inat_path = "inat_images"

# Dictionary, um die Anzahl der Bilder pro Klasse zu speichern
class_counts = {}

# Iteriere über alle Unterordner (Klassen)
for class_name in os.listdir(inat_path):
    class_folder = os.path.join(inat_path, class_name)
    
    # Überprüfen, ob es wirklich ein Ordner ist
    if os.path.isdir(class_folder):
        # Liste alle Dateien im Ordner und filtere nur Bilddateien
        images = [file for file in os.listdir(class_folder) if file.lower().endswith(('.png', '.jpg', '.jpeg'))]
        class_counts[class_name] = len(images)

# Ausgabe
for class_name, count in class_counts.items():
    print(f"Klasse '{class_name}': {count} Bilder")

Klasse 'cn-francolins': 196 Bilder
Klasse 'cn-owls': 180 Bilder
Klasse 'columbidae': 183 Bilder
Klasse 'corvus_capensis': 196 Bilder
Klasse 'equus_asinus': 197 Bilder
Klasse 'eupodotis_rueppellii': 200 Bilder
Klasse 'lepus_capensis': 189 Bilder
Klasse 'mellivora_capensis': 197 Bilder
Klasse 'numididae': 195 Bilder
Klasse 'oreotragus_oreotragus': 197 Bilder
Klasse 'otocyon_megalotis': 192 Bilder
Klasse 'papio_anubis': 191 Bilder
Klasse 'procavia_capensis': 195 Bilder
Klasse 'pronolagus_randensis': 62 Bilder
Klasse 'raphiceros_campestris': 198 Bilder
Klasse 'tragelaphus_strepsiceros': 200 Bilder
Klasse 'vulpes_chama': 191 Bilder


In [4]:
# Anzahl Bilder insgesamt
total_images = sum(class_counts.values())
print(f"\nInsgesamt {total_images} Bilder in {len(class_counts)} Klassen gefunden.")


Insgesamt 3159 Bilder in 17 Klassen gefunden.
