In [None]:
import json

# Load the saved JSON document.
# The encoding is pinned to UTF-8 so the result does not depend on the platform default.
with open("nonsense.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract observations from the 'results' key (a missing or null key means "no observations").
observations = data.get("results") or []

# Collect the photo URLs of every observation whose taxon name is exactly "Nitschkia".
nitschkia_images = []

for entry in observations:
    # If 'taxon' is present but null, entry.get("taxon", {}) returns None and the
    # chained .get() would raise AttributeError; `or {}` covers both missing and null.
    taxon = entry.get("taxon") or {}
    if taxon.get("name") != "Nitschkia":
        continue
    for photo in entry.get("photos") or []:
        url = photo.get("url")
        # Photos without a URL are skipped instead of being listed as "None".
        if url:
            nitschkia_images.append(url)

# Print the extracted image URLs, numbered from 1.
for idx, url in enumerate(nitschkia_images, 1):
    print(f"Image {idx}: {url}")


In [None]:
import requests
import os
import time

name = "Orbilia"


def _get_json_with_backoff(url, attempts=5, timeout=30):
    """Fetch `url` and return its decoded JSON body, retrying with exponential backoff.

    Args:
        url: Full URL to request with GET.
        attempts: Maximum number of tries before giving up.
        timeout: Seconds to wait for the server on each try, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The parsed JSON payload of the first successful response.

    Raises:
        RuntimeError: If every attempt failed; the last request error is chained.
    """
    last_error = None
    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as error:
            # Covers HTTP error statuses as well as connection failures and timeouts.
            last_error = error
            if attempt < attempts - 1:
                # Waits 1, 2, 4, ... seconds; no pointless sleep after the final try.
                time.sleep(2 ** attempt)
    raise RuntimeError(f"Request to {url} failed after {attempts} attempts") from last_error


# Define the API endpoint for fetching parent ID
endpoint = f"https://api.inaturalist.org/v1/taxa/autocomplete?q={name}&per_page=50&locale=en&preferred_place_id="

# Failing loudly here prevents the cell from silently reusing a stale `data`
# variable left behind by an earlier cell when all retries fail.
data = _get_json_with_backoff(endpoint)

# Extract the parent_id of the first result whose name matches exactly
parent_id = None
for result in data.get('results', []):
    if result.get('name') == name:
        parent_id = result.get('parent_id')
        break

print(f"Parent ID for {name}: {parent_id}")

if parent_id is None:
    # Without this guard the next request would be sent with taxon_id=None.
    raise LookupError(f"No taxon named {name!r} with a parent_id was found")

# Define the API endpoint for fetching observations using the parent ID
# NOTE(review): this lists observations of the *parent* taxon, not of `name` itself —
# confirm that is intended rather than querying the matched taxon directly.
endpoint = f"https://api.inaturalist.org/v1/observations?verifiable=true&order_by=observations.id&order=desc&page=1&spam=false&taxon_id={parent_id}&locale=en&per_page=50"

observations_data = _get_json_with_backoff(endpoint)

# Extract the IDs from the observations data
ids = [observation['id'] for observation in observations_data.get('results', [])]

print(f"List of IDs: {ids}")

def get_observation_images(observation_id):
    """Return the large and medium photo URLs of one iNaturalist observation.

    Args:
        observation_id: ID of the observation to look up.

    Returns:
        A list of URL strings: for each photo the 'large' variant followed by
        the 'medium' variant. The list is empty when the request fails, the
        observation is not found, or it has no photos.
    """
    # Make the request to the observation endpoint
    endpoint = f"https://api.inaturalist.org/v1/observations/{observation_id}?include_new_projects=true&preferred_place_id=&locale=en&ttl=-1"
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(endpoint, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        # Best effort: report the failure and let the caller move on to the next observation.
        print(f"Skipping observation {observation_id}: {error}")
        return []

    # An empty 'results' list would make results[0] raise IndexError, so fall back to [{}].
    results = data.get('results') or [{}]
    observation_photos = results[0].get('observation_photos') or []

    image_urls = []
    for photo in observation_photos:
        base_url = (photo.get('photo') or {}).get('url') or ''
        if not base_url:
            # Nothing to download for a photo record without a URL.
            continue
        # Swap only the size segment so the rewrite works regardless of the file
        # extension (the original replaced the literal 'square.jpeg' only).
        large_url = base_url.replace('/square.', '/large.')
        medium_url = base_url.replace('/square.', '/medium.')

        image_urls.append(large_url)
        if medium_url != large_url:
            # When the URL has no size segment both variants are identical; keep one copy.
            image_urls.append(medium_url)

    return image_urls

def download_images(image_urls, parent_folder, subfolder_name):
    """Download every URL in `image_urls` into `parent_folder/subfolder_name`.

    Files are named image_<n>.jpeg, where <n> is the 1-based position of the
    URL in `image_urls`. URLs that cannot be fetched are reported and skipped,
    which leaves a gap in the numbering.

    Args:
        image_urls: Iterable of image URLs to fetch.
        parent_folder: Top-level output directory (created if missing).
        subfolder_name: Directory inside `parent_folder` (created if missing).
    """
    # Create the parent directory and the per-observation subdirectory in one step;
    # exist_ok avoids the separate existence checks.
    full_path = os.path.join(parent_folder, subfolder_name)
    os.makedirs(full_path, exist_ok=True)

    for i, img_url in enumerate(image_urls):
        try:
            # The timeout keeps one stalled download from hanging the whole run.
            response = requests.get(img_url, timeout=30)
            # Without this check an error page would be saved to disk as a .jpeg.
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping {img_url}: {error}")
            continue
        file_name = os.path.join(full_path, f'image_{i + 1}.jpeg')
        with open(file_name, 'wb') as file:
            file.write(response.content)

# Save the photos of every collected observation under <name>/<observation id>/
for observation_id in ids:
    download_images(get_observation_images(observation_id), name, str(observation_id))
    # throttling: wait one second before the next observation to avoid hitting rate limits
    time.sleep(1)


In [16]:
import requests
import os
import re
import time

name = "Orbilia"


def _get_json_with_backoff(url, attempts=5, timeout=30):
    """Fetch `url` and return its decoded JSON body, retrying with exponential backoff.

    Args:
        url: Full URL to request with GET.
        attempts: Maximum number of tries before giving up.
        timeout: Seconds to wait for the server on each try, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The parsed JSON payload of the first successful response.

    Raises:
        RuntimeError: If every attempt failed; the last request error is chained.
    """
    last_error = None
    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as error:
            # Covers HTTP error statuses as well as connection failures and timeouts.
            last_error = error
            if attempt < attempts - 1:
                # Waits 1, 2, 4, ... seconds; no pointless sleep after the final try.
                time.sleep(2 ** attempt)
    raise RuntimeError(f"Request to {url} failed after {attempts} attempts") from last_error


# Define the API endpoint for fetching parent ID
endpoint = f"https://api.inaturalist.org/v1/taxa/autocomplete?q={name}&per_page=50&locale=en&preferred_place_id="

# Failing loudly here prevents the cell from silently reusing a stale `data`
# variable left behind by an earlier cell when all retries fail.
data = _get_json_with_backoff(endpoint)

# Extract the parent_id of the first result whose name matches exactly
parent_id = None
for result in data.get('results', []):
    if result.get('name') == name:
        parent_id = result.get('parent_id')
        break

print(f"Parent ID for {name}: {parent_id}")

if parent_id is None:
    # Without this guard the next request would be sent with taxon_id=None.
    raise LookupError(f"No taxon named {name!r} with a parent_id was found")

# Define the API endpoint for fetching observations using the parent ID
# NOTE(review): this lists observations of the *parent* taxon, not of `name` itself —
# confirm that is intended rather than querying the matched taxon directly.
endpoint = f"https://api.inaturalist.org/v1/observations?verifiable=true&order_by=observations.id&order=desc&page=1&spam=false&taxon_id={parent_id}&locale=en&per_page=50"

observations_data = _get_json_with_backoff(endpoint)

# Extract the IDs from the observations data
ids = [observation['id'] for observation in observations_data.get('results', [])]


def get_observation_images(observation_id):
    """Return the photo URLs and name of an observation that matches the target genus.

    The observation's 'species_guess' must be the module-level `name` on its
    own or `name` followed by a single word (e.g. "<name> <epithet>");
    anything else is rejected.

    Args:
        observation_id: ID of the observation to look up.

    Returns:
        A tuple (image_urls, observation_name). For a matching observation,
        image_urls holds the 'large' then 'medium' URL of each photo and
        observation_name is its species_guess. When the request fails or the
        name does not match, the tuple is ([], None).
    """
    # Make the request to the observation endpoint
    endpoint = f"https://api.inaturalist.org/v1/observations/{observation_id}?include_new_projects=true&preferred_place_id=&locale=en&ttl=-1"
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(endpoint, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        # Best effort: report the failure and let the caller move on to the next observation.
        print(f"Skipping observation {observation_id}: {error}")
        return [], None

    # An empty 'results' list would make results[0] raise IndexError, so fall back to [{}].
    results = data.get('results') or [{}]
    observation = results[0]

    # Check the name of the observation
    observation_name = observation.get('species_guess', '') or ''  # Default to an empty string if None
    print("Observation name: ", observation_name)
    # Build the pattern from `name` so changing the target genus at the top of the
    # cell also changes the filter (the pattern used to hard-code "Orbilia").
    if not re.fullmatch(rf'{re.escape(name)}( \w+)?', observation_name):
        return [], None  # Return an empty list and None for the observation name

    # Extract the observation photos
    observation_photos = observation.get('observation_photos') or []

    image_urls = []
    for photo in observation_photos:
        base_url = (photo.get('photo') or {}).get('url') or ''
        if not base_url:
            # Nothing to download for a photo record without a URL.
            continue
        # Swap only the size segment so the rewrite works regardless of the file
        # extension (the original replaced the literal 'square.jpeg' only).
        large_url = base_url.replace('/square.', '/large.')
        medium_url = base_url.replace('/square.', '/medium.')

        image_urls.append(large_url)
        if medium_url != large_url:
            # When the URL has no size segment both variants are identical; keep one copy.
            image_urls.append(medium_url)

    return image_urls, observation_name

def download_images(image_urls, observation_name, parent_folder, subfolder_name):
    """Download every URL in `image_urls` into `parent_folder/subfolder_name`.

    Files are named <observation_name>_image_<n>.jpeg, where <n> is the 1-based
    position of the URL in `image_urls`. URLs that cannot be fetched are
    reported and skipped, which leaves a gap in the numbering.

    Args:
        image_urls: Sequence of image URLs to fetch; nothing happens (no
            directories are created) when it is empty.
        observation_name: Prefix used for every file name.
        parent_folder: Top-level output directory (created if missing).
        subfolder_name: Directory inside `parent_folder` (created if missing).
    """
    # If there are no image URLs, return immediately
    if not image_urls:
        return

    # Create the parent directory and the per-observation subdirectory in one step;
    # exist_ok avoids the separate existence checks.
    full_path = os.path.join(parent_folder, subfolder_name)
    os.makedirs(full_path, exist_ok=True)

    for i, img_url in enumerate(image_urls):
        try:
            # The timeout keeps one stalled download from hanging the whole run.
            response = requests.get(img_url, timeout=30)
            # Without this check an error page would be saved to disk as a .jpeg.
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping {img_url}: {error}")
            continue

        # Use the observation name as a prefix for the filename
        file_name = os.path.join(full_path, f"{observation_name}_image_{i + 1}.jpeg")

        with open(file_name, 'wb') as file:
            file.write(response.content)


# Fetch and download images for each observation ID
for observation_id in ids:
    image_urls, observation_name = get_observation_images(observation_id)

    # Only download images if there are any and if the observation name is not None
    if image_urls and observation_name:
        download_images(image_urls, observation_name, name, str(observation_id))

    # throttling: pause after *every* observation, including the ones that were
    # filtered out — each iteration makes an API request whether or not anything
    # is downloaded, so sleeping only after a download left most calls unthrottled.
    time.sleep(1)


Parent ID for Orbilia: 351113
Observation name:  Orbilia
Observation name:  Orbilia
Observation name:  Orbilia


KeyboardInterrupt: 