In [33]:
import requests
import os
import re
import time

name = "Orbilia"

# Look up the taxon by name so that its parent taxon ID can be read from the results.
endpoint = f"https://api.inaturalist.org/v1/taxa/autocomplete?q={name}&per_page=50&locale=en&preferred_place_id="

# Retry up to 5 times, sleeping 1, 2, 4, ... seconds between attempts.
for attempt in range(5):
    try:
        response = requests.get(endpoint)
        response.raise_for_status()
        data = response.json()
        break
    except requests.RequestException as exc:
        # RequestException is the base class of HTTPError, so connection
        # errors and timeouts are retried as well as bad status codes.
        print(f"Taxon lookup failed (attempt {attempt + 1}): {exc}")
        time.sleep(2 ** attempt)
else:
    # Without this, exhausting the retries leaves `data` unbound and the
    # script fails further down with an unrelated-looking NameError.
    raise RuntimeError(f"Could not fetch taxon data for {name} after 5 attempts")

# Take the parent_id of the first result whose name matches exactly.
parent_id = None
for result in data['results']:
    if result['name'] == name:
        parent_id = result['parent_id']
        break

print(f"Parent ID for {name}: {parent_id}")

if parent_id is None:
    # Continuing would put the literal text "None" into the taxon_id query parameter.
    raise LookupError(f"No taxon named exactly {name!r} in the autocomplete results")

# Fetch the most recent verifiable observations filed under the parent taxon.
endpoint = f"https://api.inaturalist.org/v1/observations?verifiable=true&order_by=observations.id&order=desc&page=1&spam=false&taxon_id={parent_id}&locale=en&per_page=50"

for attempt in range(5):
    try:
        response = requests.get(endpoint)
        response.raise_for_status()
        observations_data = response.json()
        break
    except requests.RequestException as exc:
        print(f"Observation lookup failed (attempt {attempt + 1}): {exc}")
        time.sleep(2 ** attempt)
else:
    raise RuntimeError(f"Could not fetch observations for taxon {parent_id} after 5 attempts")

# Collect the observation IDs used by the download loop.
ids = [observation['id'] for observation in observations_data['results']]


def get_observation_images(observation_id):
    """Fetch one observation and return its photo URLs if it is an Orbilia record.

    Args:
        observation_id: Observation ID interpolated into the request URL.

    Returns:
        A ``(image_urls, observation_name)`` tuple. ``image_urls`` contains a
        ``large.jpeg`` and a ``medium.jpeg`` variant for every photo that has a
        URL. ``([], None)`` is returned when the response holds no results or
        when ``species_guess`` is not ``Orbilia`` optionally followed by one
        word.
    """
    endpoint = f"https://api.inaturalist.org/v1/observations/{observation_id}?include_new_projects=true&preferred_place_id=&locale=en&ttl=-1"
    response = requests.get(endpoint)
    data = response.json()

    # `results` can be missing or an empty list; indexing [0] directly on an
    # empty list raises IndexError, so fall back to an empty record.
    results = data.get('results') or [{}]
    observation = results[0]

    # species_guess may be None, so normalise it to an empty string.
    observation_name = observation.get('species_guess', '') or ''
    print("Observation name: ", observation_name)
    if not re.match(r'^Orbilia( \w+)?$', observation_name):
        return [], None

    image_urls = []
    for photo in observation.get('observation_photos') or []:
        base_url = (photo.get('photo') or {}).get('url') or ''
        if not base_url:
            # An entry without a URL would otherwise add two empty strings
            # that the downloader cannot fetch.
            continue
        image_urls.extend([
            base_url.replace('square.jpeg', 'large.jpeg'),
            base_url.replace('square.jpeg', 'medium.jpeg'),
        ])

    return image_urls, observation_name

def download_images(image_urls, observation_name, parent_folder, subfolder_name):
    """Download each URL in ``image_urls`` into ``parent_folder/subfolder_name``.

    Files are named ``{observation_name}_image_{n}.jpeg`` where ``n`` is the
    1-based position of the URL in ``image_urls``. A URL whose request fails
    is reported and skipped, so an error page is never saved as an image and
    one bad URL does not abort the rest of the batch.

    Args:
        image_urls: URLs to fetch; nothing is created when this is empty.
        observation_name: Prefix used for every file name.
        parent_folder: Top-level output directory.
        subfolder_name: Directory created inside ``parent_folder``.
    """
    if not image_urls:
        return

    # exist_ok=True creates any missing parents and tolerates existing directories.
    full_path = os.path.join(parent_folder, subfolder_name)
    os.makedirs(full_path, exist_ok=True)

    for i, img_url in enumerate(image_urls):
        try:
            response = requests.get(img_url)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {img_url}: {exc}")
            continue

        # Use the observation name as a prefix for the filename
        file_name = os.path.join(full_path, f"{observation_name}_image_{i + 1}.jpeg")
        with open(file_name, 'wb') as file:
            file.write(response.content)


# Walk every collected observation ID and save the photos of the accepted ones
for observation_id in ids:
    image_urls, observation_name = get_observation_images(observation_id)

    # Nothing to save when the lookup produced no URLs or no accepted name
    if not (image_urls and observation_name):
        continue

    download_images(image_urls, observation_name, name, str(observation_id))
    time.sleep(1)  # pause after each download batch to avoid hitting rate limits


NameError: name 'ids' is not defined

In [46]:
import requests
import os
import time
import re
import json

def get_observation_images(observation_id, genus=None, max_rate_limit_retries=5):
    """Fetch one observation and return its photo URLs if it belongs to the genus.

    Args:
        observation_id: Observation ID interpolated into the request URL.
        genus: Genus that ``species_guess`` must start with. When omitted, the
            notebook-level variable ``name`` is used, which must already exist
            in the namespace because this cell does not define it.
        max_rate_limit_retries: How many times an HTTP 429 response is retried
            before giving up on this observation.

    Returns:
        A ``(image_urls, observation_name)`` tuple, or ``([], None)`` when the
        request fails, the body is not JSON, or ``species_guess`` is not the
        genus optionally followed by one word.
    """
    endpoint = f"https://api.inaturalist.org/v1/observations/{observation_id}?include_new_projects=true&preferred_place_id=&locale=en&ttl=-1"

    # A bounded loop instead of unbounded recursion, so persistent rate
    # limiting cannot exhaust the call stack.
    rate_limit_wait = 10  # seconds; the printed message is built from this value
    for _ in range(max_rate_limit_retries + 1):
        response = requests.get(endpoint)
        if response.status_code != 429:
            break
        print(f"Rate limit reached. Waiting for {rate_limit_wait} seconds before retrying...")
        time.sleep(rate_limit_wait)
    else:
        print(f"Observation {observation_id} is still rate limited after {max_rate_limit_retries} retries. Skipping.")
        return [], None

    # Check the response status before decoding
    if response.status_code != 200:
        print(f"Error fetching observation {observation_id}. HTTP Status Code: {response.status_code}")
        print(f"Response Content: {response.text}")
        return [], None

    try:
        data = response.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"Invalid JSON response for observation {observation_id}.")
        print(f"Response Content: {response.text}")
        return [], None

    # `results` can be missing or empty; avoid IndexError on [0].
    results = data.get('results') or [{}]
    observation_data = results[0]
    print("observation_data: ", observation_data)

    # species_guess may be None, so normalise it to an empty string.
    observation_name = observation_data.get('species_guess', '') or ''
    target = name if genus is None else genus
    # re.escape keeps regex metacharacters in the genus from changing the pattern.
    if not re.match(r'^' + re.escape(target) + r'( \w+)?$', observation_name):
        return [], None

    observation_photos = observation_data.get('observation_photos') or []

    # If the observation has no photos, fall back to the taxon's default photo
    if not observation_photos:
        default_photo = (observation_data.get('taxon') or {}).get('default_photo') or {}
        if default_photo:
            observation_photos = [
                {'photo': {'url': default_photo.get('square_url', '')}},
                {'photo': {'url': default_photo.get('medium_url', '')}},
            ]

    # Build the list that is returned; the original returned `image_urls`
    # without ever defining it.
    candidates = []
    for photo in observation_photos:
        base_url = (photo.get('photo') or {}).get('url') or ''
        if not base_url:
            continue
        candidates.append(base_url.replace('square.jpeg', 'large.jpeg'))
        candidates.append(base_url.replace('square.jpeg', 'medium.jpeg'))
    # Drop duplicates while keeping order (the fallback entries can repeat a URL).
    image_urls = list(dict.fromkeys(candidates))

    return image_urls, observation_name

def download_images(image_urls, observation_name, parent_folder, subfolder_name):
    """Download each URL in ``image_urls`` into ``parent_folder/subfolder_name``.

    Files are named ``{observation_name}_image_{n}.jpeg`` where ``n`` is the
    1-based position of the URL in ``image_urls``. A URL whose request fails
    is reported and skipped, so an error page is never saved as an image and
    one bad URL does not abort the rest of the batch.

    Args:
        image_urls: URLs to fetch; nothing is created when this is empty.
        observation_name: Prefix used for every file name.
        parent_folder: Top-level output directory.
        subfolder_name: Directory created inside ``parent_folder``.
    """
    if not image_urls:
        return

    # exist_ok=True creates any missing parents and tolerates existing directories.
    full_path = os.path.join(parent_folder, subfolder_name)
    os.makedirs(full_path, exist_ok=True)

    for i, img_url in enumerate(image_urls):
        try:
            response = requests.get(img_url)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {img_url}: {exc}")
            continue

        # Use the observation name as a prefix for the filename
        file_name = os.path.join(full_path, f"{observation_name}_image_{i + 1}.jpeg")
        with open(file_name, 'wb') as file:
            file.write(response.content)

def process_genus(name):
    """Download photos for recent observations filed under the parent taxon of ``name``.

    Steps: look up the taxon whose name equals ``name`` and read its
    ``parent_id``; list the observations for that ID; then call
    ``get_observation_images`` and ``download_images`` for each observation.
    If a lookup keeps failing, or no exactly matching taxon is found, a
    message is printed and the function returns without downloading.

    Args:
        name: Genus name; also used as the output folder name.
    """
    max_attempts = 5

    # Define the API endpoint for fetching parent ID
    endpoint = f"https://api.inaturalist.org/v1/taxa/autocomplete?q={name}&per_page=50&locale=en&preferred_place_id="

    # Make the API call with exponential backoff
    for attempt in range(max_attempts):
        print(f"Attempting API call for genus {name} (Attempt {attempt + 1})")
        try:
            response = requests.get(endpoint)
            response.raise_for_status()
        except requests.RequestException as e:
            # Covers connection errors and timeouts as well as HTTP status errors.
            print(f"Error during API call: {e}")
            time.sleep(2 ** attempt)
            continue

        try:
            data = response.json()
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            print(f"Invalid JSON response for genus {name}.")
            print(f"HTTP Status Code: {response.status_code}")
            print(f"Response Content: {response.text}")
            time.sleep(2 ** attempt)  # back off here too instead of retrying immediately
            continue

        print(f"Successfully retrieved data for genus {name}")
        break
    else:
        # Every attempt failed, so `data` was never bound.
        print(f"Giving up on genus {name}: taxon lookup failed {max_attempts} times")
        return

    # Extract the parent_id for the genus (first exact name match)
    parent_id = None
    for result in data['results']:
        if result['name'] == name:
            parent_id = result['parent_id']
            break

    print(f"Parent ID for {name}: {parent_id}")
    if parent_id is None:
        # Continuing would put the literal text "None" into the taxon_id parameter.
        print(f"No taxon named exactly {name} was found. Skipping.")
        return

    # Define the API endpoint for fetching observations using the parent ID
    endpoint = f"https://api.inaturalist.org/v1/observations?verifiable=true&order_by=observations.id&order=desc&page=1&spam=false&taxon_id={parent_id}&locale=en&per_page=50"

    # Make the API call with exponential backoff for observations
    for attempt in range(max_attempts):
        print(f"Attempting API call for observations of genus {name} (Attempt {attempt + 1})")
        try:
            response = requests.get(endpoint)
            response.raise_for_status()
            observations_data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Error during API call for observations: {e}")
            time.sleep(2 ** attempt)
            continue

        print(f"Successfully retrieved observations for genus {name}")
        break
    else:
        print(f"Giving up on genus {name}: observation lookup failed {max_attempts} times")
        return

    # Extract the IDs from the observations data
    ids = [observation['id'] for observation in observations_data['results']]

    print(f"List of IDs for {name}: {ids}")

    # Fetch and download images for each observation ID
    for observation_id in ids:
        print(f"Fetching images for observation ID {observation_id}")
        image_urls, observation_name = get_observation_images(observation_id)
        print("Images urls: ", image_urls)

        if image_urls and observation_name:
            print(f"Downloading images for observation ID {observation_id}")
            download_images(image_urls, observation_name, name, str(observation_id))
            time.sleep(1)  # throttling: delay between downloads to avoid hitting rate limits

    print(f"Finished processing for genus: {name}")

# Load the JSON data from tags.json. The encoding is given explicitly so the
# file is not decoded with a platform-dependent default.
with open('tags.json', 'r', encoding='utf-8') as file:
    tags_data = json.load(file)

# Iterate through each genus in the JSON data, pausing briefly between genera
for genus in tags_data:
    process_genus(genus)
    time.sleep(1)


Attempting API call for genus Dacrymyces (Attempt 1)
Successfully retrieved data for genus Dacrymyces
Parent ID for Dacrymyces: 53279
Attempting API call for observations of genus Dacrymyces (Attempt 1)
Successfully retrieved observations for genus Dacrymyces
List of IDs for Dacrymyces: [179741128, 179740092, 179739991, 179738079, 179732657, 179730916, 179730842, 179730203, 179730164, 179726214, 179719310, 179718242, 179718161, 179717590, 179717282, 179714665, 179711208, 179710812, 179707503, 179707465, 179706560, 179703857, 179702165, 179702164, 179702141, 179692816, 179685750, 179678083, 179675570, 179673749, 179670188, 179669387, 179667297, 179666014, 179663953, 179662472, 179660418, 179658197, 179651875, 179651520, 179647530, 179644882, 179644425, 179644169, 179643649, 179642774, 179642257, 179642252, 179640834, 179639372]
Fetching images for observation ID 179741128
observation_data:  {'quality_grade': 'research', 'time_observed_at': '2023-08-20T13:10:00-04:00', 'taxon_geoprivacy'

KeyboardInterrupt: 

In [57]:
## Final code to download

import json
import os
import random
import re
import time

import requests

def write_metadata_to_folder(genus, details, parent_folder):
    """Write ``{genus: details}`` to ``metadata.json`` inside ``parent_folder``.

    Args:
        genus: Key under which ``details`` is stored.
        details: JSON-serialisable value stored for the genus.
        parent_folder: Directory that receives ``metadata.json``; it is
            created first when it does not exist yet.
    """
    # The folder may be missing (for example when nothing was saved into it
    # beforehand); opening the file would then raise FileNotFoundError.
    os.makedirs(parent_folder, exist_ok=True)

    metadata = {genus: details}
    metadata_path = os.path.join(parent_folder, 'metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as file:
        json.dump(metadata, file, indent=4)

def make_request_with_backoff(url, max_retries=5, timeout=30):
    """GET ``url``, retrying failed attempts with exponential backoff and jitter.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection fails instead of blocking indefinitely.

    Returns:
        The response of the first attempt that passes ``raise_for_status()``,
        or ``None`` when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            # RequestException is the base class of HTTPError, so connection
            # errors and timeouts are retried too. Only HTTP errors carry a
            # response with a status code.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            if status is not None:
                print(f"Error making request to {url}. HTTP Status Code: {status}")
            else:
                print(f"Error making request to {url}: {e}")

            if attempt < max_retries - 1:  # i.e. if it's not the last attempt
                # 1, 2, 4, ... seconds plus up to one second of random jitter
                wait_time = (2 ** attempt) + (random.randint(0, 1000) / 1000)
                print(f"Waiting for {wait_time} seconds before retrying...")
                time.sleep(wait_time)
            else:
                print("Max retries reached. Moving on...")
                return None
    return None

def process_genus(name, details):
    """Download photos for one genus and write its metadata file.

    Steps: look up the taxon whose name equals ``name`` and read its
    ``parent_id``; list the observations for that ID; download the images
    that ``get_observation_images`` returns for each observation; finally
    write ``details`` to ``fungi_images/<name>/metadata.json``.

    Args:
        name: Genus name; also used as the folder name under ``fungi_images``.
        details: Metadata stored for the genus.
    """
    print(f"Processing genus: {name}")

    # Define the API endpoint for fetching parent ID
    endpoint = f"https://api.inaturalist.org/v1/taxa/autocomplete?q={name}&per_page=50&locale=en&preferred_place_id="
    response = make_request_with_backoff(endpoint)
    if response is None:
        return
    data = response.json()

    # parent_id of the first result whose name matches exactly, else None
    parent_id = next(
        (result['parent_id'] for result in data['results'] if result['name'] == name),
        None,
    )
    if parent_id is None:
        # Continuing would put the literal text "None" into the taxon_id parameter.
        print(f"No taxon named exactly {name} was found. Skipping.")
        return

    # Define the API endpoint for fetching observations using the parent ID
    endpoint = f"https://api.inaturalist.org/v1/observations?verifiable=true&order_by=observations.id&order=desc&page=1&spam=false&taxon_id={parent_id}&locale=en&per_page=50"
    response = make_request_with_backoff(endpoint)
    if response is None:
        return
    observations_data = response.json()

    # Extract the IDs from the observations data
    ids = [observation['id'] for observation in observations_data['results']]

    # Fetch and download images for each observation ID
    for observation_id in ids:
        image_urls, observation_name = get_observation_images(observation_id, name)
        if image_urls:
            download_images(image_urls, name, str(observation_id), observation_name)
        time.sleep(1)  # throttling: delay between iterations to avoid hitting rate limits

    # The genus folder only exists if at least one observation was downloaded,
    # so create it before writing the metadata file.
    genus_folder_path = os.path.join("fungi_images", name)
    os.makedirs(genus_folder_path, exist_ok=True)
    write_metadata_to_folder(name, details, genus_folder_path)

def get_observation_images(observation_id, name):
    """Fetch one observation and return its photo URLs if its taxon matches ``name``.

    Args:
        observation_id: Observation ID interpolated into the request URL.
        name: Genus the observation's taxon name must start with.

    Returns:
        A ``(image_urls, observation_name)`` tuple. ``image_urls`` is empty
        when the request failed or when the taxon name is not ``name``
        optionally followed by one word. ``observation_name`` is the taxon
        name from the response, or ``''`` when none is available.
    """
    endpoint = f"https://api.inaturalist.org/v1/observations/{observation_id}?include_new_projects=true&preferred_place_id=&locale=en&ttl=-1"
    # Use the shared helper so this request gets the same retry and backoff
    # handling as the other API calls in this cell.
    response = make_request_with_backoff(endpoint)
    if response is None:
        return [], ''
    data = response.json()

    # `results` may be missing or empty, and `taxon` may be present but null;
    # normalise both to empty containers so the lookups below cannot raise.
    results = data.get('results') or [{}]
    observation = results[0]
    taxon = observation.get('taxon') or {}
    observation_photos = observation.get('observation_photos') or []

    # If observation_photos is empty, try to get the default_photo
    if not observation_photos:
        default_photo = taxon.get('default_photo') or {}
        if default_photo:
            observation_photos = [
                {'photo': {'url': default_photo.get('square_url', '')}},
                {'photo': {'url': default_photo.get('medium_url', '')}},
            ]

    # Derive the 'large.jpeg' and 'medium.jpeg' variants of every photo URL
    candidates = []
    for photo in observation_photos:
        base_url = (photo.get('photo') or {}).get('url') or ''
        if not base_url:
            continue
        candidates.append(base_url.replace('square.jpeg', 'large.jpeg'))
        candidates.append(base_url.replace('square.jpeg', 'medium.jpeg'))
    # Drop duplicates while keeping order (the fallback entries can repeat a URL).
    image_urls = list(dict.fromkeys(candidates))

    observation_name = taxon.get('name') or ''

    # Accept only "<name>" or "<name> <one word>"
    if not re.match(r'^\b' + re.escape(name) + r'\b( \b\w+\b)?$', observation_name):
        return [], observation_name

    return image_urls, observation_name


def download_images(image_urls, parent_folder, subfolder_name, observation_name):
    """Download each URL into ``fungi_images/<parent_folder>/<subfolder_name>``.

    Files are named ``{observation_name}_{n}.jpeg`` where ``n`` is the 1-based
    position of the URL in ``image_urls``. A URL whose request fails is
    reported and skipped, so an error page is never saved as an image and one
    bad URL does not abort the rest of the batch.

    Args:
        image_urls: URLs to fetch.
        parent_folder: Folder name created under ``fungi_images``.
        subfolder_name: Folder name created inside ``parent_folder``.
        observation_name: Prefix used for every file name.
    """
    print(f"Downloading images for {observation_name}...")

    # exist_ok=True creates any missing parents and tolerates existing directories.
    full_path = os.path.join("fungi_images", parent_folder, subfolder_name)
    os.makedirs(full_path, exist_ok=True)

    for i, img_url in enumerate(image_urls):
        try:
            response = requests.get(img_url)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {img_url}: {exc}")
            continue

        file_name = os.path.join(full_path, f"{observation_name}_{i + 1}.jpeg")
        with open(file_name, 'wb') as file:
            file.write(response.content)

# Load the tags.json file. The encoding is given explicitly so the file is
# not decoded with a platform-dependent default.
with open('tags.json', 'r', encoding='utf-8') as file:
    tags = json.load(file)

# Process each genus in the tags.json file, passing its details along as metadata
for genus, details in tags.items():
    process_genus(genus, details)

Processing genus: Amanita
Downloading images for Amanita...
Downloading images for Amanita fulva...
Downloading images for Amanita...
Downloading images for Amanita muscaria...
Downloading images for Amanita vaginata...
Downloading images for Amanita rubescens...
Downloading images for Amanita...
Downloading images for Amanita...
Downloading images for Amanita...
Downloading images for Amanita...
Downloading images for Amanita muscaria...
Downloading images for Amanita pantherina...
Downloading images for Amanita muscaria...
Downloading images for Amanita flavorubens...
Downloading images for Amanita...
Downloading images for Amanita citrina...
Downloading images for Amanita...
Downloading images for Amanita flavorubens...
Downloading images for Amanita...
Downloading images for Amanita rubescens...
Downloading images for Amanita muscaria...
Downloading images for Amanita rubescens...
Downloading images for Amanita bisporigera...
Downloading images for Amanita flavorubens...
Downloadin

KeyboardInterrupt: 