In [3]:
## Final code to download

import requests
import os
import time
import json
import random
import re

def write_metadata_to_folder(genus, details, parent_folder):
    """
    Write the metadata for a given genus to a metadata.json file inside the specified folder.
    """
    metadata = {genus: details}
    metadata_path = os.path.join(parent_folder, 'metadata.json')
    with open(metadata_path, 'w') as file:
        json.dump(metadata, file, indent=4)

def make_request_with_backoff(url, max_retries=5):
    for attempt in range(max_retries):
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response
        except (requests.HTTPError, requests.ConnectionError) as e:
            print(f"Error making request to {url}. Error: {e}")
            if attempt < max_retries - 1:  # i.e. if it's not the last attempt
                wait_time = (2 ** attempt) + (random.randint(0, 1000) / 1000)
                print(f"Waiting for {wait_time:.2f} seconds before retrying...")
                time.sleep(wait_time)
            else:
                print("Max retries reached. Moving on...")
                return None

def process_genus(name, details):
    print(f"Processing genus: {name}")
    
    # Define the API endpoint for fetching parent ID
    endpoint = f"https://api.inaturalist.org/v1/taxa/autocomplete?q={name}&per_page=50&locale=en&preferred_place_id="
    response = make_request_with_backoff(endpoint)
    if not response:
        return
    data = response.json()

    # Extract the parent_id for the genus
    parent_id = None
    for result in data['results']:
        if result['name'] == name:
            parent_id = result['parent_id']
            break

    # Define the API endpoint for fetching observations using the parent ID
    endpoint = f"https://api.inaturalist.org/v1/observations?verifiable=true&order_by=observations.id&order=desc&page=1&spam=false&taxon_id={parent_id}&locale=en&per_page=50"
    response = make_request_with_backoff(endpoint)
    if not response:
        return
    observations_data = response.json()

    # Extract the IDs from the observations data
    ids = [observation['id'] for observation in observations_data['results']]

    # Fetch and download images for each observation ID
    for observation_id in ids:
        image_urls, observation_name = get_observation_images(observation_id, name)
        if image_urls:
            download_images(image_urls, name, str(observation_id), observation_name)
        delay = random.uniform(3, 7)  # Random delay between 3 to 7 seconds
        print(f"Waiting for {delay:.2f} seconds before the next request...")
        time.sleep(delay)

    # After downloading images, write the metadata to the folder
    genus_folder_path = os.path.join("fungi_images", name)
    write_metadata_to_folder(name, details, genus_folder_path)

def get_observation_images(observation_id, name):
    # Make the request to the observation endpoint
    endpoint = f"https://api.inaturalist.org/v1/observations/{observation_id}?include_new_projects=true&preferred_place_id=&locale=en&ttl=-1"
    response = requests.get(endpoint)
    data = response.json()

    observation_photos = data.get('results', [{}])[0].get('observation_photos', [])
    
    # If observation_photos is empty, try to get the default_photo
    if not observation_photos:
        default_photo = data.get('results', [{}])[0].get('taxon', {}).get('default_photo', {})
        if default_photo:
            square_url = default_photo.get('square_url', '')
            medium_url = default_photo.get('medium_url', '')
            observation_photos = [{'photo': {'url': square_url}}, {'photo': {'url': medium_url}}]

    # Extract the image URLs and replace with 'large.jpeg' or 'medium.jpeg'
    image_urls = []
    for photo in observation_photos:
        base_url = photo.get('photo', {}).get('url', '')
        large_url = base_url.replace('square.jpeg', 'large.jpeg')
        medium_url = base_url.replace('square.jpeg', 'medium.jpeg')
        image_urls.extend([large_url, medium_url])
    
    observation_name = data.get('results', [{}])[0].get('taxon', {}).get('name', '')

    # Use a regular expression to match the desired pattern
    if not re.match(r'^\b' + re.escape(name) + r'\b( \b\w+\b)?$', observation_name):
        return [], observation_name

    return image_urls, observation_name


def download_images(image_urls, parent_folder, subfolder_name, observation_name):
    print(f"Downloading images for {observation_name}...")
    
    parent_folder = os.path.join("fungi_images", parent_folder)
    # Create a parent directory with the 'name' variable if it doesn't exist
    if not os.path.exists(parent_folder):
        os.makedirs(parent_folder)
    
    # Create a subdirectory for the observation ID inside the parent directory
    full_path = os.path.join(parent_folder, subfolder_name)
    if not os.path.exists(full_path):
        os.makedirs(full_path)

    for i, img_url in enumerate(image_urls):
        response = requests.get(img_url)
        file_name = os.path.join(full_path, f"{observation_name}_{i + 1}.jpeg")
        with open(file_name, 'wb') as file:
            file.write(response.content)

# Load the tags.json file
with open('tags.json', 'r') as file:
    tags = json.load(file)

# Process each genus in the tags.json file
for genus, details in tags.items():
    process_genus(genus, details)
    time.sleep(10)

Processing genus: Lachnum
Waiting for 3.12 seconds before the next request...
Downloading images for Lachnum virgineum...
Waiting for 3.33 seconds before the next request...
Waiting for 3.18 seconds before the next request...
Downloading images for Lachnum virgineum...
Waiting for 4.00 seconds before the next request...
Downloading images for Lachnum virgineum...
Waiting for 6.63 seconds before the next request...
Downloading images for Lachnum...
Waiting for 6.54 seconds before the next request...
Waiting for 3.27 seconds before the next request...
Downloading images for Lachnum varians...
Waiting for 5.24 seconds before the next request...
Waiting for 6.23 seconds before the next request...
Downloading images for Lachnum...
Waiting for 5.77 seconds before the next request...
Waiting for 3.58 seconds before the next request...
Waiting for 6.09 seconds before the next request...
Waiting for 3.82 seconds before the next request...
Waiting for 5.20 seconds before the next request...
Wait

SSLError: HTTPSConnectionPool(host='api.inaturalist.org', port=443): Max retries exceeded with url: /v1/observations/163519245?include_new_projects=true&preferred_place_id=&locale=en&ttl=-1 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1002)')))