# In [1]:
import os
import json
import requests
from concurrent.futures import ThreadPoolExecutor

def download_image(genus_dir, media, record, metadata, *, timeout=30):
    """Download one media item into ``genus_dir`` and record its taxonomy.

    Args:
        genus_dir: Directory the image file is written to; it must already
            exist.
        media: Mapping describing the media item; its ``'identifier'`` value
            is used as the image URL.
        record: Mapping the taxonomy fields (``family``, ``genus``,
            ``species``, ``order``, ``phylum``, ``class``, ``kingdom``) are
            read from. Missing fields are stored as empty strings.
        metadata: Dict updated in place with one entry per saved image,
            keyed by the image file name.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the calling thread forever.

    Returns:
        None. Nothing is written or recorded when the media item has no URL,
        the server does not answer with status 200, or the request fails.
    """
    image_url = media.get('identifier')
    if not image_url:
        return

    # The file name is the last URL path segment with ".jpg" appended.
    # NOTE(review): two URLs that end in the same segment map to the same
    # file inside one genus directory and overwrite each other -- confirm
    # whether that can happen in the data.
    image_name = os.path.basename(image_url.split('/')[-1]) + ".jpg"
    image_path = os.path.join(genus_dir, image_name)

    try:
        response = requests.get(image_url, stream=True, timeout=timeout)
    except requests.RequestException as exc:
        # Treat an unreachable URL like a non-200 answer: skip this image
        # instead of aborting every other download.
        print("Skipping {}: {}".format(image_url, exc))
        return

    try:
        if response.status_code != 200:
            return
        with open(image_path, 'wb') as img_file:
            for chunk in response.iter_content(64 * 1024):
                img_file.write(chunk)
    except requests.RequestException as exc:
        # The stream broke part-way through; do not keep a truncated image.
        if os.path.exists(image_path):
            os.remove(image_path)
        print("Skipping {}: {}".format(image_url, exc))
        return
    finally:
        # A streamed response holds its connection until it is closed.
        response.close()

    # Add entry to metadata only once the file is completely on disk.
    metadata[image_name] = {
        "family": record.get('family', ""),
        "genus": record.get('genus', ""),
        "species": record.get('species', ""),
        "order": record.get('order', ""),
        "phylum": record.get('phylum', ""),
        "class": record.get('class', ""),
        "kingdom": record.get('kingdom', ""),
    }

# Load the JSON data (JSON text is UTF-8, so do not rely on the locale default)
with open("all_data_colombia copy.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Assuming the root of the JSON is a list of records
data_to_process = data

# Create a root directory for all images if it doesn't exist
os.makedirs('fungi_images', exist_ok=True)

# One metadata dict per genus directory, shared by every record of that
# genus. Starting a fresh dict for each record made metadata.json keep only
# the entries of the last record processed for the genus.
metadata_by_dir = {}

try:
    # Use ThreadPoolExecutor for parallel downloads
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Iterate over each record in the data
        for record in data_to_process:
            # `or` also covers a genus that is present but null/empty, which
            # a .get() default alone would not.
            genus_name = (record.get('genus') or "unknown_genus").replace(" ", "_")
            media_entries = record.get('media') or []

            # Create a directory for the genus if it doesn't exist
            genus_dir = os.path.join('fungi_images', genus_name)
            os.makedirs(genus_dir, exist_ok=True)

            metadata = metadata_by_dir.setdefault(genus_dir, {})

            # Parallel download of this record's images.
            # NOTE(review): waiting per record limits parallelism to the
            # media of a single record; it is kept so that files of different
            # records are never written concurrently.
            futures = [
                executor.submit(download_image, genus_dir, media, record, metadata)
                for media in media_entries
            ]
            for future in futures:
                future.result()
finally:
    # Save metadata once per genus, even if a download raised, so images
    # that were already fetched stay described.
    for genus_dir, metadata in metadata_by_dir.items():
        with open(os.path.join(genus_dir, "metadata.json"), "w") as meta_file:
            json.dump(metadata, meta_file, indent=4)

print("Images downloaded and metadata files created!")


# Output: Images downloaded and metadata files created!
