In [9]:
import os
import json
import pandas as pd
import requests
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor
import subprocess

# Configuration
# Endpoint queried for observations; every request below goes to this URL.
BASE_URL = "https://api.inaturalist.org/v1/observations"

# Page size, kept as a named constant so the batch-size arithmetic in the
# comments below stays tied to the value actually sent to the API.
PER_PAGE = 200

# Query-string parameters sent with every observations request.
# "page" is overwritten by the fetch loop; the value here is only a default.
PARAMS = {
    "verifiable": "true",
    "order_by": "id",
    "order": "desc",  # Get newest observations first
    "page": 1,
    "spam": "false",
    "place_id": 6712,  # NOTE(review): numeric place filter - confirm which region this is
    "iconic_taxa[]": "Fungi",
    "photos": "true",
    "locale": "en-US",
    "per_page": PER_PAGE,
    "no_total_hits": "true"
}

# Each batch covers up to 10k observations (50 pages * 200 per page).
# NOTE(review): presumably sized to stay within the API's paging limit - confirm.
MAX_PAGES_PER_BATCH = 50

# Number of sliding-window batches: 5 batches * 10k = up to 50k observations.
NUM_BATCHES = 5

# Number of threads for downloading images. Every worker launches its own curl
# process, so this is also the number of simultaneous connections to the image
# host; keep it modest to avoid exhausting the notebook VM or hammering the
# server. Raise it only if downloads are demonstrably the bottleneck.
MAX_WORKERS = 32

# --- Mutable state shared by the whole run --------------------------------

# Rows destined for the CSV file; one list per downloaded image.
csv_data = list()

# Per-taxon counter: taxon_id -> how many times that taxon has been seen.
appearance_count = dict()

# Sliding-window cursor. None means "no lower bound yet" (first batch).
id_below = None

# --- Output location ------------------------------------------------------

# Root folder for all downloaded images; created up front if missing.
base_image_dir = "images_mushrooms"
os.makedirs(name=base_image_dir, exist_ok=True)

# Seconds to wait for the observations API before giving up on a page, so a
# stalled connection cannot hang the notebook forever.
_API_TIMEOUT_SECONDS = 30


def _image_filename(taxon_id, appearance, photo_index, image_ext):
    """Build a file name that is unique for every photo of an observation.

    The first photo keeps the ``<taxon_id>_<appearance><ext>`` form. Further
    photos of the same observation get an extra ``_<photo_index>`` suffix so
    they no longer overwrite each other on disk or share a CSV file name.
    """
    suffix = "" if photo_index == 0 else f"_{photo_index}"
    return f"{taxon_id}_{appearance}{suffix}{image_ext}"


def _download_image(image_url, image_path):
    """Download ``image_url`` to ``image_path`` with curl.

    ``--fail`` makes curl exit non-zero on HTTP errors instead of saving the
    error page as if it were an image, so ``check=True`` can actually raise
    ``subprocess.CalledProcessError`` for failed downloads.
    """
    return subprocess.run(
        ["curl", "--fail", "--silent", "--show-error", "--location",
         "-o", image_path, image_url],
        check=True,
    )


# Fetch observations batch by batch. Within a batch, pages 1..MAX_PAGES_PER_BATCH
# are requested; between batches the window slides down via ``id_below`` so the
# next batch starts below the lowest observation ID already seen.
for batch in range(NUM_BATCHES):
    print(f"Starting batch {batch + 1}...")
    batch_dir = os.path.join(base_image_dir, f"batch_{batch + 1}")
    os.makedirs(batch_dir, exist_ok=True)
    lowest_id = None  # Track lowest ID in this batch
    exhausted = False  # Set once the API returns an empty page

    for page in range(1, MAX_PAGES_PER_BATCH + 1):
        print(f"Fetching page {page} of batch {batch + 1}")
        # Build per-request parameters instead of mutating the shared PARAMS.
        request_params = {**PARAMS, "page": page}
        if id_below:
            request_params["id_below"] = id_below  # Apply sliding window

        try:
            response = requests.get(BASE_URL, params=request_params, timeout=_API_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            # Treat network errors like a failed page so collected data is kept.
            print(f"Request error on page {page} of batch {batch + 1}: {exc}")
            break
        if response.status_code != 200:
            print(f"Failed to fetch data for page {page} in batch {batch + 1}")
            break

        observations = response.json().get("results", [])
        if not observations:
            print("No more observations found.")
            exhausted = True
            break

        # Find lowest observation ID seen so far in this batch
        min_id_in_page = min(obs["id"] for obs in observations)
        if lowest_id is None or min_id_in_page < lowest_id:
            lowest_id = min_id_in_page

        # Each entry pairs a download future with the CSV row it belongs to.
        download_tasks = []
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            for obs in observations:
                taxon = obs.get("taxon")
                if not taxon:
                    # No taxon to file the images under; skip instead of crashing.
                    continue

                obs_id = obs["id"]
                taxon_id = taxon["id"]
                # "geojson" can be present but null, so `.get("geojson", {})`
                # is not enough; fall back explicitly on any falsy value.
                coordinates = (obs.get("geojson") or {}).get("coordinates") or [None, None]
                threatened = taxon.get("threatened", False)

                taxon_dir = os.path.join(batch_dir, str(taxon_id))
                os.makedirs(taxon_dir, exist_ok=True)

                # Increment appearance count for the taxon_id (once per observation)
                appearance_count[taxon_id] = appearance_count.get(taxon_id, 0) + 1
                appearance = appearance_count[taxon_id]

                # Process images
                for photo_index, photo in enumerate(obs.get("photos") or []):
                    photo_url = photo.get("url")
                    if not photo_url:
                        continue

                    # Construct medium image URL by replacing 'square' with 'medium'
                    image_url = photo_url.replace("square", "medium")
                    image_ext = os.path.splitext(urlparse(image_url).path)[-1]  # Get file extension
                    image_filename = _image_filename(taxon_id, appearance, photo_index, image_ext)
                    image_path = os.path.join(taxon_dir, image_filename)

                    # CSV stores latitude before longitude; geojson order is [lon, lat].
                    row = [obs_id, taxon_id, coordinates[1], coordinates[0], image_filename, threatened]
                    download_tasks.append((executor.submit(_download_image, image_url, image_path), row))

            # Wait for all images to download; only successful downloads are
            # recorded so the CSV never references a file that does not exist.
            for task, row in download_tasks:
                try:
                    task.result()
                except subprocess.CalledProcessError as e:
                    print(f"Error downloading image with curl: {e}")
                else:
                    csv_data.append(row)

    # Update id_below for next batch
    if lowest_id:
        id_below = lowest_id
    print(f"Finished batch {batch + 1}, next id_below: {id_below}")

    if exhausted:
        # Nothing older is left to fetch; further batches would only be empty.
        break

# Turn the collected rows into a table and write it to disk (no index column).
csv_filename = "observations.csv"
column_names = [
    "Observation ID",
    "Taxon ID",
    "Latitude",
    "Longitude",
    "Image Filename",
    "Threatened",
]
df = pd.DataFrame(data=csv_data, columns=column_names)
df.to_csv(path_or_buf=csv_filename, index=False)

print("Data saved to {}".format(csv_filename))


Starting batch 1...
Fetching page 1 of batch 1
Fetching page 2 of batch 1
Fetching page 3 of batch 1
Fetching page 4 of batch 1
Fetching page 5 of batch 1
Fetching page 6 of batch 1
Fetching page 7 of batch 1
Fetching page 8 of batch 1
Fetching page 9 of batch 1
Fetching page 10 of batch 1
Fetching page 11 of batch 1
Fetching page 12 of batch 1
Fetching page 13 of batch 1
Fetching page 14 of batch 1
Fetching page 15 of batch 1
Fetching page 16 of batch 1
Fetching page 17 of batch 1
Fetching page 18 of batch 1
Fetching page 19 of batch 1
Fetching page 20 of batch 1
Fetching page 21 of batch 1
Fetching page 22 of batch 1
Fetching page 23 of batch 1
Fetching page 24 of batch 1
Fetching page 25 of batch 1
Fetching page 26 of batch 1
Fetching page 27 of batch 1
Fetching page 28 of batch 1
Fetching page 29 of batch 1
Fetching page 30 of batch 1
Fetching page 31 of batch 1
Fetching page 32 of batch 1
Fetching page 33 of batch 1
Fetching page 34 of batch 1
Fetching page 35 of batch 1
Fetching 

In [10]:
!zip -r images_mushrooms.zip images_mushrooms

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: images_mushrooms/batch_3/992465/992465_1.jpg (deflated 2%)
  adding: images_mushrooms/batch_3/128516/ (stored 0%)
  adding: images_mushrooms/batch_3/128516/128516_10.jpeg (deflated 0%)
  adding: images_mushrooms/batch_3/128516/128516_11.jpeg (deflated 0%)
  adding: images_mushrooms/batch_3/209793/ (stored 0%)
  adding: images_mushrooms/batch_3/209793/209793_9.jpeg (deflated 0%)
  adding: images_mushrooms/batch_3/123175/ (stored 0%)
  adding: images_mushrooms/batch_3/123175/123175_186.jpg (deflated 0%)
  adding: images_mushrooms/batch_3/123175/123175_160.jpg (deflated 0%)
  adding: images_mushrooms/batch_3/123175/123175_197.jpeg (deflated 0%)
  adding: images_mushrooms/batch_3/123175/123175_184.jpg (deflated 0%)
  adding: images_mushrooms/batch_3/123175/123175_183.jpg (deflated 0%)
  adding: images_mushrooms/batch_3/123175/123175_168.jpeg (deflated 0%)
  adding: images_mushrooms/batch_3/123175/123175_199.jpg (def

In [11]:
# Attach Google Drive under /content/gdrive, forcing a fresh mount if one already exists.
from google.colab import drive

drive.mount("/content/gdrive", force_remount=True)


Mounted at /content/gdrive


In [12]:
# Copy the image archive into the top level of the mounted Google Drive.
# NOTE(review): assumes the zip was created with /content as the working directory - confirm.
!cp /content/images_mushrooms.zip /content/gdrive/MyDrive/

In [None]:
# Copy the observations CSV into the top level of the mounted Google Drive.
# NOTE(review): assumes observations.csv was written with /content as the working directory - confirm.
!cp /content/observations.csv /content/gdrive/MyDrive/

In [None]:
# Remove the local "images" directory.
# NOTE(review): the download cell writes to "images_mushrooms", not "images" - confirm
# which directory this cleanup is meant to delete before running it.
!rm -rf images