In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Remove existing files and directories from the dataset directory
import os
import shutil

dataset_dir = '/content/drive/MyDrive/dataset_species2' # Define the dataset directory

if os.path.exists(dataset_dir):
    for item in os.listdir(dataset_dir):
        item_path = os.path.join(dataset_dir, item)
        try:
            if os.path.isfile(item_path):
                os.remove(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        except OSError as e:
            print(f"Could not remove {item_path}: {e}") # Print error for specific item
    print(f"Contents of {dataset_dir} cleared.")
else:
    print(f"Dataset directory {dataset_dir} does not exist.")

Contents of /content/drive/MyDrive/dataset_species cleared.


In [9]:
import os
import requests
from PIL import Image
from io import BytesIO
from torchvision import transforms
import random

'''
CLASSES = {
    "beetles": "Coleoptera",
    "butterflies & moths": "Lepidoptera",
    "spiders": "Araneae",
    "bees & wasps": "Hymenoptera",
    "mantises": "Mantodea",
    "stick bugs": "Phasmatodea",
    "snails & slugs": "Stylommatophora",
    "millipedes": "Diplopoda",
    "centipedes": "Chilopoda"
}
'''
'''
CLASSES = {
    "monarch_butterfly": "Danaus plexippus",
    "lady_bug": "Coccinellidae",  # family containing lady beetles
    "firefly": "Lampyridae",      # fireflies are a family
    "bold_jumping_spider": "Phidippus audax",
    "wolf_spider": "Lycosidae",
    "carpenter_bee": "Xylocopa"
}
'''
CLASSES = {
    "yellow_jacket": "name",
    "atlas_moth": "name",
    "stick1": "name",
    "stick2": "name",
    "Mantis1": "name",
    "mantis2": "name",
    "snail": "name",
    "slug": "name",
    "millipede": "name",
    "centipede": "name"
}

BASE_DIR = "/content/drive/MyDrive/dataset_species2"
os.makedirs(BASE_DIR, exist_ok=True)

resize_transform = transforms.Resize((224, 224))

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([
        0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225
    ])
])

TARGET_PER_CLASS = 700

def download_images(class_name, taxon, max_pages, per_page):
  save_train = os.path.join(BASE_DIR, "train", class_name)
  save_val = os.path.join(BASE_DIR, "val", class_name)
  os.makedirs(save_train, exist_ok=True)
  os.makedirs(save_val, exist_ok=True)

  downloaded = 0
  page = 1

  while downloaded <= TARGET_PER_CLASS and page <= max_pages:
    params = { "taxon_name": taxon, "has[photos]": "true",
                   "quality_grade": "needs_id,research", "photo_license": "cc-by",
                   "per_page": per_page, "page": page }

    try:
      response = requests.get(
        "https://api.inaturalist.org/v1/observations",
        params=params,
        timeout=15
      ).json()
    except Exception as e:
          print(f"API request failed on page {page} for {class_name}: {e}")
          break

    if not response.get("results"): break

    for obs in response["results"]:
      if "photos" in obs and len(obs["photos"]) > 0:
        try:
          img_url = obs["photos"][0]["url"].replace("square", "original")
          img_data = requests.get(img_url).content
          img = Image.open(BytesIO(img_data)).convert("RGB")

          # Apply resize transform
          img = resize_transform(img)

          # Random split (80% train, 20% val)
          if random.random() < 0.8:
            save_path = os.path.join(save_train, f"{obs['id']}.jpg")
          else:
            save_path = os.path.join(save_val, f"{obs['id']}.jpg")

          img.save(save_path)
          downloaded += 1
        except Exception as e:
          print("Error:", e)

      if downloaded >= TARGET_PER_CLASS: break

    page += 1

  print(f"{class_name}: downloaded {downloaded}/{TARGET_PER_CLASS}")

for class_name, taxon in CLASSES.items():
  download_images(class_name, taxon, max_pages=500, per_page=100)




monarch_butterfly: downloaded 701/700
lady_bug: downloaded 701/700
firefly: downloaded 701/700
bold_jumping_spider: downloaded 701/700
wolf_spider: downloaded 701/700
carpenter_bee: downloaded 701/700


In [None]:
!ls -R "/content/drive/MyDrive/dataset" | head -n 50

/content/drive/MyDrive/dataset:
train
val

/content/drive/MyDrive/dataset/train:
ant
beetle
butterfly
spider
wasps and bees

/content/drive/MyDrive/dataset/train/ant:

/content/drive/MyDrive/dataset/train/beetle:
307106509.jpg
307133639.jpg
307170142.jpg
307170967.jpg
307188210.jpg
307252734.jpg
307253810.jpg
307261182.jpg
307270761.jpg
307276092.jpg
307278979.jpg
307300942.jpg
307446855.jpg
307448539.jpg
307449603.jpg
307559997.jpg
307561611.jpg
307610828.jpg
307610829.jpg
307625317.jpg
307674375.jpg
307679934.jpg
307681398.jpg
307752124.jpg
307951786.jpg
308023162.jpg
308035740.jpg
308125208.jpg
308211140.jpg
308228479.jpg
308234980.jpg
308235002.jpg
308247423.jpg
308247460.jpg
308277545.jpg
308285973.jpg
